def monitor_main_proc(**conf):
    """Start all monitor processes and keep agent host assignments in sync.

    Spawns ``2 * cpu_count`` AgentMonitor processes, each fed through its own
    add/delete queue, deals the initial server list out round-robin, starts
    the HTTP and TCP monitors, and then loops forever: reload the host data,
    diff the server list against the previous one and push additions and
    removals to the owning AgentMonitor.

    Args:
        **conf: parsed configuration. ``conf['database']`` is forwarded to
            MagicShow and load_host; ``conf['totalserver']`` is the
            expression handed to ``MagicShow.load`` to list every server.

    This function never returns.
    """
    host_add_queue = {}
    host_del_queue = {}
    monitor = {}
    hosts = {}
    rangr = MagicShow(**conf['database'])
    current_server = rangr.load(conf['totalserver']).show()
    # Number of agent processes: a fixed multiple of the CPU count.
    workers = 2 * multiprocessing.cpu_count()

    # AgentMonitor keeps many long-lived connections, so unlike the HTTP and
    # TCP monitors it is sharded: every process owns a slice of the servers.
    for i in range(workers):
        host_add_queue[i] = multiprocessing.Queue()
        host_del_queue[i] = multiprocessing.Queue()
        monitor[i] = AgentMonitor(host_add_queue[i], host_del_queue[i], **conf)
        monitor[i].start()
        hosts[i] = [server for idx, server in enumerate(current_server)
                    if idx % workers == i]
        host_add_queue[i].put(hosts[i])

    # start http monitor
    HTTPMonitor(**conf).start()
    # start tcp monitor
    TCPMonitor(**conf).start()

    while True:
        time_stamp = strftime('%Y-%m-%d %H:%M:%S', localtime())
        print('%s start to load host' % time_stamp)
        try:
            # The upstream asset service (t.a.wandoulabs.com) is sometimes
            # down; treat a failed load as transient and retry later.
            load_host(conf['database'])
        except Exception as e:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must still
            # be able to stop the process, and the reason is worth logging.
            print('%s load host failed: %s' % (time_stamp, e))
            sleep(30)
            continue
        time_stamp = strftime('%Y-%m-%d %H:%M:%S', localtime())
        print('%s finish load host' % time_stamp)

        new_server = rangr.load(conf['totalserver']).show()
        if new_server != current_server:
            removed = set(current_server).difference(new_server)
            added = set(new_server).difference(current_server)
            for server in removed:
                for i in hosts:
                    # Tell the owning process to drop the deleted server.
                    if server in hosts[i]:
                        print('%s delete server in hosts[%s]' % (time_stamp, i))
                        host_del_queue[i].put([server])
                        hosts[i] = [x for x in hosts[i] if x != server]
            for server in added:
                # Give the new server to the least loaded process.
                num = min(hosts, key=lambda idx: len(hosts[idx]))
                print('%s add server in hosts[%s]' % (time_stamp, num))
                host_add_queue[num].put([server])
                hosts[num].append(server)
            current_server = new_server
        sleep(60)
def __init__(self, **conf):
    """Build the backend clients, bookkeeping tables, queues and XMPP bot.

    conf is the parsed YAML configuration; its 'database' section is passed
    as keyword arguments to every client constructed here.
    """
    # Keep the YAML configuration around.
    self.conf = conf
    db_conf = conf['database']

    # Redis-backed handles.
    self.ranger = MagicShow(**db_conf)
    self.alertr = RedisClient(db='alert', **db_conf)
    self.hostr = RedisClient(db='hosts', **db_conf)
    # Blacklist manager.
    self.bm = BlackManager(**db_conf)

    # In-memory state.
    self.alarm_info = {}
    self.alarm_times = {}  # alarm times
    self.conf_info = {}
    self.data_info = {}
    self.time_stamp = {'check_time': {}, 'alarm_time': {}}
    self.alarm_check = {}  # which ones are being checked
    self.alive = True

    # Queue pairs for the check stage and the alarm stage.
    self.check_enqueue = multiprocessing.Queue()
    self.check_dequeue = multiprocessing.Queue()
    self.alarm_enqueue = multiprocessing.Queue()
    self.alarm_dequeue = multiprocessing.Queue()

    self.procs = {}

    # Create the XMPP object, start it and enter the room.
    xmpp_bot = XMPPBOT(address=("125.39.223.167", 5222), **db_conf)
    self.xmpp = xmpp_bot
    xmpp_bot.start()
    xmpp_bot.enter_room()
def __init__(self, add_queue, del_queue, **conf):
    """Prepare the process state; no connection is opened here.

    add_queue delivers servers to start watching, del_queue delivers servers
    to stop watching, and conf is the configuration whose 'database' section
    is forwarded to the MagicShow and RedisClient handles.
    """
    multiprocessing.Process.__init__(self)
    self.conf = conf
    db_conf = self.conf['database']
    self.ranger = MagicShow(**db_conf)
    self.alertr = RedisClient(db='alert', **db_conf)

    # Queues for newly added and for deleted servers.
    self.add_queue = add_queue
    self.del_queue = del_queue

    self.total_server = []   # all servers of this process
    self.alive = True        # loop flag
    self.data_info = {}      # client data
    self.connections = {}    # sockets
    self.filenos = {}        # socket of each fd
    self.timestamps = {}     # assorted timestamps
    self.requests = {}       # newly arrived data
    self.host_configs = {}   # configs of each host

    # epoll object.
    self.epoll = select.epoll()
class AlarmMain(object):
    """Main loop of the alarm service.

    Loads policy/collector configuration from Redis, feeds it to a pool of
    check processes, forwards the resulting alarm state to a pool of alarm
    processes, and delivers the notifications they produce by e-mail, SMS
    and XMPP.
    """

    # Address of the XMPP server used for notifications.
    XMPP_ADDRESS = ("125.39.223.167", 5222)

    def __init__(self, **conf):
        """Build the backend clients, bookkeeping tables, queues and XMPP bot.

        conf is the parsed YAML configuration; its 'database' section is
        passed as keyword arguments to every client constructed here.
        """
        # save yaml conf.
        self.conf = conf
        # redis handler
        self.ranger = MagicShow(**conf['database'])
        self.alertr = RedisClient(db='alert', **conf['database'])
        self.hostr = RedisClient(db='hosts', **conf['database'])
        # blacklist manager.
        self.bm = BlackManager(**conf['database'])
        # latest check result per policy key, filled by save_data().
        self.alarm_info = dict()
        # alarm times: item -> host -> real_key -> level -> sent count
        self.alarm_times = dict()
        self.conf_info = dict()
        self.data_info = dict()
        self.time_stamp = dict()
        self.time_stamp['check_time'] = dict()
        self.time_stamp['alarm_time'] = dict()
        # which ones are being checked
        self.alarm_check = dict()
        self.alive = True
        # queue pairs: work goes out on *_enqueue, results return on *_dequeue
        self.check_enqueue = multiprocessing.Queue()
        self.check_dequeue = multiprocessing.Queue()
        self.alarm_enqueue = multiprocessing.Queue()
        self.alarm_dequeue = multiprocessing.Queue()
        # worker processes started by init_process()
        self.procs = dict()
        # create xmpp object
        self.xmpp = XMPPBOT(address=self.XMPP_ADDRESS, **conf['database'])
        self.xmpp.start()
        self.xmpp.enter_room()

    def get_conf(self):
        """Load every usable policy from Redis.

        For each of the 'client', 'tcp' and 'http' types the policy hash is
        joined with the collector hash: the policy gains the collector's
        'interval' and 'type' (and 'port' for tcp), and its 'target'
        expression is replaced by the list of matching servers.

        Policies that are not valid JSON, lack a required key, or name an
        unknown collector are skipped so one bad entry cannot stop the loop.

        Returns:
            dict mapping '<policy name>|<type>' to the merged policy dict.
        """
        conf_info = {}
        for conf_type in ('client', 'tcp', 'http'):
            policy_configs = self.alertr.hgetall(
                'total:%s:policy:configs' % conf_type)
            collector_configs = self.alertr.hgetall(
                'total:%s:collector:configs' % conf_type)
            for item, raw_policy in policy_configs.items():
                # Cheap pre-filter on the raw JSON text.
                if 'collector' not in raw_policy:
                    continue
                try:
                    policy_conf = json.loads(raw_policy)
                    collector_name = policy_conf['collector']
                    target = policy_conf.pop('target')
                except (AttributeError, KeyError, TypeError, ValueError) as e:
                    print('skip malformed policy %s: %r' % (item, e))
                    continue
                if collector_name not in collector_configs:
                    continue
                try:
                    collector = json.loads(collector_configs[collector_name])
                    policy_conf['interval'] = collector['interval']
                    policy_conf['type'] = collector['type']
                    if conf_type == 'tcp':
                        policy_conf['port'] = collector['port']
                except (KeyError, TypeError, ValueError) as e:
                    print('skip malformed collector %s: %r'
                          % (collector_name, e))
                    continue
                policy_conf['target'] = self.ranger.load(target).show()

                uniq_key = '%s|%s' % (item, conf_type)
                conf_info[uniq_key] = policy_conf
                # Make sure the per-policy state tables exist.
                self.alarm_info.setdefault(uniq_key, {})
                self.alarm_times.setdefault(uniq_key, {})
        return conf_info

    def init_process(self):
        """Start cpu_count * 2 check workers and as many alarm workers."""
        workers = multiprocessing.cpu_count() * 2
        pools = (
            (check_data_proc, self.check_enqueue, self.check_dequeue),
            (alarm_data_proc, self.alarm_enqueue, self.alarm_dequeue),
        )
        for target, inbox, outbox in pools:
            for _ in range(workers):
                proc = Process(target=target, args=(inbox, outbox))
                self.procs[proc] = 1
                proc.start()

    def get_data(self):
        """Attach the collector's result hash to every loaded policy.

        The raw hash is stored under conf['result'] without inspection;
        interpreting it is left to the check processes.
        """
        for conf in self.conf_info.values():
            result_key = "result:%s" % conf['collector']
            if self.alertr.exists(result_key):
                conf['result'] = self.alertr.hgetall(result_key)

    def save_data(self, item, data):
        """Remember the latest check result for policy key item."""
        self.alarm_info[item] = data

    @staticmethod
    def _render_template(template, info):
        """Expand $name$ placeholders in template using values from info.

        A placeholder whose name is in info is replaced by str(info[name]).
        An unknown placeholder is removed together with one adjacent comma
        on either side. Newlines are stripped from the result.
        """
        placeholder = re.compile(r'\$(\w+)\$', re.I)
        while True:
            found = placeholder.search(template)
            if not found:
                break
            key = found.group(1)
            if key in info:
                template = template.replace(
                    '$' + key + '$', str(info[key]), 1)
            else:
                # '$' must be escaped: unescaped it is the end-of-string
                # anchor, the pattern never matches, and this loop would
                # spin forever on the same placeholder.
                template = re.sub(
                    r',?\$' + re.escape(key) + r'\$,?', '', template, 1)
        return template.replace("\n", "")

    def _collect_contacts(self, groups):
        """Return (mobiles, emails, xmpps) for groups, without duplicates."""
        mobiles, emails, xmpps = [], [], []
        for group in groups:
            group_info = self.alertr.get('group:' + group)
            if not group_info:
                # Skip only this group; stopping here would silently drop
                # the contacts of every group listed after it.
                continue
            try:
                group_info = json.loads(group_info)
            except (TypeError, ValueError) as e:
                print("Error Exception:%s, Group:%s" % (e, group))
                continue
            for kind, bucket in (('mobile', mobiles), ('email', emails),
                                 ('xmpp', xmpps)):
                for contact in group_info.get(kind, []):
                    if contact not in bucket:
                        bucket.append(contact)
        return mobiles, emails, xmpps

    def send_alarm(self, item, data):
        """Turn one policy's alarm results into notifications and send them.

        Args:
            item: policy key ('<policy name>|<type>').
            data: dict mapping an event kind ('ALARM' or 'RECOVERY') to a
                list of info dicts. Each info dict is read for 'template',
                'real_key', 'host', 'name', 'group', 'level', 'limit',
                'period', 'method' and optionally 'message'.

        Blacklisted hosts are recorded but not notified. An ALARM is dropped
        once its per-level counter reaches a non-zero 'limit'. A RECOVERY is
        only sent if an alarm counter exists for that level. Messages are
        queued only inside the policy's allowed hours. The list of active
        errors is written to the 'total:alarm:errors' hash.
        """
        email_error = {}
        smess_error = {}
        xmpp_error = {}
        total_error = []
        for k, v in data.items():
            for info in v:
                template = self._render_template(info['template'], info)
                real_key, host = info['real_key'], info['host']
                name, groups = info['name'], info['group']
                host_info = self.hostr.hgetall(host)
                cur_time = strftime('%H:%M:%S', localtime())
                if host_info:
                    # A partially filled host record must not abort the run.
                    idc = host_info.get('idc', '')
                    production = host_info.get('production', '')
                    service = host_info.get('service', '')
                    emessage = '%s|%s|%s|%s|%s' % (
                        k, template, production, idc, service)
                    xmpp_mess = '[%s]%s -- 机房:%s, 产品线:%s, 服务:%s 报警时间:%s' % (
                        k, template, idc, production, service, cur_time)
                else:
                    emessage = '%s|%s|%s|%s|%s' % (k, template, '', '', '')
                    xmpp_mess = '[%s]%s -- 报警时间:%s' % (k, template, cur_time)
                # SMS text: show the host without anything after a '?'.
                message = '[%s]%s' % (
                    k, template.replace(host, host.split('?')[0]))
                if 'message' in info and len(info['message']) >= 100:
                    message = message.encode().replace(
                        info['message'], '(output too long,read email)', 1)
                level = str(info['level'])
                readable_time = strftime('%Y-%m-%d %H:%M:%S')

                if self.bm.check(host, name):
                    total_error.append('%s|%s' % (emessage, 'is_black'))
                    continue

                if k == 'ALARM':
                    total_error.append('%s|%s' % (emessage, 'is_not_black'))
                    counters = self.alarm_times[item].setdefault(
                        host, {}).setdefault(real_key, {})
                    counters.setdefault(level, 0)
                    if info['limit'] != 0 and counters[level] >= info['limit']:
                        print("%s * message:%s, limitation:%s"
                              % (readable_time, message, info['limit']))
                        continue
                elif k == 'RECOVERY':
                    # Forget the stored alarm state, if there is any.
                    try:
                        del self.alarm_info[item][host][real_key]
                    except (KeyError, TypeError):
                        pass
                    # Only announce a recovery for an alarm that was counted.
                    try:
                        if level not in self.alarm_times[item][host][real_key]:
                            continue
                        del self.alarm_times[item][host][real_key]
                    except KeyError:
                        continue

                mobiles, emails, xmpps = self._collect_contacts(groups)
                print("%s * message:%s, method:%s"
                      % (readable_time, xmpp_mess, info['method']))
                if not self.check_alarm_time(info['period']):
                    continue
                if k == "ALARM":
                    self.alarm_times[item][host][real_key][level] += 1
                method = info['method']
                if 'email' in method:
                    for address in emails:
                        email_error.setdefault(
                            address, {}).setdefault(k, {})[emessage] = 1
                if 'sms' in method:
                    for mobile in mobiles:
                        smess_error.setdefault(mobile, {})[message] = 1
                if 'xmpp' in method:
                    for jid in xmpps:
                        xmpp_error.setdefault(jid, {})[xmpp_mess] = 1

        # No active error left for this policy: drop its record.
        if total_error:
            self.alertr.hset('total:alarm:errors', item,
                             json.dumps(total_error))
        else:
            self.alertr.hdel('total:alarm:errors', item)

        # Delivery is best effort; one failing channel must not block others.
        if email_error:
            try:
                send_email(email_error)
            except Exception as e:
                print(str(e))
        if smess_error:
            try:
                send_smess(smess_error)
            except Exception as e:
                print(str(e))
        if xmpp_error:
            try:
                send_xmpp(self.xmpp, xmpp_error)
            except Exception as e:
                print(str(e))

    def clean_memory(self):
        """Drop Redis error records and timestamps of inactive policies."""
        for item in self.alertr.hkeys('total:alarm:errors'):
            # Unknown policy, or a policy with no outstanding alarm data.
            if not self.alarm_info.get(item):
                self.alertr.hdel('total:alarm:errors', item)
        # Keep timestamps only for policies that are still configured.
        for stamp in ('alarm_time', 'check_time'):
            self.time_stamp[stamp] = dict(
                (k, v) for k, v in self.time_stamp[stamp].items()
                if k in self.conf_info)

    def check_alarm_time(self, period):
        """Return True if the current local hour lies in any given window.

        period is an iterable of 'start-stop' hour strings. Both ends are
        inclusive, and a window whose start equals its stop is ignored.
        """
        cur_hour = localtime()[3]
        for times in period:
            start, stop = times.split('-')
            start, stop = int(start), int(stop)
            if start == stop:
                continue
            if start <= cur_hour <= stop:
                return True
        return False

    def close_alarm(self, a, b):
        """Signal handler: disconnect XMPP and exit with status 1.

        a and b are the signal number and stack frame passed by the signal
        module; neither is used.
        """
        self.xmpp.disconnect()
        # Raise SystemExit directly instead of relying on the site-provided
        # exit() helper, which is not guaranteed to exist.
        raise SystemExit(1)

    def _drain(self, queue, handler):
        """Pass every (item, data) pair waiting on queue to handler.

        Stops as soon as the queue stays empty for 0.1 seconds.
        """
        while True:
            try:
                item, data = queue.get(True, 0.1)
            except Queue.Empty:
                break
            handler(item, data)

    def run(self):
        """Run the service loop until self.alive becomes false.

        Each pass refreshes the configuration (at most every 15 seconds),
        queues policies whose check interval elapsed, stores the check
        results, queues policies with alarm data at their alarm rate, sends
        the resulting alarms and cleans up state of removed policies.
        """
        self.init_process()
        while self.alive:
            # Handlers are re-installed on every pass.
            # NOTE(review): presumably so that handlers installed elsewhere
            # cannot replace them for long -- confirm before hoisting.
            for sig in (signal.SIGTERM, signal.SIGINT, signal.SIGHUP):
                signal.signal(sig, self.close_alarm)

            # get all agent policy
            cur_time = int(time())
            if ('conf_data' not in self.time_stamp
                    or cur_time - self.time_stamp['conf_data'] >= 15):
                self.conf_info = self.get_conf()
                self.get_data()
                self.time_stamp['conf_data'] = time()

            # put data into check queue.
            check_time = self.time_stamp['check_time']
            for item, conf in self.conf_info.items():
                if (item not in check_time
                        or cur_time - check_time[item] >= conf['interval']):
                    self.check_enqueue.put(
                        [item, conf, self.alarm_info[item]])
                    check_time[item] = time()

            # read data from check queue.
            self._drain(self.check_dequeue, self.save_data)

            # put data into alarm queue.
            alarm_time = self.time_stamp['alarm_time']
            # Iterate over a copy: entries are deleted inside the loop.
            for item in list(self.alarm_info):
                if item not in self.conf_info:
                    del self.alarm_info[item]
                    continue
                if not self.alarm_info[item]:
                    alarm_time.pop(item, None)
                    continue
                if (item not in alarm_time or cur_time - alarm_time[item]
                        >= self.conf_info[item]['rate']):
                    self.alarm_enqueue.put(
                        [item, self.conf_info[item], self.alarm_info[item]])
                    alarm_time[item] = cur_time

            # send alarm data
            self._drain(self.alarm_dequeue, self.send_alarm)

            # delete recovery errors.
            self.clean_memory()
class AgentMonitor(multiprocessing.Process):
    """Process that keeps long-lived TCP connections to monitored agents.

    Servers to watch arrive on add_queue and servers to drop on del_queue.
    Connected sockets are registered with an epoll object, and the
    connection state of each host is recorded in
    data_info['client_status'].
    """

    # TCP port the agents are contacted on.
    AGENT_PORT = 1021
    # Seconds to wait for a connection attempt.
    CONNECT_TIMEOUT = 0.01

    def __init__(self, add_queue, del_queue, **conf):
        """Prepare the process state; no connection is opened here.

        Args:
            add_queue: queue delivering servers to start watching.
            del_queue: queue delivering servers to stop watching.
            **conf: configuration; conf['database'] is forwarded to the
                MagicShow and RedisClient handles.
        """
        multiprocessing.Process.__init__(self)
        self.conf = conf
        self.ranger = MagicShow(**self.conf['database'])
        self.alertr = RedisClient(db='alert', **self.conf['database'])
        # save all servers of this process.
        self.total_server = list()
        # queue for add new servers.
        self.add_queue = add_queue
        # queue for del deleted servers.
        self.del_queue = del_queue
        # vars for loop.
        self.alive = True
        # vars for saving client data.
        self.data_info = dict()
        # host -> connected socket.
        self.connections = dict()
        # file descriptor -> host.
        self.filenos = dict()
        # save some timestamp.
        self.timestamps = dict()
        # host -> newly arrived data.
        self.requests = dict()
        # epoll object.
        self.epoll = select.epoll()
        # save each host's configs.
        self.host_configs = dict()

    @staticmethod
    def _status_record(text):
        """Return the JSON status entry stored per host."""
        return json.dumps({'data': text, 'time': int(time())})

    def init_connection(self):
        """Connect to every unconnected server and drop removed ones.

        Each host in total_server without a live connection gets one
        connection attempt; success or failure is recorded in
        data_info['client_status']. Connections to hosts no longer in
        total_server are closed and their status entries removed.
        """
        status = self.data_info.setdefault('client_status', {})
        for host in self.total_server:
            # Already connected: nothing to do.
            if host in self.connections:
                continue
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            # SO_LINGER switched on with a zero timeout.
            optval = struct.pack('ii', 1, 0)
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, optval)
            sock.settimeout(self.CONNECT_TIMEOUT)
            try:
                sock.connect((host, self.AGENT_PORT))
            except socket.error:
                status[host] = self._status_record('Connection Failed')
                # Release the descriptor right away; this branch is hit on
                # every retry for an unreachable host.
                sock.close()
                continue
            # Connected: switch to non-blocking mode for epoll.
            sock.settimeout(None)
            sock.setblocking(0)
            status[host] = self._status_record('Connection Success')
            fileno = sock.fileno()
            self.connections[host] = sock
            self.filenos[fileno] = host
            self.requests[host] = ''
            self.epoll.register(fileno, select.EPOLLIN)

        for host in set(self.connections) - set(self.total_server):
            # recycle_connection() records the host as failed, so the status
            # entry has to be removed after it, not before.
            self.recycle_connection(self.connections[host])
            status.pop(host, None)

    def send_data(self, sock, data):
        """Send data prefixed with its length as ten zero-padded digits.

        The connection is recycled if the send fails.
        """
        data = "%010d%s" % (len(data), data)
        try:
            sock.sendall(data)
        except socket.error:
            self.recycle_connection(sock)

    def list_servers(self, regex):
        """Return the servers the ranger handle yields for regex."""
        return self.ranger.load(regex).show()

    def recycle_connection(self, sock):
        """Close sock and forget everything recorded about its host.

        The host is marked 'Connection Failed' in the client status table.
        """
        fileno = sock.fileno()
        host = self.filenos[fileno]
        # record failed clients.
        self.data_info['client_status'][host] = self._status_record(
            'Connection Failed')
        # cancel epoll event.
        self.epoll.unregister(fileno)
        # close socket
        self.connections[host].close()
        # delete records.
        del self.connections[host]
        del self.requests[host]
        del self.filenos[fileno]

    def get_config(self):
        """Build the NEWCONFIG message for every host with collectors.

        Reads all client collector configs from Redis, resolves each one's
        'target' expression to hosts and groups the configs per host.
        Entries that are not valid JSON or have no 'target' are reported
        and skipped.

        Returns:
            dict mapping host to a JSON string of the form
            {'action': 'NEWCONFIG', 'data': [config, ...]}; empty when no
            host has a config.
        """
        # get all configs about client
        configs = self.alertr.hgetall('total:client:collector:configs')
        # Cache of target expression -> hosts, to resolve each one once.
        find_host = {}
        data = {}
        for item, raw_conf in configs.items():
            try:
                conf = json.loads(raw_conf)
                target = conf.pop('target')
            except (AttributeError, KeyError, TypeError, ValueError) as e:
                print(str(e))
                continue
            if target not in find_host:
                find_host[target] = self.list_servers(target)
            for host in find_host[target]:
                data.setdefault(host, []).append(conf)
        return dict(
            (host, json.dumps({'action': 'NEWCONFIG', 'data': confs}))
            for host, confs in data.items())