def checkslave(self):
    """Check MySQL replication health through ``show slave status``.

    Runs the configured mysql client (``self.mysql``) against ``self.sock``
    with ``self.user`` / ``self.passwd`` and inspects the output.

    Returns:
        str: a concatenation of the problems found -- client failure,
        stopped IO/SQL thread, or a delay above ``self.rep_delay_time``.
        Empty string when nothing was detected.
    """
    msg = ''
    cmd = '%s -u%s -p%s -S %s -e "show slave status\\G;"' % (
        self.mysql, self.user, self.passwd, self.sock)
    # universal_newlines gives text-mode pipes, so the parsing below sees
    # str on both Python 2 and Python 3.
    ret = subprocess.Popen(cmd,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           shell=True,
                           universal_newlines=True)
    # communicate() drains both pipes together; reading stderr to EOF first
    # can deadlock once the child fills the stdout pipe.
    out_text, err = ret.communicate()
    out = out_text.splitlines()
    if err or (not out):
        base.MQ.put("%s [ERROR] mysql -e执行异常!stderr:%s" %
                    (base.TIME(), err.strip()))
        msg = '%sMysql -e执行异常!' % msg
    for line in out:
        field = line.strip()
        if field.startswith(('Slave_IO_Running:', 'Slave_SQL_Running:')):
            if field.split(':')[1].strip() == 'No':
                base.MQ.put("%s [ERROR] Mysql主从复制失败!%s" %
                            (base.TIME(), field))
                msg = '%sMysql主从复制失败' % msg
                break
        elif field.startswith('Seconds_Behind_Master:'):
            try:
                count = int(field.split(':')[1].strip())
            except ValueError:
                # MySQL reports NULL when the delay is unknown; there is
                # nothing to compare against the threshold.
                continue
            # NOTE(review): the threshold presumably arrives as a string from
            # the config file; int() makes the comparison numeric either way.
            if count > int(self.rep_delay_time):
                base.MQ.put("%s [ERROR] Mysql主从复制延迟!%s" %
                            (base.TIME(), field))
                msg = '%sMysql主从复制延迟%s' % (msg, count)
    return msg
def check(self):
    """Scan new /var/log/secure records for logins from unexpected IPs.

    Reads the lines appended since the previous call. For every
    ``Accepted`` record logged less than 10 seconds ago, each address found
    by ``self.pattern`` that is not in ``self.access_ip_list`` is logged and
    collected. When any were collected, ``base.ALARM_DICT['sec'][0]`` is set
    to an alarm message.

    When the file has shrunk (log rotation), or was never opened, the file
    is (re)opened at offset 0 and this call returns without reading; the
    next call picks the records up. Any exception is logged, not raised.
    """
    count = []
    try:
        pos = os.path.getsize('/var/log/secure')
        if self.f is None or pos < self.f_pos:
            # self.f is None when the log did not exist at construction time;
            # only an already opened handle needs closing.
            if self.f is not None:
                self.close()
            self.f = open('/var/log/secure')
            self.f_pos = 0
            self.f.seek(self.f_pos)
            return
        records = self.f.readlines()
        if records:
            # Take the offset from the handle itself: a getsize() call here
            # could see an already rotated (smaller) file and hide the
            # rotation from the next call.
            self.f_pos = self.f.tell()
            timestamp_year = base.time.strftime('%Y', base.time.localtime())
            for record in records:
                if 'Accepted' not in record:
                    continue
                base.MQ.put("%s [DEBUG] CheckLogin Accepted记录:%s"%(base.TIME(),record))
                # The leading 16 characters of a record hold "Mon DD HH:MM:SS ";
                # the year is missing there, so the current one is appended.
                timestamp_log = "%s%s"%(record[:16],timestamp_year)
                timestamp_now_sec = base.time.time()
                timestamp_log_sec = base.time.mktime(
                    base.time.strptime(timestamp_log,"%b %d %H:%M:%S %Y"))
                if (timestamp_now_sec - timestamp_log_sec) >= 10:
                    continue
                for ip in self.pattern.findall(record):
                    if ip not in self.access_ip_list:
                        base.MQ.put("%s [WARNING] IP:%s login!"%(base.TIME(),ip))
                        count.append(ip)
        if count:
            base.ALARM_DICT['sec'][0] = "IP:%s异常登陆!"%(','.join(count))
    except Exception as e:
        base.MQ.put("%s [ERROR] 读取日志文件:/var/log/secure 出错!%s"%(base.TIME(),e))
def process_default(self, event):
    """Handle a file-system event: log it and bump the 'sec' alarm counter.

    Events on editor swap/backup files and on names containing 'prelink'
    are ignored. IN_MODIFY and IN_DELETE events are logged as warnings and
    increment ``base.ALARM_DICT['sec'][1]``; every other event is ignored.
    """
    temp_suffixes = ('.swp', '.swx', '.swpx', '~', '.swo')
    if event.name.endswith(temp_suffixes) or 'prelink' in event.name:
        return

    kind = event.maskname
    if kind == 'IN_MODIFY':
        base.MQ.put('%s [WARNING] %s Modified! EVENT NAME:%s'%(base.TIME(),event.pathname,event.name))
    elif kind == 'IN_DELETE':
        base.MQ.put('%s [WARNING] %s Deleted!'%(base.TIME(),event.pathname))
    else:
        return
    base.ALARM_DICT['sec'][1] += 1
def main(conf):
    """Start one daemon thread per config section and supervise them.

    For each name returned by ``conf.get_sections()`` a thread named
    ``monitor_<name>`` running ``thread_<name>(conf)`` is started. Every 60
    seconds the live thread count is logged; when it is lower than the
    number of started threads plus one, ``base.ALARM_DICT['proc']`` is
    prefixed with an agent-failure message (once). Ctrl-C ends the loop.
    """
    workers = []
    for section in conf.get_sections():
        # NOTE(review): eval() resolves the thread function from a name taken
        # from the config file -- a crafted section name would be evaluated as
        # code. Consider an explicit name -> function mapping.
        worker = threading.Thread(name='monitor_%s' % section,
                                  target=eval('thread_%s' % section),
                                  args=(conf, ))
        worker.setDaemon(True)
        workers.append(worker)
    for worker in workers:
        worker.start()

    try:
        while True:
            base.time.sleep(60)
            alive = threading.activeCount()
            # The started workers plus one more thread.
            expected = len(workers) + 1
            base.MQ.put("%s [DEBUG] 总线程数:%s[%s],当前线程数:%s" %
                        (base.TIME(), expected, workers, alive))
            if expected > alive:
                current = base.ALARM_DICT['proc']
                if "monitor_agent进程异常" not in current:
                    base.ALARM_DICT['proc'] = 'monitor_agent进程异常!%s' % current
    except KeyboardInterrupt:
        print("exit...")
def Alert(url, msg):
    # Submit `msg` to the alarm endpoint at `url` via urllib.urlopen
    # (passing `data` makes it a POST request).
    try:
        req = urllib.urlopen(url, data=msg)
    except IOError, e:
        # The endpoint could not be reached: record the failure together with
        # the URL and payload, and report it to the caller as False.
        base.MQ.put("%s [CRIT] 提交报警任务出错,请确认url地址可访问!:%s url=%s msg: %s" %
                    (base.TIME(), e, url, msg))
        return False
def thread_proc(conf):
    # Process-monitoring thread: starts by loading every item of the 'proc'
    # config section.
    print "thread_proc running..."
    try:
        proc_list = conf.get('proc')
    except Exception, e:
        # Log the configuration problem, pause one second, then leave through
        # sys.exit (raises SystemExit).
        base.MQ.put("%s [ERROR] 配置文件错误,monitor_proc线程退出!%s" %
                    (base.TIME(), e))
        base.time.sleep(1)
        sys.exit(1)
def get(self, *args):
    """Read from the wrapped config parser.

    ``get(section)`` returns that section's items; ``get(section, option)``
    returns the option's value. Any other number of arguments returns None.
    A missing section or option is logged and also yields None.
    """
    argc = len(args)
    try:
        if argc == 2:
            section, option = args
            return self.config.get(section, option)
        if argc == 1:
            return self.config.items(args[0])
    except (ConfigParser.NoSectionError, ConfigParser.NoOptionError) as err:
        base.MQ.put("%s [ERROR] 配置文件读取错误:%s" % (base.TIME(), err))
def __init__(self, access_ip_list):
    """Prepare the login checker.

    Args:
        access_ip_list: addresses that are allowed to log in.

    Opens /var/log/secure positioned at its current end, so only records
    written afterwards are read. When the file is missing, ``self.f`` is
    None, ``self.f_pos`` is 0 and an error is logged.
    """
    self.access_ip_list = access_ip_list
    # Dotted quad with 1-3 digits per octet. Allowing zero digits would let
    # bare dots such as "..." count as an address.
    self.pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    if os.path.exists('/var/log/secure'):
        self.f = open('/var/log/secure')
        # Seek first and ask the handle for the offset, so the remembered
        # position and the real read position cannot disagree.
        self.f.seek(0, os.SEEK_END)
        self.f_pos = self.f.tell()
    else:
        self.f = None
        self.f_pos = 0
        base.MQ.put("%s [ERROR] 系统日志文件:/var/log/secure 丢失!"%base.TIME())
def thread_system(conf):
    # System-monitoring thread: starts by loading the alarm thresholds from
    # the 'system' config section.
    print "thread_system running..."
    try:
        threshold_cpu = conf.get('system', 'cpu')
        threshold_cpu_core = conf.get('system', 'cpu_core')
        threshold_mem = conf.get('system', 'mem')
        threshold_disk = conf.get('system', 'disk')
        threshold_inode = conf.get('system', 'inode')
        threshold_load = conf.get('system', 'load')
        threshold_net = conf.get('system', 'net')
    except Exception, e:
        # Log the configuration problem, pause one second, then leave through
        # sys.exit (raises SystemExit).
        base.MQ.put("%s [ERROR] 配置文件错误,monitor_system线程退出!%s" %
                    (base.TIME(), e))
        base.time.sleep(1)
        sys.exit(1)
def thread_mysql(conf):
    # MySQL-monitoring thread: starts by loading its settings from the
    # 'mysql' config section.
    print "thread_mysql running..."
    try:
        mysqlbin = conf.get('mysql', 'mysql')
        sock = conf.get('mysql', 'sock')
        errlog = conf.get('mysql', 'errlog')
        pattern = conf.get('mysql', 'pattern')
        role = conf.get('mysql', 'role')
        user = conf.get('mysql', 'user')
        passwd = conf.get('mysql', 'passwd')
        rep_delay_time = conf.get('mysql', 'rep_delay_time')
    except Exception, err:
        print 'monitor thread-mysql error! %s' % err
        base.MQ.put("%s [ERROR] monitor_mysql线程配置错误!%s" % (base.TIME(), err))
        print 'thread-mysql exit...'
        # Sends signal 9 to this very process, i.e. the whole agent is
        # killed, not just this thread.
        # NOTE(review): the sys.exit below is then never reached -- confirm
        # that killing the whole process is intended.
        os.kill(os.getpid(), 9)
        sys.exit(1)
def thread_sec(conf):
    # Security-monitoring thread: starts by building the event mask and
    # loading the watched paths, files and allowed IPs from the 'sec' section.
    print "thread_sec running..."
    # Initialise variables.
    # Define the events to watch: modification and deletion.
    #pdb.set_trace()
    mask = sec.IN_MODIFY | sec.IN_DELETE
    try:
        # Each of these options is a ':'-separated list in the config file.
        path = conf.get('sec', 'md_sys_path').split(':')
        path.extend(conf.get('sec', 'md_app_path').split(':'))
        m_files = conf.get('sec', 'md_file').split(':')
        access_ip_list = conf.get('sec', 'access_ip_list').split(":")
    except Exception, e:
        # Log the configuration problem, pause one second, then leave through
        # sys.exit (raises SystemExit).
        base.MQ.put("%s [ERROR] 配置文件错误,monitor_sec线程退出!%s" %
                    (base.TIME(), e))
        base.time.sleep(1)
        sys.exit(1)
def _checkproc_exe(pid):
    """Return the target of /proc/<pid>/exe, or None when it cannot be read."""
    if not pid:
        return None
    try:
        return os.readlink('/proc/%s/exe' % pid)
    except OSError:
        # The process vanished or its link is unreadable; either way it
        # cannot be matched against the configured executable.
        return None


def _checkproc_report_exit(name, proc_reload, count):
    """Log that `name` is gone, record it in `count`, run its restart command."""
    base.MQ.put('%s [WARNING] %s 进程退出!' % (base.TIME(), name))
    count.append(name)
    if proc_reload:
        subprocess.Popen(proc_reload, shell=True)
        base.MQ.put('%s [INFO] %s:执行进程自动重启!' % (base.TIME(), name))


def CheckProc(proc_list):
    """Verify that every configured process is running.

    Args:
        proc_list: (name, "exe:port:reload_cmd") pairs. ``exe`` is the
            expected executable path, ``port`` an optional listening port
            and ``reload_cmd`` an optional shell command run when the
            process is missing.

    With a port, the first PID that lsof reports as listening on it must run
    ``exe``. Without one, some PID whose ps line matches the name must run
    ``exe``. Missing processes are logged and, when a reload command is
    configured, restarted. ``base.ALARM_DICT['proc']`` is set to the list of
    missing processes, or to '' when there are none. Malformed entries are
    logged and skipped; any other exception is logged, not raised.
    """
    count = []
    try:
        for i in proc_list:
            proc_attr = i[1].split(':')
            try:
                proc_exe = proc_attr[0].strip()
                proc_port = proc_attr[1].strip()
                # Command used to restart the process after it has exited.
                proc_reload = proc_attr[2].strip()
            except IndexError as e:
                base.MQ.put('%s [ERROR] %s 进程监控配置格式出错:%s' %
                            (base.TIME(), i[0], e))
                continue
            if proc_port:
                pid = os.popen(
                    "lsof -i4:%s|grep LISTEN|sed -n '1p'|awk '{print $2}'" %
                    proc_port).read().strip()
                if proc_exe != _checkproc_exe(pid):
                    _checkproc_report_exit(i[0], proc_reload, count)
            else:
                pids = os.popen("ps -eo pid,cmd|grep %s|awk '{print $1}'" %
                                i[0]).read().strip().split(os.linesep)
                base.MQ.put("%s [DEBUG] 进程PIDS:%s,%s,%s" %
                            (base.TIME(), pids, i[0], proc_exe))
                # The grep in the pipeline matches itself too; comparing the
                # executables filters that out.
                exes = [_checkproc_exe(pid) for pid in pids]
                if proc_exe not in exes:
                    _checkproc_report_exit(i[0], proc_reload, count)
        if count:
            base.ALARM_DICT['proc'] = '%s 进程退出!' % (','.join(count))
        else:
            base.ALARM_DICT['proc'] = ''
    except Exception as e:
        base.MQ.put('%s [ERROR] 进程监控程序出错:%s' % (base.TIME(), e))
def thread_base(conf):
    '''
    Writes the log and submits messages to the alarm API.
    (Translated from the original author's description.)

    Log line format:

        xxxx-xx-xx xx:xx:xx [TAG] message
        ------------------- -----
                 |            |
                 |            +---- [DEBUG]    debugging information
                 |            |____ [INFO]     system running-state information
                 |            |____ [WARNING]  an abnormal condition was detected
                 |            |____ [ERROR]    ordinary program error
                 |            |____ [CRITICAL] serious error, e.g. an alarm
                 |                             could not be submitted
                 |___ date and time
    '''
    print "thread_base running..."
    try:
        log = conf.get('base', 'logfile')
        apiurl = conf.get('base', 'apiurl')
        log_level = conf.get('base', 'log_level')
    except Exception, e:
        # Printed rather than queued on base.MQ, unlike the other thread_*
        # functions.
        print "%s [ERROR] 配置文件错误,monitor_base线程退出!%s" % (base.TIME(), e)
        sys.exit(1)
#print "进程退出:%s"%i[0] base.MQ.put('%s [WARNING] %s 进程退出!' % (base.TIME(), i[0])) count.append(i[0]) if proc_reload: subprocess.Popen(proc_reload, shell=True) base.MQ.put('%s [INFO] %s:执行进程自动重启!' % (base.TIME(), i[0])) else: pids = os.popen("ps -eo pid,cmd|grep %s|awk '{print $1}'" % i[0]).read().strip().split(os.linesep) base.MQ.put("%s [DEBUG] 进程PIDS:%s,%s,%s" % (base.TIME(), pids, i[0], proc_exe)) exes = [ os.readlink('/proc/%s/exe' % pid) for pid in pids if os.path.exists('/proc/%s/exe' % pid) ] if proc_exe not in exes: base.MQ.put('%s [WARNING] %s 进程退出!' % (base.TIME(), i[0])) count.append(i[0]) if proc_reload: subprocess.Popen(proc_reload, shell=True) base.MQ.put('%s [INFO] %s:执行进程自动重启!' % (base.TIME(), i[0])) continue if count: base.ALARM_DICT['proc'] = '%s 进程退出!' % (','.join(count)) else: base.ALARM_DICT['proc'] = '' except Exception, e: base.MQ.put('%s [ERROR] 进程监控程序出错:%s' % (base.TIME(), e))
def hash_file(self, f):
    """Return the hex MD5 digest of the file at path `f`.

    Returns None, after logging an error, when the path does not exist.
    """
    if not os.path.exists(f):
        base.MQ.put("%s [ERROR] 监控文件不存在:%s"%(base.TIME(),f))
        return None
    digest = hashlib.md5()
    # Binary mode hashes the exact bytes on disk. The with-block closes the
    # handle, and reading in chunks keeps memory flat for large files.
    with open(f, 'rb') as handle:
        for chunk in iter(lambda: handle.read(65536), b''):
            digest.update(chunk)
    return digest.hexdigest()
return socket.inet_ntoa( fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s', interface[:15]))[20:24]) def Alert(url, msg): try: req = urllib.urlopen(url, data=msg) except IOError, e: base.MQ.put("%s [CRIT] 提交报警任务出错,请确认url地址可访问!:%s url=%s msg: %s" % (base.TIME(), e, url, msg)) return False try: ret = json.loads(req.read()) except ValueError, e: base.MQ.put('%s [CRIT] 报警API接口返回值异常,可能非json格式:%s' % (base.TIME(), e)) return False base.MQ.put('%s [DEBUG] 提交报警API URL地址json结果:%s' % (base.TIME(), ret)) httpcode = str(req.getcode()) if httpcode.startswith('2'): if ret['content'] in ("server status ok", "success", "auto repaired"): return True elif ret['content']['message'] != 'alarm is alarming': base.MQ.put("%s [CRIT] 提交报警任务失败! url=%s msg: %s ret:%s" % (base.TIME(), url, msg, ret)) return False else: base.MQ.put("%s [CRIT] 提交报警任务失败!POST返回状态码:%s url=%s msg: %s" % (base.TIME(), httpcode, url, msg)) return False return True