def rabbitmq_queue_alarm_job():
    """Check RabbitMQ queue depths and send an SMS for queues that look unhealthy.

    Reads ``/tmp/rabbitmq_queue.o`` via ``read_file`` and parses its first
    line, which is expected to hold ``name:depth`` pairs separated by ``|``
    (for example ``encode:3|entry:0|purify:|download:1|log:12|upload:0|store:2``).

    A queue is reported when its depth is missing (still ``None``) or has
    reached its threshold: 128 for ``log``, 32 for every other queue.  All
    reported queues are joined with ``|`` into a single message sent through
    ``sms`` to ``c.mobile_list``.

    Any exception is logged through ``c.logger`` and swallowed, so this job
    never raises to its caller.
    """
    log_threshold = 128
    default_threshold = 32
    queue = {'log': None, 'purify': None, 'upload': None, 'download': None,
             'encode': None, 'entry': None, 'store': None}
    try:
        # Only the first row of the file is meaningful.
        lines = read_file("/tmp/rabbitmq_queue.o")
        if not lines:
            c.logger.error("/tmp/rabbitmq_queue.o is blank")
            # Nothing to parse: stop here rather than failing on lines[0]
            # and logging a second, misleading error.
            return

        for item in lines[0].split('|'):
            parts = item.split(':')
            if len(parts) < 2:
                # A field without ':' (e.g. the empty field produced by a
                # trailing '|') must not abort the whole check.
                if item.strip():
                    c.logger.error("queue_info parse error")
                continue
            depth = parts[1].strip()
            if not depth:
                # No value reported: leave the entry as None so that a
                # known queue is flagged below.
                continue
            try:
                queue[parts[0]] = int(depth)
            except ValueError:
                # A garbage value keeps a known queue at None, which
                # raises the alarm instead of silently skipping it.
                c.logger.error("queue_info parse error")

        error_q = []
        for name, depth in queue.items():
            threshold = log_threshold if name == 'log' else default_threshold
            if depth is None or depth >= threshold:
                error_q.append("%s:%s" % (name, depth))

        if error_q:
            now = get_date_and_time()[1]
            msg = '[%s] %s' % (now, '|'.join(error_q))
            sms(mobile_list=c.mobile_list, message_post=msg)
    except Exception as e:
        c.logger.error(e)
def read_alarm_job():
    """Send an SMS and record an alarm when recent 'read' checks keep failing.

    Looks at ``AppAvailableData`` rows with ``name='read'`` and
    ``result=False`` whose ``time`` falls within the last 36 minutes.  When
    their count reaches ``c.read_alarm_time``, an SMS is sent to
    ``c.mobile_list`` and the outage is persisted as a ``SysAlarm`` of type
    ``'read_bookmark'``: either as a new row, or by extending the
    ``end_time`` of the most recent row when this run continues the same
    outage.

    Exceptions raised while alarming are logged through ``c.logger`` and
    swallowed; errors from the initial failure query propagate to the caller.
    """
    window_start = datetime.datetime.now() - datetime.timedelta(minutes=36)
    # Order explicitly so the first/last rows really are the earliest/latest
    # failures regardless of the model's default ordering.
    failures = list(
        AppAvailableData.objects
        .filter(name='read', result=False, time__gte=window_start)
        .order_by('time')
        .values('time')
    )
    failure_count = len(failures)
    if not failures or failure_count < c.read_alarm_time:
        return

    try:
        start_time = failures[0]['time']
        end_time = failures[-1]['time']
        alarm_type = 'read_bookmark'

        now = get_date_and_time()[1]
        msg = 'read failure count:%s,%s' % (failure_count, now)
        sms(mobile_list=c.mobile_list, message_post=msg)

        # Fetch only the most recent alarm of this type.
        previous = list(
            SysAlarm.objects.filter(type=alarm_type).order_by('-gmt_create')[:1]
        )
        latest = previous[0] if previous else None

        # With no earlier alarm of this type, this is always a new outage.
        is_new_outage = True
        if latest is not None:
            # start_time is where the current alarm begins and
            # latest.end_time is where the previous one ended, so the gap
            # runs from latest.end_time (start) to start_time (end).
            tdiff = timediff(latest.end_time, start_time)
            # The job is scheduled every 10 minutes; a gap of at most 600
            # (presumably seconds -- confirm against timediff) means the
            # same outage is still ongoing.
            is_new_outage = start_time > latest.end_time and tdiff > 600

        if is_new_outage:
            alarm = SysAlarm(type=alarm_type, start_time=start_time,
                             end_time=end_time)
            alarm.save()
        else:
            # Continued outage: only push the previous alarm's end forward.
            latest.end_time = end_time
            latest.save()
    except Exception as e:
        c.logger.error(e)