def update_check_plan(app_config_dict, queue):
    """Apply pending application-config changes to the check schedule.

    Takes a batch of application names from the ``_app_change_list`` Redis
    list (via ``pop_multi``; the ``30`` is presumably the batch size), reloads
    each application's ``<appname>_config`` hash and queues a ``CheckTask``
    for every host that still needs one.

    Args:
        app_config_dict: in-memory cache mapping application name to its
            config hash. Updated in place: refreshed for changed
            applications, entry removed for deleted ones.
        queue: task queue; ``queue.put_task(task)`` is called once per
            newly scheduled host.

    Only additions are handled here: hosts that disappeared from an
    application's ``host_list`` are not removed from the queue by this
    function.
    """
    logger = logging.getLogger("task.generator")
    pipe = REDIS_DB.pipeline()
    change_list = pop_multi(pipe, '_app_change_list', 30)
    logger.info(u'发现应用配置发生变动,应用列表:%s', change_list)

    # The same application may be listed several times; handle it once.
    for appname in set(change_list):
        # Fetch the latest configuration of this application.
        new_config = REDIS_DB.hgetall(appname + '_config')

        # 1. The application has been deleted.
        # A missing hash comes back as an empty mapping (Redis HGETALL),
        # not None, so test for "falsy" to cover both; an ``is None`` test
        # never fires and the code below would raise KeyError on
        # 'host_list'.
        if not new_config:
            app_config_dict.pop(appname, None)
            continue

        new_host_list = new_config['host_list'].split(',')
        old_config = app_config_dict.get(appname)
        if old_config is None:
            # 2. Newly added application: schedule every configured host.
            hosts_to_schedule = new_host_list
        else:
            # 3. Known application: schedule only hosts that were not in
            #    the previously cached host list.
            old_host_list = old_config['host_list'].split(',')
            hosts_to_schedule = set(new_host_list) - set(old_host_list)

        for host in hosts_to_schedule:
            task = CheckTask(appname, host, int(new_config['check_interval']))
            queue.put_task(task)

        # Remember the configuration we just acted on.
        app_config_dict[appname] = new_config
def record_app_status(task, status, info, check_latency, duration):
    """Persist the outcome of one check and send notifications when due.

    Reads the application's thresholds from the ``<appname>_config`` hash,
    updates the per-host status hash ``ah_<appname>_<host>`` and calls
    ``notify_app.apply_async`` for RECOVERY / PROBLEM events.

    Args:
        task: the finished check; ``appname``, ``host`` and ``plan_time``
            are read from it.
        status: result of the check. ``'OK'`` means healthy; any other
            value is treated as a problem state.
        info: status text, stored as ``status_info`` and forwarded to the
            notification.
        check_latency: stored as-is in the status hash.
        duration: stored as-is in the status hash.

    Raises:
        TypeError: when ``max_check_attempts``, ``notify_interval`` or
            ``check_interval`` is absent from the config hash (the value
            read back is then None and ``int(None)`` fails). This is left
            as-is on purpose; callers may rely on the failure.
    """
    # NOTE(review): this file contains a second definition of
    # record_app_status further down; being later, that one is the
    # definition left bound to the name once the module is loaded.
    time_format = '%Y-%m-%d %H:%M:%S'
    config_key = task.appname + '_config'
    pipe = REDIS_DB.pipeline()

    # 1. Fetch the relevant part of the application config in one batch.
    for field in ('max_check_attempts', 'notify_interval',
                  'check_interval', 'parent_app_list'):
        pipe.hget(config_key, field)
    res_list = pipe.execute()
    max_check_attempts = int(res_list[0])
    notify_interval = int(res_list[1])   # multiplied by 60 below
    check_interval = int(res_list[2])    # multiplied by 60 below
    parent_app_list = res_list[3]        # optional; may be None

    # 2. Load the previous state of this (application, host) pair.
    key = 'ah_' + task.appname + '_' + task.host
    status_dict = REDIS_DB.hgetall(key) or {}
    now = datetime.now()
    now_str = now.strftime(time_format)

    # Seed every field that is missing, not just a completely absent key:
    # the propagation step at the end of this function creates
    # 'ah_HOST_STATUS_<host>' / 'ah_SYS_PING_<host>' hashes that contain
    # only 'current_status', and reading such a hash used to raise
    # KeyError on 'current_attempt'.
    defaults = (
        ('current_status', 'OK'),
        ('status_info', 'OK - '),
        ('current_attempt', 0),
        ('last_state_change', now_str),
        ('last_notification', now_str),
    )
    seeded = False
    for field, value in defaults:
        if field not in status_dict:
            pipe.hset(key, field, value)
            status_dict[field] = value
            seeded = True
    if seeded:
        pipe.execute()

    current_status = status_dict['current_status']
    current_attempt = int(status_dict['current_attempt'])
    last_notification = datetime.strptime(
        status_dict['last_notification'], time_format)

    # 3. Advance the attempt counter / state.
    if current_status != status or status != 'OK':
        # Either the result differs from the recorded state, or the check
        # is (still) failing: count this attempt.
        current_attempt += 1
        if current_attempt >= max_check_attempts:
            # Enough consecutive attempts: commit the new state and start
            # counting from zero again.
            current_status = status
            current_attempt = 0
            pipe.hset(key, 'current_status', current_status)
            pipe.hset(key, 'status_info', info)
            pipe.hset(key, 'last_state_change', now_str)

            if status == 'OK':
                # Recovery events are not throttled by notify_interval.
                notify_app.apply_async(
                    (task.appname, task.host, 'RECOVERY', status, info))
            elif total_seconds(now - last_notification) > notify_interval * 60:
                # Problem events are sent at most once per notify_interval.
                pipe.hset(key, 'last_notification', now_str)
                notify_app.apply_async(
                    (task.appname, task.host, 'PROBLEM', status, info))
    else:
        # Healthy and already recorded as healthy: forget earlier failures.
        current_attempt = 0
    pipe.hset(key, 'current_attempt', current_attempt)

    # 4. Bookkeeping for this run and the next planned one.
    pipe.hset(key, 'duration', duration)
    pipe.hset(key, 'check_latency', check_latency)
    next_scheduled_check = task.plan_time + timedelta(seconds=check_interval * 60)
    pipe.hset(key, 'next_scheduled_check',
              next_scheduled_check.strftime(time_format))

    # 5. Special case (makes it easy to tell whether the host is alive):
    #    mirror the resulting state into the listed parent applications.
    if parent_app_list is not None:
        for parent in ('HOST_STATUS', 'SYS_PING'):
            if parent in parent_app_list:
                pipe.hset('ah_' + parent + '_' + task.host,
                          'current_status', current_status)

    pipe.execute()
def record_app_status(task, status, info, check_latency, duration):
    """Persist the outcome of one check and send notifications when due.

    Reads the application's thresholds from the ``<appname>_config`` hash,
    updates the per-host status hash ``ah_<appname>_<host>`` and calls
    ``notify_app.apply_async`` for RECOVERY / PROBLEM events.

    Args:
        task: the finished check; ``appname``, ``host`` and ``plan_time``
            are read from it.
        status: result of the check. ``'OK'`` means healthy; any other
            value is treated as a problem state.
        info: status text, stored as ``status_info`` and forwarded to the
            notification.
        check_latency: stored as-is in the status hash.
        duration: stored as-is in the status hash.

    Raises:
        TypeError: when ``max_check_attempts``, ``notify_interval`` or
            ``check_interval`` is absent from the config hash (the value
            read back is then None and ``int(None)`` fails). This is left
            as-is on purpose; callers may rely on the failure.
    """
    # NOTE(review): an earlier definition of record_app_status exists in
    # this file; being the later one, this definition is what the name
    # refers to once the module is loaded.
    time_format = '%Y-%m-%d %H:%M:%S'
    config_key = task.appname + '_config'
    pipe = REDIS_DB.pipeline()

    # 1. Fetch the relevant part of the application config in one batch.
    for field in ('max_check_attempts', 'notify_interval',
                  'check_interval', 'parent_app_list'):
        pipe.hget(config_key, field)
    res_list = pipe.execute()
    max_check_attempts = int(res_list[0])
    notify_interval = int(res_list[1])   # multiplied by 60 below
    check_interval = int(res_list[2])    # multiplied by 60 below
    parent_app_list = res_list[3]        # optional; may be None

    # 2. Load the previous state of this (application, host) pair.
    key = 'ah_' + task.appname + '_' + task.host
    status_dict = REDIS_DB.hgetall(key) or {}
    now = datetime.now()
    now_str = now.strftime(time_format)

    # Seed every field that is missing, not just a completely absent key:
    # the propagation step at the end of this function creates
    # 'ah_HOST_STATUS_<host>' / 'ah_SYS_PING_<host>' hashes that contain
    # only 'current_status', and reading such a hash used to raise
    # KeyError on 'current_attempt'.
    defaults = (
        ('current_status', 'OK'),
        ('status_info', 'OK - '),
        ('current_attempt', 0),
        ('last_state_change', now_str),
        ('last_notification', now_str),
    )
    seeded = False
    for field, value in defaults:
        if field not in status_dict:
            pipe.hset(key, field, value)
            status_dict[field] = value
            seeded = True
    if seeded:
        pipe.execute()

    current_status = status_dict['current_status']
    current_attempt = int(status_dict['current_attempt'])
    last_notification = datetime.strptime(
        status_dict['last_notification'], time_format)

    # 3. Advance the attempt counter / state.
    if current_status != status or status != 'OK':
        # Either the result differs from the recorded state, or the check
        # is (still) failing: count this attempt.
        current_attempt += 1
        if current_attempt >= max_check_attempts:
            # Enough consecutive attempts: commit the new state and start
            # counting from zero again.
            current_status = status
            current_attempt = 0
            pipe.hset(key, 'current_status', current_status)
            pipe.hset(key, 'status_info', info)
            pipe.hset(key, 'last_state_change', now_str)

            if status == 'OK':
                # Recovery events are not throttled by notify_interval.
                notify_app.apply_async(
                    (task.appname, task.host, 'RECOVERY', status, info))
            elif total_seconds(now - last_notification) > notify_interval * 60:
                # Problem events are sent at most once per notify_interval.
                pipe.hset(key, 'last_notification', now_str)
                notify_app.apply_async(
                    (task.appname, task.host, 'PROBLEM', status, info))
    else:
        # Healthy and already recorded as healthy: forget earlier failures.
        current_attempt = 0
    pipe.hset(key, 'current_attempt', current_attempt)

    # 4. Bookkeeping for this run and the next planned one.
    pipe.hset(key, 'duration', duration)
    pipe.hset(key, 'check_latency', check_latency)
    next_scheduled_check = task.plan_time + timedelta(seconds=check_interval * 60)
    pipe.hset(key, 'next_scheduled_check',
              next_scheduled_check.strftime(time_format))

    # 5. Special case (makes it easy to tell whether the host is alive):
    #    mirror the resulting state into the listed parent applications.
    if parent_app_list is not None:
        for parent in ('HOST_STATUS', 'SYS_PING'):
            if parent in parent_app_list:
                pipe.hset('ah_' + parent + '_' + task.host,
                          'current_status', current_status)

    pipe.execute()