def check(self, data_node_info_list):
    zkOper = Scheduler_ZkOpers()
    started_nodes_list = zkOper.retrieve_started_nodes()

    error_record = {}
    error_record.setdefault("online_ip", list(started_nodes_list))

    total_count = len(data_node_info_list)
    success_count = len(started_nodes_list)
    failed_count = total_count - success_count

    monitor_type = "node"
    monitor_key = "started"

    alarm_level = self.retrieve_alarm_level(total_count, success_count, failed_count)

    super(Check_Node_Active, self).write_status(total_count, success_count, failed_count,
                                                alarm_level, error_record,
                                                monitor_type, monitor_key)
    super(Check_Node_Active, self).write_status_to_es(total_count, success_count, failed_count,
                                                      alarm_level, error_record,
                                                      monitor_type, monitor_key)
def _get_check_user_list(self):
    conn = self.dba_opers.get_mysql_connection()
    user_tuple = self.dba_opers.get_db_users(conn)
    user_mysql_src_dict, user_zk_src_list = {}, []
    zkOper = Scheduler_ZkOpers()

    # Convert each row tuple fetched from mysql into a list, join the elements
    # at indexes 1 and 0 into the dict key ("user|host"), and collect the
    # elements at indexes -3, -4, -5 and -6 as the dict value. Finally store
    # the pair in user_mysql_src_dict.
    for t in user_tuple:
        row = list(t)
        dict_key_str = row[1] + "|" + row[0]
        inner_value_list = [row[-3], row[-4], row[-5], row[-6]]
        user_mysql_src_dict.setdefault(dict_key_str, inner_value_list)

    db_list = zkOper.retrieve_db_list()
    for db_name in db_list:
        db_user_list = zkOper.retrieve_db_user_list(db_name)
        logging.info("dbName: " + db_name + " db_user_list : " + str(db_user_list))
        for db_user in db_user_list:
            prop = zkOper.get_db_user_prop(db_name, db_user)
            user_zk_src_list.append([db_user, prop])

    return user_mysql_src_dict, user_zk_src_list
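# Illustrative sketch only, not part of the original module: the shape of the
# entries built by _get_check_user_list above. The row layout is an assumption
# modelled on a mysql.user-style result (host first, user second, privilege
# flags at the tail); only the relative positions matter here.
def _demo_mysql_user_entry():
    row = ('%', 'wordpress', 'x', 'x', 'x', 'x', 'Y', 'N', 'Y', 'Y', 'N', 'Y')
    key = row[1] + "|" + row[0]                    # 'wordpress|%'
    value = [row[-3], row[-4], row[-5], row[-6]]   # ['Y', 'Y', 'N', 'Y']
    return {key: value}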
def write_status(self, total_count, success_count, failed_count, alarm_level,
                 error_record_dict, monitor_type, monitor_key,
                 timeout_num_threshold=3):
    dt = datetime.datetime.now()

    _include_timeout_num_from_response = 0
    if error_record_dict:
        _error_record_message = error_record_dict.get('msg')
        _include_timeout_list = re.findall(r'HTTP 599:', str(_error_record_message))
        _include_timeout_num_from_response = len(_include_timeout_list)

    _timeout_num_from_zk = 0
    zkOper = Scheduler_ZkOpers()
    if _include_timeout_num_from_response > 0:
        _monitor_value_dict = zkOper.retrieve_monitor_status_value(monitor_type, monitor_key)
        _timeout_num = _monitor_value_dict.get("timeout_num")
        if _timeout_num is not None:
            _timeout_num_from_zk = _timeout_num
        _timeout_num_from_zk += 1

    if _timeout_num_from_zk <= timeout_num_threshold and _include_timeout_num_from_response > 0:
        # Tolerate transient HTTP 599 timeouts: report the check as healthy
        # until the threshold of consecutive timeouts is exceeded.
        success_count = total_count
        failed_count = 0
        alarm_level = "nothing"
        error_record_dict = {}
    else:
        _timeout_num_from_zk = 0

    result_dict = {
        "message": "total=%s, success count=%s, failed count=%s" %
                   (total_count, success_count, failed_count),
        "alarm": alarm_level,
        "error_record": error_record_dict,
        "ctime": dt.strftime('%Y-%m-%d %H:%M:%S'),
        "timeout_num": _timeout_num_from_zk,
    }

    zkOper.write_monitor_status(monitor_type, monitor_key, result_dict)
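# Illustrative sketch only, not part of the original module: the HTTP 599
# tolerance applied by write_status above, extracted as a pure helper so the
# threshold behaviour is easy to verify. It mirrors the method: the counter
# stored in zookeeper is incremented on every response containing 'HTTP 599:'
# and reset to 0 once a check is written out with its real counts.
def _should_suppress_timeouts(timeout_num_from_zk, has_599_in_response, threshold=3):
    """Return True while consecutive timeouts stay within the threshold,
    i.e. the check is still reported as fully successful."""
    if not has_599_in_response:
        return False
    return timeout_num_from_zk + 1 <= threshold

# e.g. the first three consecutive 599 responses are reported as healthy
# (alarm_level 'nothing'); the fourth is written out with the real counts.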
def check_status(self, data_node_info_list, url_post, monitor_type, monitor_key):
    zk_data_node_count = len(data_node_info_list)
    zkOper = Scheduler_ZkOpers()

    self._check_cluster_status(zk_data_node_count)
    self._check_node_status(data_node_info_list, url_post, monitor_type, monitor_key)
def check(self, data_node_info_list):
    zkOper = Scheduler_ZkOpers()
    if not is_monitoring(get_localhost_ip(), zkOper):
        return

    conn = self.dba_opers.get_mysql_connection()
    monitor_type, monitor_key = "db", "existed_db_anti_item"
    error_record = {}
    anti_item_count, msg, failed_count = 0, "", 0

    _path_value = zkOper.retrieve_monitor_status_value(monitor_type, monitor_key)
    if _path_value != {}:
        failed_count = int(re.findall(r'failed count=(\d+)', _path_value['message'])[0])

    if conn is None:
        failed_count += 1
        if failed_count > 4:
            anti_item_count = 500
        error_record.setdefault("msg", "no way to connect to db")
    else:
        try:
            anti_item_count, msg, anti_item_detail = self._anti_item_check(conn)
        finally:
            conn.close()

        if anti_item_count > 0:
            error_record.setdefault("msg",
                                    "mcluster anti-item existed on %s, please check the db right now." % msg)
            error_record.setdefault("detail", anti_item_detail)

    logging.info(error_record)

    alarm_level = self.retrieve_alarm_level(anti_item_count, 0, 0)
    logging.info("existed anti_item alarm_level :%s" % alarm_level)

    super(Check_DB_Anti_Item, self).write_status(anti_item_count, 0, failed_count,
                                                 alarm_level, error_record,
                                                 monitor_type, monitor_key)
    super(Check_DB_Anti_Item, self).write_status_to_es(anti_item_count, 0, failed_count,
                                                       alarm_level, error_record,
                                                       monitor_type, monitor_key)
def run(self):
    '''
    Instantiating Scheduler_ZkOpers below matters: without it the singleton
    may not yet have its self.zk object.
    '''
    begin_time = time.time()

    lock_name = 'async_monitor/' + self.monitor_type
    zkOper = Scheduler_ZkOpers()
    logging.info('check zk is connected: %s' % str(zkOper.is_connected()))

    isLock, lock = None, None
    try:
        isLock, lock = zkOper.lock_async_monitor_action(lock_name)
        if not isLock:
            return
    except kazoo.exceptions.LockTimeout:
        logging.info("another thread is already running this async monitor, give up this operation on this machine!")
        return

    try:
        data_node_info_list = zkOper.retrieve_data_node_list()
        getattr(self, '_async_' + self.monitor_type)(data_node_info_list)

        end_time = time.time()
        monitor_exc_time = int(end_time - begin_time)

        # Leave room within the timeout: sleep away the remainder of the window.
        real_time_out = self.timeout - self.time_constant
        if monitor_exc_time < real_time_out:
            time.sleep(real_time_out - monitor_exc_time)

        logging.info("%s task has finished" % self.monitor_type)
    except Exception:
        self.threading_exception_queue.put(sys.exc_info())
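# Illustrative sketch with hypothetical numbers, not part of the original
# module: the pacing applied at the end of run() above. A pass that already
# exceeds the window is not padded at all.
def _demo_monitor_pacing(timeout=120, time_constant=10, monitor_exc_time=45):
    """Seconds to sleep after one monitor pass: (120 - 10) - 45 = 65 here."""
    real_time_out = timeout - time_constant
    return max(real_time_out - monitor_exc_time, 0)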
def check(self, data_node_info_list):
    #url_post = "/dbuser/inner/check"
    zkOper = Scheduler_ZkOpers()
    if not is_monitoring(get_localhost_ip(), zkOper):
        return

    monitor_type, monitor_key = "db", "dbuser"
    user_mysql_src_dict, user_zk_src_list = self._get_check_user_list()

    error_record, differ_dict_set = {}, {}
    count_dict_set = dict(total=0, failed=0, success=0)

    if len(user_zk_src_list) == 0 and len(user_mysql_src_dict) == 0:
        error_record.setdefault("msg", "no database users in zk or in mysql")
        differ_dict_set.setdefault("Empty", "")
    else:
        self.compare_center(user_mysql_src_dict, user_zk_src_list,
                            differ_dict_set, count_dict_set)

    count_dict_set["total"] = count_dict_set["success"] + count_dict_set["failed"]

    alarm_level = self.retrieve_alarm_level(count_dict_set["total"],
                                            count_dict_set["success"],
                                            count_dict_set["failed"])

    total_count = count_dict_set["total"]
    failed_count = count_dict_set["failed"]
    success_count = count_dict_set["success"]

    if differ_dict_set:
        error_record.setdefault("dif", differ_dict_set)

    super(Check_Database_User, self).write_status(total_count, success_count, failed_count,
                                                  alarm_level, error_record,
                                                  monitor_type, monitor_key)
    super(Check_Database_User, self).write_status_to_es(total_count, success_count, failed_count,
                                                        alarm_level, error_record,
                                                        monitor_type, monitor_key)
def _check_cluster_status(self, zk_data_node_count):
    zkOper = Scheduler_ZkOpers()
    pre_stat = zkOper.retrieveClusterStatus()

    '''
    The following expression covers two cases:
    1. If the cluster_status node does not yet exist in zookeeper, pre_stat is
       {} and the path will be created by the write below.
    2. Otherwise pre_stat carries a value, and the status is only recomputed
       when the cluster is not still initializing.
    '''
    if ('_status' in pre_stat and pre_stat['_status'] != 'initializing') or pre_stat == {}:
        online_node_list = zkOper.retrieve_started_nodes()

        result = {}
        online_num = len(online_node_list)
        if zk_data_node_count == online_num:
            result['_status'] = 'running'
        elif zk_data_node_count / 2 + 1 <= online_num < zk_data_node_count:
            result['_status'] = 'sub-health'
        else:
            result['_status'] = 'failed'

        zkOper.writeClusterStatus(result)
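# Illustrative sketch only, not part of the original module: the quorum rule
# used by _check_cluster_status above, isolated as a pure function. With a
# 5-node cluster this yields 'running' when all 5 are online, 'sub-health'
# with 3 or 4 online, and 'failed' with 2 or fewer.
def _demo_cluster_status(total_nodes, online_nodes):
    if online_nodes == total_nodes:
        return 'running'
    # integer division, matching the Python 2 `/` used in the method above
    if total_nodes // 2 + 1 <= online_nodes < total_nodes:
        return 'sub-health'
    return 'failed'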