Exemplo n.º 1
0
    def write_status(self, total_count, success_count, failed_count,
                     alarm_level, error_record, monitor_type, monitor_key,
                     error_message=''):
        """Assemble a monitor status record and store it via Scheduler_ZkOpers.

        The record holds a summary message built from the three counts and
        ``error_message``, the alarm level, the error record and the current
        local time formatted with ``TIME_FORMAT``.

        Args:
            total_count: value reported as ``total`` in the summary message.
            success_count: value reported as ``success count``.
            failed_count: value reported as ``failed count``.
            alarm_level: stored under the ``alarm`` key.
            error_record: stored under the ``error_record`` key.
            monitor_type: first argument passed to ``write_monitor_status``;
                concatenated into the log line, so it must be a string.
            monitor_key: second argument passed to ``write_monitor_status``;
                concatenated into the log line, so it must be a string.
            error_message: optional text appended to the summary message.
        """
        logging.info('write status!')

        summary = "total=%s, success count=%s, failed count=%s, %s" % (
            total_count, success_count, failed_count, error_message)
        now = datetime.datetime.now()
        status = {
            "message": summary,
            "alarm": alarm_level,
            "error_record": error_record,
            "ctime": now.strftime(TIME_FORMAT),
        }

        log_parts = ("monitor_type:", monitor_type, " monitor_key:",
                     monitor_key, " monitor_value:", str(status))
        logging.info("".join(log_parts))

        Scheduler_ZkOpers().write_monitor_status(monitor_type, monitor_key,
                                                 status)
Exemplo n.º 2
0
    def run(self):
        """Run the monitor check while holding the async-monitor lock.

        Acquires the lock through ``Scheduler_ZkOpers``; if acquisition times
        out or fails, returns without doing any work. Otherwise runs the
        monitor check once and then keeps the lock until roughly
        ``self.timeout - 2`` seconds have elapsed since the work started,
        polling once per second. Any exception raised by the work is pushed
        onto ``self.threading_exception_queue`` instead of propagating, and
        the lock is always released.
        """
        isLock, lock = False, None

        zkOper = Scheduler_ZkOpers()
        try:
            isLock, lock = zkOper.lock_async_monitor_action()
        except kazoo.exceptions.LockTimeout:
            logging.info(
                "a thread is running the monitor async, give up this oper on this machine!"
            )
            return

        if not isLock:
            return

        try:
            begin_time = time.time()
            logging.info("do monitor work every five minutes")
            self.__action_monitor_check()
            # Keep holding the lock until just before the timeout expires.
            # NOTE(review): presumably this stops other machines from running
            # the same check within the same interval - confirm.
            while True:
                end_time = time.time()
                if int(end_time - begin_time) > (self.timeout - 2):
                    logging.info(
                        'release the log, get lock time: %s, release time: %s,\n total time : %s'
                        % (str(begin_time), str(end_time),
                           int(end_time - begin_time)))
                    break
                # Sleep inside the loop so waiting does not busy-spin a CPU
                # core; previously the sleep sat after the loop and only ran
                # once, after the break.
                time.sleep(1)
        except Exception:
            self.threading_exception_queue.put(sys.exc_info())
        finally:
            if isLock:
                zkOper.unLock_aysnc_monitor_action(lock)
Exemplo n.º 3
0
    def check(self):
        """Check every white-listed server for containers whose monitored
        value differs from ``self.value`` and write the resulting status.

        For each server returned by ``retrieve_servers_white_list`` the
        per-container values stored under ``self.monitor_key`` are read.
        Every container whose value is not equal to ``self.value`` is added
        to the error record. The alarm level is derived from the number of
        such containers and the status is written with ``write_status``.
        """
        monitor_type, error_record = 'container', []

        logging.info('do check under_oom')
        zk_opers = Scheduler_ZkOpers()

        server_list = zk_opers.retrieve_servers_white_list()
        for server in server_list:
            under_oom_info = zk_opers.retrieveDataNodeContainersResource(
                server, self.monitor_key)
            # A server may have no resource data (presumably one that has
            # just joined the cluster). Skip only that server rather than
            # aborting the whole check, so the remaining servers are still
            # evaluated and a status is still written.
            if not under_oom_info:
                continue
            # Tolerate a record that lacks the monitored key.
            container_under_oom_dict = under_oom_info.get(
                self.monitor_key) or {}
            error_record.extend(
                container
                for container, under_oom_value in
                container_under_oom_dict.items()
                if under_oom_value != self.value)

        failed_count = len(error_record)
        alarm_level = self.retrieve_alarm_level(0, 0, failed_count)
        self.write_status(0, 0, failed_count, alarm_level, error_record,
                          monitor_type, self.monitor_key)
Exemplo n.º 4
0
    def check(self):
        """Flag data nodes whose free memory is below the configured threshold.

        Returns without writing anything when there are no data nodes.
        Otherwise the ``memory_threshold`` monitor value is divided by
        1024 twice, each host's ``free`` memory is compared against the
        result, and the outcome is written through the parent class's
        ``write_status``.
        """
        monitor_type = 'server'
        monitor_key = 'memory'
        zk_opers = Scheduler_ZkOpers()

        host_ip_list = zk_opers.retrieve_data_node_list()
        if not host_ip_list:
            return

        server_node_value = zk_opers.retrieve_monitor_server_value()
        logging.info('monitor server resource threshold:%s' %
                     str(server_node_value))
        memory_threshold_m = (
            server_node_value.get('memory_threshold') / 1024 / 1024)

        # Hosts whose reported free memory falls under the threshold.
        error_record = [
            '%s' % host_ip
            for host_ip in host_ip_list
            if ServerRes.retireve_server_memory(host_ip)["free"] <
            memory_threshold_m
        ]

        total = len(host_ip_list)
        failed = len(error_record)
        succeeded = total - failed

        alarm_level = self.retrieve_alarm_level(total, succeeded, failed)
        error_message = "remaining memory is less than %s M" % memory_threshold_m
        super(CheckResMemory, self).write_status(
            total, succeeded, failed, alarm_level, error_record,
            monitor_type, monitor_key, error_message)
Exemplo n.º 5
0
    def check(self):
        """Flag data nodes whose disk read or write IOPS exceed the thresholds.

        The thresholds come from the ``disk_threshold_read`` and
        ``disk_threshold_write`` monitor values (default 0). Returns without
        writing anything when there are no data nodes. A host is recorded
        when its ``read_iops`` or ``write_iops`` value multiplied by 1024 is
        greater than the matching threshold; the result is written through
        the parent class's ``write_status``.
        """
        monitor_type = 'server'
        monitor_key = 'disk_io'
        zk_opers = Scheduler_ZkOpers()

        host_ip_list = zk_opers.retrieve_data_node_list()
        server_threshold = zk_opers.retrieve_monitor_server_value()
        read_limit = server_threshold.get("disk_threshold_read", 0)
        write_limit = server_threshold.get("disk_threshold_write", 0)
        if not host_ip_list:
            return

        def over_limit(disk_stats):
            # Short-circuits: the write figure is only read when the read
            # figure is within its limit.
            return (disk_stats["read_iops"] * 1024 > read_limit
                    or disk_stats["write_iops"] * 1024 > write_limit)

        error_record = [
            '%s' % host_ip
            for host_ip in host_ip_list
            if over_limit(ServerRes.retireve_server_diskiops(host_ip))
        ]

        total_count = len(host_ip_list)
        failed_count = len(error_record)
        success_count = total_count - failed_count
        alarm_level = self.retrieve_alarm_level(total_count, success_count,
                                                failed_count)
        error_message = (
            "disk read iops greater than %d or write iops greater than %d"
            % (read_limit, write_limit))

        super(CheckServerDiskIO, self).write_status(
            total_count, success_count, failed_count, alarm_level,
            error_record, monitor_type, monitor_key, error_message)
Exemplo n.º 6
0
    def run(self):
        """Record container resources when at least one cluster exists.

        Logs and does nothing further when the cluster list is empty. Any
        exception is pushed onto ``self.threading_exception_queue`` instead
        of propagating.
        """
        try:
            if Scheduler_ZkOpers().retrieve_cluster_list():
                self.__action_record_containers_resource()
            else:
                logging.info('no cluster is created, no need to do this!')
        except Exception:
            self.threading_exception_queue.put(sys.exc_info())
Exemplo n.º 7
0
 def run(self):
     """Sync server information when at least one cluster exists.

     Logs and does nothing further when the cluster list is empty. Any
     exception raised while reading the cluster list or syncing is pushed
     onto ``self.threading_exception_queue`` instead of propagating.
     """
     logging.info('do sync server')
     zkOper = Scheduler_ZkOpers()
     try:
         if zkOper.retrieve_cluster_list():
             self.server_opers.sync()
         else:
             logging.info(
                 'no cluster is created, no need to update such infomation!'
             )
     except Exception:
         self.threading_exception_queue.put(sys.exc_info())
Exemplo n.º 8
0
    def check(self):
        """Report data nodes whose port count is below 30.

        For every data node the port count is read from ``self.port_opers``;
        hosts with fewer than 30 ports get a line in the error record. The
        status is then written through the parent class's ``write_status``.
        Returns without writing anything when there are no data nodes.
        """
        monitor_type, monitor_key, error_record = 'res', 'port_num', ''

        zk_opers = Scheduler_ZkOpers()
        host_ip_list = zk_opers.retrieve_data_node_list()
        # With no hosts the loop below never binds success_count, which
        # previously raised UnboundLocalError at the alarm-level call. Bail
        # out early, as the sibling server checks do.
        if not host_ip_list:
            return

        for host_ip in host_ip_list:
            success_count = self.port_opers.get_port_num(host_ip)
            if success_count < 30:
                error_record += 'the number of port in port Pool is %s on server :%s, please add ports!\n' % (
                    success_count, host_ip)

        # NOTE(review): success_count here is the port count of the LAST
        # host only, so the alarm level ignores shortages on earlier hosts -
        # confirm whether the minimum across hosts was intended.
        alarm_level = self.retrieve_alarm_level(0, success_count, 0)
        super(CheckServerPortNum, self).write_status(0, 0, 0,
                                                     alarm_level, error_record,
                                                     monitor_type, monitor_key)
Exemplo n.º 9
0
 def run(self):
     """Run the IP legality check while holding the check-ip-usable lock.

     Returns without doing any work when lock acquisition times out or
     reports failure. Exceptions raised by the check are pushed onto
     ``self.threading_exception_queue`` instead of propagating, and the
     lock is released afterwards in every case.
     """
     zkOper = Scheduler_ZkOpers()
     try:
         acquired, lock_handle = zkOper.lock_check_ip_usable_action()
     except kazoo.exceptions.LockTimeout:
         logging.info("a thread is running the monitor async, give up this oper on this machine!")
         return

     if not acquired:
         return

     # The lock is held from here on, so it is always released below.
     try:
         self.check_ip_legality.check()
     except Exception:
         self.threading_exception_queue.put(sys.exc_info())
     finally:
         zkOper.unLock_check_ip_usable_action(lock_handle)
Exemplo n.º 10
0
    def check(self):
        """Probe ports 8888 and 6666 on every data node and write the status.

        Each host is probed with ``nc_ip_port_available``, first on port
        8888 and then on port 6666. Every failed probe appends a message to
        the error record and raises the alarm level from
        ``options.alarm_nothing`` to ``options.alarm_serious``. The status is
        written through the parent class's ``write_status``.
        """
        monitor_type, monitor_key = 'beehive', 'node'
        zk_opers = Scheduler_ZkOpers()
        host_ip_list = zk_opers.retrieve_data_node_list()

        # (port, message template) pairs, probed in this order for each host.
        probes = (
            (8888,
             'server:%s , beehive service is not running, please check!;'),
            (6666,
             'server:%s , container-monitor-agent service is not running, please check!;'),
        )

        alarm_level = options.alarm_nothing
        problems = []
        for host_ip in host_ip_list:
            for port, template in probes:
                if nc_ip_port_available(host_ip, port):
                    continue
                alarm_level = options.alarm_serious
                problems.append(template % host_ip)
        error_record = ''.join(problems)

        super(CheckBeehiveAlived, self).write_status(
            0, 0, 0, alarm_level, error_record, monitor_type, monitor_key)
Exemplo n.º 11
0
    def check(self):
        """Flag data nodes whose used disk space exceeds 85% of the total.

        Returns without writing anything when there are no data nodes.
        Otherwise each host's ``used`` figure is compared against
        ``total * 0.85`` and the outcome is written through the parent
        class's ``write_status``.
        """
        monitor_type = 'server'
        monitor_key = 'diskusage'
        zk_opers = Scheduler_ZkOpers()

        host_ip_list = zk_opers.retrieve_data_node_list()
        if not host_ip_list:
            return

        def too_full(usage):
            return usage["used"] > usage["total"] * 0.85

        error_record = [
            '%s' % host_ip
            for host_ip in host_ip_list
            if too_full(ServerRes.retireve_server_diskusage(host_ip))
        ]

        total = len(host_ip_list)
        failed = len(error_record)
        succeeded = total - failed

        alarm_level = self.retrieve_alarm_level(total, succeeded, failed)
        error_message = "disk capacity utilization rate is greater than 85% !"
        super(CheckServerDiskUsage, self).write_status(
            total, succeeded, failed, alarm_level, error_record,
            monitor_type, monitor_key, error_message)