async def main():
    """
    Create the aiohttp application, register every route and start
    listening on the host and port read from the server configuration.

    :return: None
    """
    app = web.Application()

    # One (method, path, handler) entry per endpoint.
    routes = (
        ('GET', '/', index),
        ('GET', '/checkStatus', check_status),
        ('POST', '/runMonitor', run_monitor),
        ('POST', '/getMonitor', get_monitor),
        ('GET', '/getGC/{port}', get_gc),
    )
    for method, path, handler in routes:
        app.router.add_route(method, path, handler)

    runner = web.AppRunner(app)
    await runner.setup()

    listen_host = cfg.getServer('host')
    listen_port = cfg.getServer('port')
    site = web.TCPSite(runner, listen_host, listen_port)
    await site.start()
async def main():
    """
    Create the aiohttp application, configure templates and static files,
    register every route and start listening.

    The site is bound to the address returned by ``get_ip()`` and to the
    port read from the server configuration.

    :return: None
    """
    app = web.Application()

    # Add the templates directory to the jinja2 search path.
    template_loader = jinja2.FileSystemLoader('templates')
    aiohttp_jinja2.setup(app, loader=template_loader)

    # Serve the 'static' directory that sits next to this file.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    app.router.add_static('/static/', path=os.path.join(base_dir, 'static'))

    # One (method, path, handler) entry per endpoint.
    routes = (
        ('GET', '/', index),
        ('GET', '/startMonitor', start_monitor),
        ('GET', '/getMonitor/{host}', get_monitor),
        ('GET', '/Visualize', visualize),
        ('GET', '/getPortAndDisk/{host}', get_port_disk),
        ('POST', '/Register', registers),
        ('POST', '/runMonitor', run_monitor),
        ('POST', '/plotMonitor', plot_monitor),
        ('POST', '/Notification', notice),
    )
    for method, path, handler in routes:
        app.router.add_route(method, path, handler)

    runner = web.AppRunner(app)
    await runner.setup()

    listen_host = get_ip()
    listen_port = cfg.getServer('port')
    site = web.TCPSite(runner, listen_host, listen_port)
    await site.start()
async def stop_monitor(request):
    """
    Stop the client: resolve the configured server port to a pid with
    ``port_to_pid`` and, when one is found, issue ``kill -9`` for it.

    :param request: incoming request (not read)
    :return: plain-text response saying whether a kill was issued
    """
    pid = port_to_pid(cfg.getServer('port'))
    if not pid:
        return web.Response(body='Client is not running!')

    # The kill is launched through the shell; its output is never read.
    _ = os.popen(f'kill -9 {pid}')
    logger.info('Stop the client successfully!')
    return web.Response(body='Stop the client successfully!')
async def get_monitor(request):
    """
    Return the list of monitored ports.

    :param request: request whose JSON body carries the key 'host'
    :return: JSON response with 'code', 'msg' and 'data'; code 0 on
             success, 1 when nothing is monitored yet, 2 when 'host'
             does not match the configured server host
    """
    payload = await request.json()
    host = payload.get('host')

    if host != cfg.getServer('host'):
        logger.error('请求参数异常')
        return web.json_response({
            'code': 2,
            'msg': '请求参数异常',
            'data': {'host': host, 'port': None, 'pid': None}
        })

    monitored = permon.start
    port_count = len(monitored['port'])
    if port_count == 0:  # no port has been monitored yet
        logger.error('暂未监控任何端口')
        return web.json_response({
            'code': 1,
            'msg': '暂未监控任何端口',
            'data': {'host': host, 'port': None, 'pid': None}
        })

    # One host entry per monitored port, followed by the monitor record itself.
    result = {'host': [host] * port_count}
    result.update(monitored)
    return web.json_response({'code': 0, 'msg': '操作成功', 'data': result})
def notification(msg, timeout=30):
    """
    Ask the master service to send an e-mail notification.

    Posts ``{'host': <configured host>, 'msg': msg}`` as JSON to the
    master's ``/Notification`` endpoint and logs the outcome. The function
    never raises for network problems or a malformed reply: it is started
    in background threads, where an uncaught exception would only kill the
    thread without reaching the application log.

    :param msg: body text of the e-mail
    :param timeout: seconds to wait for the master before giving up;
                    without it a stalled master would block the calling
                    thread indefinitely
    :return: None
    """
    url = f'http://{cfg.getMaster("host")}:{cfg.getMaster("port")}/Notification'
    header = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/json; charset=UTF-8"
    }
    post_data = {'host': cfg.getServer('host'), 'msg': msg}
    logger.debug(f'发送邮件信息的内容:{msg}')

    try:
        res = requests.post(url=url, json=post_data, headers=header,
                            timeout=timeout)
    except requests.RequestException as err:
        # Connection refused, DNS failure, timeout, ...
        logger.error('邮件发送失败')
        logger.error(err)
        return

    if res.status_code != 200:
        logger.error('邮件发送失败')
        return

    try:
        response = json.loads(res.content.decode())
    except ValueError as err:
        # Body was not valid UTF-8 / JSON.
        logger.error('邮件发送失败')
        logger.error(err)
        return

    if response['code'] == 0:
        logger.info('邮件发送成功')
    else:
        logger.error(response['msg'])
def _monitor_response(code, msg, host, port, pid):
    """Build the JSON envelope shared by every ``run_monitor`` reply."""
    return web.json_response({
        'code': code,
        'msg': msg,
        'data': {'host': host, 'port': port, 'pid': pid}
    })


async def run_monitor(request):
    """
    Start or stop monitoring a port.

    The JSON body is read for the keys 'host', 'port', 'net' and 'isRun'.
    'isRun' equal to '1' starts monitoring and '0' stops it.

    Reply codes:
      * 0 - monitoring started / stopped
      * 1 - no pid was found for the port, or a stop was requested for a
            port that is not monitored
      * 2 - invalid parameters (host mismatch, missing port, unknown
            'isRun' value) or an unexpected exception

    :param request: incoming request with a JSON body
    :return: JSON response with 'code', 'msg' and 'data'
    """
    try:
        data = await request.json()
        host = data.get('host')
        port = data.get('port')
        network = data.get('net')
        is_run = data.get('isRun')

        if host != cfg.getServer('host') or not port:
            logger.error('请求参数异常')
            return _monitor_response(2, '请求参数异常', host, port, None)

        pid = port_to_pid(port)  # look up the pid by port number
        if pid is None:
            msg = f"端口 {port} 未启动!"
            logger.warning(msg)
            return _monitor_response(1, msg, host, port, None)

        if is_run == '0':  # stop monitoring
            if port in permon.stop['port']:  # has the port been monitored?
                permon.stop = {
                    'port': port,
                    'pid': pid,
                    'net': network,
                    'is_run': 0
                }
                logger.info('停止监控成功!')
                return _monitor_response(0, '停止监控成功!', host, port, pid)

            msg = f"端口 {port} 未监控,请先监控!"
            logger.warning(msg)
            return _monitor_response(1, msg, host, port, pid)

        if is_run == '1':  # start monitoring
            permon.start = {'port': port, 'pid': pid, 'is_run': 1}
            logger.info('开始监控成功!')
            return _monitor_response(0, '开始监控成功!', host, port, pid)

        # Any other 'isRun' value: always answer explicitly so the handler
        # can never return None to aiohttp.
        logger.error('请求参数异常')
        return _monitor_response(2, '请求参数异常', host, port, None)

    except Exception as err:
        logger.error(err)
        logger.error(traceback.format_exc())
        # The exception object itself is not JSON-serialisable; send its text.
        return _monitor_response(2, str(err), cfg.getServer('host'), None,
                                 None)
def __init__(self):
    """
    Read the configuration, initialise the monitoring state, call the
    ``get_*`` helpers that fill in the system attributes, create the task
    queue, thread pool and InfluxDB client, then call ``self.monitor()``.
    """
    self.check_sysstat_version()
    self.IP = get_ip()

    # A negative pool size in the configuration is treated as 0.
    configured_pool = cfg.getServer('threadPool')
    self.thread_pool = max(configured_pool, 0)

    # Parallel lists: port, pid, status, start time.
    self._msg = {'port': [], 'pid': [], 'isRun': [], 'startTime': []}

    # --- monitor settings -------------------------------------------------
    self.is_system = cfg.getMonitor('isMonSystem')  # monitor the server system?
    self.error_times = cfg.getMonitor('errorTimes')
    self.sleepTime = cfg.getMonitor('sleepTime')
    self.maxCPU = cfg.getMonitor('maxCPU')
    self.CPUDuration = cfg.getMonitor('CPUDuration')
    self.isCPUAlert = cfg.getMonitor('isCPUAlert')
    self.minMem = cfg.getMonitor('minMem')
    self.isMemAlert = cfg.getMonitor('isMemAlert')
    self.isPidAlert = cfg.getMonitor('isPidAlert')
    self.errorTimesOfPid = cfg.getMonitor('errorTimesOfPid')
    self.frequencyFGC = cfg.getMonitor('frequencyFGC')
    self.isJvmAlert = cfg.getMonitor('isJvmAlert')
    self.echo = cfg.getMonitor('echo')
    self.isDiskAlert = cfg.getMonitor('isDiskAlert')
    self.maxDiskUsage = cfg.getMonitor('maxDiskUsage') / 100
    self.isTCP = cfg.getMonitor('isTCP')
    self.timeSetting = cfg.getMonitor('timeSetting')

    # --- sampling intervals -----------------------------------------------
    # Configured values below 1 count as 1. The program's own running time
    # (1.1 / 1.03) is then subtracted and the result is floored at 0.
    system_interval = cfg.getMonitor('system_interval')
    port_interval = cfg.getMonitor('port_interval')
    self.system_interval = max(max(system_interval, 1) - 1.1, 0)
    self.port_interval = max(max(port_interval, 1) - 1.03, 0)

    # --- system facts, filled in by the get_* calls below ------------------
    self.system_version = ''    # system version
    self.cpu_info = ''
    self.cpu_usage = 0.0        # CPU usage
    self.cpu_cores = 0          # number of CPU cores
    self.mem_usage = 0.0        # memory usage
    self.total_mem = 0          # total memory, unit: G
    self.total_mem_100 = 0      # total memory, unit: 100*G
    self.nic = ''               # network card
    self.all_disk = []          # disk numbers
    self.total_disk = 1         # total disk size, unit: M
    self.total_disk_h = 0       # total disk size, unit: T or G
    self.network_speed = cfg.getServer('nicSpeed')  # bandwidth

    self.get_system_version()
    self.get_cpu_cores()
    self.get_total_mem()
    self.get_system_nic()
    self.get_disks()
    self.get_system_net_speed()
    self.get_total_disk_size()

    # --- runtime machinery -------------------------------------------------
    self.monitor_task = queue.Queue()  # FIFO queue
    # +2 workers: one for system monitoring, one for the registration service.
    self.executor = ThreadPoolExecutor(self.thread_pool + 2)
    # InfluxDB connection.
    self.client = influxdb.InfluxDBClient(
        cfg.getInflux('host'),
        cfg.getInflux('port'),
        cfg.getInflux('username'),
        cfg.getInflux('password'),
        cfg.getInflux('database'))

    self.FGC = {}           # full GC counts
    self.FGC_time = {}      # full GC times
    self.last_cpu_io = []   # recent CPU usage values
    self.is_java = {}       # whether a port is a java service, 0 or 1

    self.monitor()
def register_agent(self, disk_flag=True, request_timeout=10):
    """
    Timed task that never returns. It does three things in one loop:

    * once more than 8 seconds have passed since the last successful
      registration, POST this agent's description to the master's
      ``/Register`` endpoint;
    * after a registration, if the current ``HH:MM`` equals
      ``self.timeSetting``, call ``self.clear_port()`` to clean up the
      ports that stopped monitoring;
    * once more than 300 seconds have passed since the last disk check,
      refresh the disk usage and, when it exceeds ``self.maxDiskUsage``,
      log a warning and optionally send a notification.

    :param disk_flag: whether an e-mail may be sent when disk space usage
                      is too high; it is reset to True once usage is back
                      under the limit
    :param request_timeout: seconds to wait for the master on each
                            registration; without a timeout a stalled
                            master would block this loop (and the disk
                            checks) forever
    :return: never returns
    """
    url = f'http://{cfg.getMaster("host")}:{cfg.getMaster("port")}/Register'
    header = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/json; charset=UTF-8"
    }
    post_data = {
        'host': self.IP,
        'port': cfg.getServer('port'),
        'system': self.system_version,
        'cpu': self.cpu_cores,
        'cpu_usage': self.cpu_usage,
        'nic': self.nic,
        'network_speed': self.network_speed,
        'mem': round(self.total_mem, 2),
        'mem_usage': self.mem_usage,
        'disk_size': self.total_disk_h,
        'disk_usage': self.get_used_disk_rate(),
        'disks': ','.join(self.all_disk)
    }
    start_time = time.time()
    disk_start_time = time.time()

    while True:
        try:
            if time.time() - start_time > 8:  # register
                post_data['cpu_usage'] = self.cpu_usage
                post_data['mem_usage'] = self.mem_usage
                res = requests.post(url=url, json=post_data, headers=header,
                                    timeout=request_timeout)
                logger.info(
                    f"The result of registration is {res.content.decode('unicode_escape')}"
                )
                start_time = time.time()

                if time.strftime('%H:%M') == self.timeSetting:  # clean up
                    logger.debug(
                        'Cleaning up the ports that stopped monitoring.')
                    self.clear_port()

            if time.time() - disk_start_time > 300:
                disk_usage = self.get_used_disk_rate()
                if disk_usage:
                    post_data['disk_usage'] = disk_usage  # disk space usage, without %
                    disk_start_time = time.time()

                    if self.maxDiskUsage < disk_usage:
                        # NOTE(review): maxDiskUsage is the configured value
                        # divided by 100, yet the message divides disk_usage
                        # by 100 again before printing '%' - confirm the unit
                        # returned by get_used_disk_rate().
                        msg = f"The disk space usage is {disk_usage/100:.2f}%, it is too high. Server IP is {self.IP}"
                        logger.warning(msg)
                        if self.isDiskAlert and disk_flag:
                            # Cleared so the e-mail is not sent continuously.
                            disk_flag = False
                            thread = threading.Thread(target=notification,
                                                      args=(msg, ))
                            thread.start()
                    else:
                        disk_flag = True

            time.sleep(5)
        except Exception:
            logger.error(traceback.format_exc())
            time.sleep(1)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: leeyoshinari
import os
import time
import asyncio
import traceback
from aiohttp import web
from common import get_ip
from logger import logger, cfg
from performance_monitor import PerMon, port_to_pid

permon = PerMon()
# Use the configured host when it is set, otherwise the address from get_ip().
HOST = cfg.getServer('host') or get_ip()


async def index(request):
    """
    Home page. Basic data about the server can be displayed by visiting
    http://ip:port

    :param request: incoming request (not read)
    :return: plain-text response describing the server
    """
    summary = (
        f'The server system version is {permon.system_version}, {permon.cpu_info}, total memory is {permon.total_mem}G, '
        f'the network card is {permon.nic}, bandwidth is {permon.network_speed}Mb/s, {len(permon.all_disk)} disks, '
        f'total size of disks is {permon.total_disk_h}, disks number is {"、".join(permon.all_disk)}. '
        f'If you need to stop the monitoring client, please visit http://{HOST}:{cfg.getServer("port")}/stop'
    )
    return web.Response(body=summary)
def write_system_cpu_mem_and_register_clear(self, is_system,
                                            request_timeout=10):
    """
    Monitor system CPU usage, free memory and disk IO, and run the timed
    tasks. Never returns.

    Each loop iteration:

    * once more than 5 seconds have passed since the last successful
      registration, POST this machine's description to the master's
      ``/Register`` endpoint; after a registration, if the current
      ``HH:MM`` equals ``self.timeSetting``, call ``self.clear_port()``;
    * once more than 300 seconds have passed since the last disk check,
      refresh the disk usage and warn / notify when it exceeds
      ``self.maxDiskUsage``;
    * when ``self.is_system`` is set, sample the system through
      ``self.get_system_cpu_io_speed()``, write the point to InfluxDB and
      raise CPU / memory alerts (optionally clearing the cache).

    :param is_system: unused
    :param request_timeout: seconds to wait for the master on each
                            registration; without a timeout a stalled
                            master would freeze this whole loop, system
                            monitoring included
    :return: never returns
    """
    cpu_flag = True     # may an e-mail be sent when the CPU is too high?
    mem_flag = True     # may an e-mail be sent when memory is too low?
    echo = True         # may the cache be cleared?
    disk_flag = True    # may an e-mail be sent when disk usage is too high?

    line = [{
        'measurement': self.IP,
        'tags': {
            'type': 'system'
        },
        'fields': {
            'cpu': 0.0,
            'mem': 0.0,
            'rec': 0.0,
            'trans': 0.0,
            'net': 0.0,
            'tcp': 0,
            'retrans': 0.0
        }
    }]
    for disk in self.all_disk:
        # Two disk-name formats have been seen so far, 'sda' and 'sda-1'.
        # influxdb queries cannot handle '-', so it is removed. Other
        # formats remain to be verified.
        disk_n = disk.replace('-', '')
        line[0]['fields'].update({disk_n: 0.0})
        line[0]['fields'].update({disk_n + '_r': 0.0})
        line[0]['fields'].update({disk_n + '_w': 0.0})

    disk_usage = self.get_used_disk_rate()

    # Parameters used to register this machine.
    url = f'http://{cfg.getMaster("host")}:{cfg.getMaster("port")}/Register'
    header = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/json; charset=UTF-8"
    }
    post_data = {
        'host': self.IP,
        'port': cfg.getServer('port'),
        'system': self.system_version,
        'cpu': self.cpu_cores,
        'cpu_usage': 0.0,
        'nic': self.nic,
        'network_speed': self.network_speed,
        'mem': round(self.total_mem, 2),
        'mem_usage': 0.0,
        'disk_size': self.total_disk_h,
        'disk_usage': disk_usage,
        'disks': ','.join(self.all_disk)
    }
    start_time = time.time()
    disk_start_time = time.time()

    while True:
        if time.time() - start_time > 5:  # register this machine every 5 seconds
            try:
                res = requests.post(url=url, json=post_data, headers=header,
                                    timeout=request_timeout)
                logger.info(
                    f"客户端注册结果:{res.content.decode('unicode_escape')}")
                start_time = time.time()
                if time.strftime('%H:%M') == self.timeSetting:
                    # Daily clean-up of the expired ports.
                    logger.debug('正常清理停止监控的端口')
                    self.clear_port()
            except Exception:
                logger.error(traceback.format_exc())

        if time.time() - disk_start_time > 300:  # check disk usage every 5 minutes
            disk_usage = self.get_used_disk_rate()
            if disk_usage:
                post_data['disk_usage'] = disk_usage  # disk usage, without %
                disk_start_time = time.time()

                if self.maxDiskUsage < disk_usage:
                    msg = f"{self.IP} 当前系统磁盘空间使用率为{disk_usage/100:.2f}%,请注意磁盘空间"
                    logger.warning(msg)
                    if self.isDiskAlert and disk_flag:
                        # Cleared so e-mails are not sent continuously.
                        disk_flag = False
                        thread = threading.Thread(
                            target=notification,
                            args=(msg, ))  # send the e-mail in a thread
                        thread.start()
                else:
                    disk_flag = True

        if self.is_system:  # start monitoring
            try:
                # System CPU, memory, disk IO and bandwidth.
                res = self.get_system_cpu_io_speed()

                if res['disk'] and res['cpu'] is not None and res[
                        'mem'] is not None:
                    for k, v in res['disk'].items():
                        # Disk IO value written to the database, capped at 100.
                        line[0]['fields'][k] = min(v, 100.0)

                    for k, v in res['disk_r'].items():
                        line[0]['fields'][k] = v

                    for k, v in res['disk_w'].items():
                        line[0]['fields'][k] = v

                    line[0]['fields']['cpu'] = res['cpu']
                    line[0]['fields']['mem'] = res['mem']
                    line[0]['fields']['rec'] = res['rece']
                    line[0]['fields']['trans'] = res['trans']
                    line[0]['fields']['net'] = res['network']
                    line[0]['fields']['tcp'] = res['tcp']
                    line[0]['fields']['retrans'] = res['retrans']
                    self.client.write_points(line)  # write the sample to the database
                    logger.info(
                        f"system: CpuAndMem,{res['cpu']},{res['mem']},{res['disk']},{res['disk_r']},{res['disk_w']},"
                        f"{res['rece']},{res['trans']},{res['network']}, {res['tcp']}, {res['retrans']}"
                    )

                    # Sliding window of recent CPU samples.
                    if len(self.last_cpu_io) > self.CPUDuration:
                        self.last_cpu_io.pop(0)

                    self.last_cpu_io.append(res['cpu'])
                    cpu_usage = sum(self.last_cpu_io) / len(
                        self.last_cpu_io)
                    post_data['cpu_usage'] = cpu_usage  # CPU usage, with %
                    post_data['mem_usage'] = 1 - res[
                        'mem'] / self.total_mem  # memory usage, without %

                    if cpu_usage > self.maxCPU:
                        msg = f'{self.IP} 当前CPU平均使用率为{cpu_usage}%,CPU使用率过高'
                        logger.warning(msg)
                        if self.isCPUAlert and cpu_flag:
                            # Cleared so e-mails are not sent continuously.
                            cpu_flag = False
                            thread = threading.Thread(
                                target=notification,
                                args=(msg, ))  # send the e-mail in a thread
                            thread.start()
                    else:
                        cpu_flag = True  # CPU is normal again: reset the flag

                    if res['mem'] <= self.minMem:
                        msg = f"{self.IP} 当前系统剩余内存为{res['mem']}G,内存过低"
                        logger.warning(msg)
                        if self.isMemAlert and mem_flag:
                            # Cleared so e-mails are not sent continuously.
                            mem_flag = False
                            thread = threading.Thread(
                                target=notification,
                                args=(msg, ))  # send the e-mail in a thread
                            thread.start()

                        if self.echo and echo:
                            # Cleared so the cache is not cleared continuously.
                            echo = False
                            thread = threading.Thread(
                                target=self.clear_cache,
                                args=())  # clear the cache in a thread
                            thread.start()
                    else:
                        # Memory is normal again: reset the flags.
                        mem_flag = True
                        echo = True

            except Exception:
                logger.error(traceback.format_exc())

            time.sleep(self.system_interval)
        else:
            time.sleep(3)
def __init__(self):
    """
    Read the configuration, initialise the monitoring state, call the
    ``get_*`` helpers that fill in the system attributes, create the task
    queue, thread pool and InfluxDB client, then call ``self.monitor()``.
    """
    self.IP = get_ip()

    # A negative pool size in the configuration is treated as 0.
    configured_pool = cfg.getServer('threadPool')
    self.thread_pool = max(configured_pool, 0)

    # Parallel lists: port, pid, monitoring status, monitoring start time.
    self._msg = {'port': [], 'pid': [], 'isRun': [], 'startTime': []}

    # --- monitor settings -------------------------------------------------
    self.is_system = cfg.getMonitor('isMonSystem')  # monitor the server's resources?
    self.error_duration = cfg.getMonitor('errorDuration')  # number of command failures
    self.sleepTime = cfg.getMonitor('sleepTime')
    self.maxCPU = cfg.getMonitor('maxCPU')
    self.CPUDuration = cfg.getMonitor('CPUDuration')
    self.isCPUAlert = cfg.getMonitor('isCPUAlert')
    self.minMem = cfg.getMonitor('minMem')
    self.isMemAlert = cfg.getMonitor('isMemAlert')
    self.frequencyFGC = cfg.getMonitor('frequencyFGC')
    self.isJvmAlert = cfg.getMonitor('isJvmAlert')
    self.echo = cfg.getMonitor('echo')
    self.isDiskAlert = cfg.getMonitor('isDiskAlert')
    self.maxDiskUsage = cfg.getMonitor('maxDiskUsage') / 100
    self.isTCP = cfg.getMonitor('isTCP')
    self.timeSetting = cfg.getMonitor('timeSetting')

    # --- sampling intervals -----------------------------------------------
    # Interval between two runs of the monitoring commands. Configured
    # values below 1 count as 1.
    system_interval = cfg.getMonitor('system_interval')
    port_interval = cfg.getMonitor('port_interval')
    # 1.1 is the time spent running and writing to the database; floor at 0.
    self.system_interval = max(max(system_interval, 1) - 1.1, 0)
    # 0.02 is the time spent running and writing to the database.
    self.port_interval = max(port_interval, 1) - 0.02

    # --- system facts, filled in by the get_* calls below ------------------
    self.system_version = ''    # system version
    self.cpu_info = ''
    self.cpu_cores = 0          # number of CPU cores
    self.total_mem = 0          # total memory, unit: G
    self.total_mem_100 = 0      # total memory, unit: 100*G; used for the memory ratio to save arithmetic
    self.nic = ''               # network card currently in use
    self.all_disk = []          # disk numbers
    self.total_disk = 1         # total disk size, unit: M
    self.total_disk_h = 0       # total disk size, human readable, unit: T or G
    self.network_speed = 1      # bandwidth of the server's network card

    self.get_system_version()
    self.get_cpu_cores()
    self.get_total_mem()
    self.get_system_nic()
    self.get_disks()
    self.get_system_net_speed()
    self.get_total_disk_size()

    # --- runtime machinery -------------------------------------------------
    self.monitor_task = queue.Queue()  # FIFO queue
    # +1 worker for system monitoring.
    self.executor = ThreadPoolExecutor(self.thread_pool + 1)
    # Database connection.
    self.client = influxdb.InfluxDBClient(
        cfg.getInflux('host'),
        cfg.getInflux('port'),
        cfg.getInflux('username'),
        cfg.getInflux('password'),
        cfg.getInflux('database'))

    self.FGC = {}           # full GC count of each port
    self.FGC_time = {}      # time of every full GC of each port
    self.last_cpu_io = []   # recent CPU values, about 100s
    self.is_java = {}       # whether a monitored port is a java service, 0 or 1

    self.monitor()