예제 #1
0
async def main():
    """Create the aiohttp application, register its routes and start the TCP site."""
    app = web.Application()

    # (HTTP method, URL path, handler) for every endpoint, in registration order.
    routes = (
        ('GET', '/', index),
        ('GET', '/checkStatus', check_status),
        ('POST', '/runMonitor', run_monitor),
        ('POST', '/getMonitor', get_monitor),
        ('GET', '/getGC/{port}', get_gc),
    )
    for method, path, handler in routes:
        app.router.add_route(method, path, handler)

    runner = web.AppRunner(app)
    await runner.setup()

    # Listen on the host/port taken from the server section of the configuration.
    site = web.TCPSite(runner, cfg.getServer('host'), cfg.getServer('port'))
    await site.start()
예제 #2
0
async def main():
    """Create the aiohttp application with templates, static files and routes, then start the TCP site."""
    app = web.Application()

    # Add the 'templates' directory to the template search path.
    template_loader = jinja2.FileSystemLoader('templates')
    aiohttp_jinja2.setup(app, loader=template_loader)

    # Serve static files from the 'static' directory located next to this file.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    app.router.add_static('/static/', path=os.path.join(base_dir, 'static'))

    # (HTTP method, URL path, handler) for every endpoint, in registration order.
    routes = (
        ('GET', '/', index),
        ('GET', '/startMonitor', start_monitor),
        ('GET', '/getMonitor/{host}', get_monitor),
        ('GET', '/Visualize', visualize),
        ('GET', '/getPortAndDisk/{host}', get_port_disk),
        ('POST', '/Register', registers),
        ('POST', '/runMonitor', run_monitor),
        ('POST', '/plotMonitor', plot_monitor),
        ('POST', '/Notification', notice),
    )
    for method, path, handler in routes:
        app.router.add_route(method, path, handler)

    runner = web.AppRunner(app)
    await runner.setup()

    # The site binds to the address returned by get_ip(), not to the configured host.
    site = web.TCPSite(runner, get_ip(), cfg.getServer('port'))
    await site.start()
예제 #3
0
async def stop_monitor(request):
    """
    Stop the client by killing the process found for the configured server port.

    :param request: incoming request (not inspected)
    :return: plain-text response saying whether a process was found and a kill was issued
    """
    pid = port_to_pid(cfg.getServer('port'))
    if not pid:
        return web.Response(body='Client is not running!')

    # The kill runs through a shell command; its result is not checked before
    # the success message is logged and returned.
    _ = os.popen(f'kill -9 {pid}')
    logger.info('Stop the client successfully!')
    return web.Response(body='Stop the client successfully!')
예제 #4
0
async def get_monitor(request):
    """
    Return the list of monitored ports.

    :param request: request whose JSON body is read for the 'host' key
    :return: JSON response with 'code', 'msg' and 'data'
    """
    payload = await request.json()
    host = payload.get('host')

    # The request must name this server's configured host.
    if host != cfg.getServer('host'):
        logger.error('请求参数异常')
        empty = {'host': host, 'port': None, 'pid': None}
        return web.json_response({'code': 2, 'msg': '请求参数异常', 'data': empty})

    msg = permon.start
    monitored = len(msg['port'])
    if monitored > 0:  # at least one port has been monitored
        # One 'host' entry per monitored port, followed by the monitor's own fields.
        result = {'host': [host] * monitored, **msg}
        return web.json_response({'code': 0, 'msg': '操作成功', 'data': result})

    logger.error('暂未监控任何端口')
    empty = {'host': host, 'port': None, 'pid': None}
    return web.json_response({'code': 1, 'msg': '暂未监控任何端口', 'data': empty})
def notification(msg, timeout=30):
    """
    Send an e-mail notification by POSTing to the master's /Notification endpoint.

    :param msg: text of the e-mail body
    :param timeout: seconds to wait for the master before giving up
    :return: None; the outcome is only written to the log
    """
    url = f'http://{cfg.getMaster("host")}:{cfg.getMaster("port")}/Notification'

    header = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate",
        "Content-Type": "application/json; charset=UTF-8"
    }
    post_data = {'host': cfg.getServer('host'), 'msg': msg}
    logger.debug(f'发送邮件信息的内容:{msg}')

    try:
        # Without a timeout requests waits indefinitely on an unresponsive
        # master, which would leave the calling thread hanging forever.
        res = requests.post(url=url,
                            json=post_data,
                            headers=header,
                            timeout=timeout)
    except requests.RequestException as err:
        # Connection errors and timeouts are reported like any other failure
        # instead of escaping from the caller's thread.
        logger.error(f'邮件发送失败: {err}')
        return

    if res.status_code != 200:
        logger.error('邮件发送失败')
        return

    response = json.loads(res.content.decode())
    if response['code'] == 0:
        logger.info('邮件发送成功')
    else:
        logger.error(response['msg'])
예제 #6
0
def _run_monitor_reply(code, msg, host, port, pid):
    """Build the JSON response shared by every run_monitor outcome."""
    return web.json_response({
        'code': code,
        'msg': msg,
        'data': {
            'host': host,
            'port': port,
            'pid': pid
        }
    })


async def run_monitor(request):
    """
    Start or stop monitoring a port.

    The JSON body is read for 'host', 'port', 'net' and 'isRun'
    ('1' starts monitoring, '0' stops it).

    :param request: incoming request carrying the JSON body
    :return: JSON response; code 0 on success, 1 when the port is not running
        or not monitored, 2 on invalid parameters or an unexpected error
    """
    try:
        data = await request.json()
        host = data.get('host')
        port = data.get('port')
        network = data.get('net')
        is_run = data.get('isRun')

        # The request must name this server's configured host and give a port.
        if host != cfg.getServer('host') or not port:
            logger.error('请求参数异常')
            return _run_monitor_reply(2, '请求参数异常', host, port, None)

        pid = port_to_pid(port)  # look up the process id by port number
        if pid is None:
            msg = f"端口 {port} 未启动!"
            logger.warning(msg)
            return _run_monitor_reply(1, msg, host, port, None)

        if is_run == '0':  # stop monitoring
            if port not in permon.stop['port']:  # the port was never monitored
                msg = f"端口 {port} 未监控,请先监控!"
                logger.warning(msg)
                return _run_monitor_reply(1, msg, host, port, pid)

            permon.stop = {
                'port': port,
                'pid': pid,
                'net': network,
                'is_run': 0
            }
            logger.info('停止监控成功!')
            return _run_monitor_reply(0, '停止监控成功!', host, port, pid)

        if is_run == '1':  # start monitoring
            permon.start = {'port': port, 'pid': pid, 'is_run': 1}
            logger.info('开始监控成功!')
            return _run_monitor_reply(0, '开始监控成功!', host, port, pid)

        # Any other 'isRun' value used to fall through and return None, which
        # is not a valid handler result; answer with a parameter error instead.
        logger.error('请求参数异常')
        return _run_monitor_reply(2, '请求参数异常', host, port, None)

    except Exception as err:
        logger.error(err)
        logger.error(traceback.format_exc())
        # An exception object is not JSON serializable, so send its text.
        return _run_monitor_reply(2, str(err), cfg.getServer('host'), None,
                                  None)
예제 #7
0
    def __init__(self):
        """Load the configuration, collect static system information, create the
        task queue, thread pool and InfluxDB client, then call self.monitor()."""
        monitor_cfg = cfg.getMonitor
        influx_cfg = cfg.getInflux

        self.check_sysstat_version()
        self.IP = get_ip()
        # A negative configured pool size is treated as 0.
        self.thread_pool = max(cfg.getServer('threadPool'), 0)
        # port, pid, status, startTime
        self._msg = {'port': [], 'pid': [], 'isRun': [], 'startTime': []}

        # ---- settings from the monitor section of the configuration ----
        self.is_system = monitor_cfg('isMonSystem')  # Whether to monitor the server system
        self.error_times = monitor_cfg('errorTimes')
        self.sleepTime = monitor_cfg('sleepTime')
        self.maxCPU = monitor_cfg('maxCPU')
        self.CPUDuration = monitor_cfg('CPUDuration')
        self.isCPUAlert = monitor_cfg('isCPUAlert')
        self.minMem = monitor_cfg('minMem')
        self.isMemAlert = monitor_cfg('isMemAlert')
        self.isPidAlert = monitor_cfg('isPidAlert')
        self.errorTimesOfPid = monitor_cfg('errorTimesOfPid')
        self.frequencyFGC = monitor_cfg('frequencyFGC')
        self.isJvmAlert = monitor_cfg('isJvmAlert')
        self.echo = monitor_cfg('echo')
        self.isDiskAlert = monitor_cfg('isDiskAlert')
        self.maxDiskUsage = monitor_cfg('maxDiskUsage') / 100
        self.isTCP = monitor_cfg('isTCP')
        self.timeSetting = monitor_cfg('timeSetting')

        # ---- sampling intervals ----
        # Configured values below 1 count as 1; the program's own running time
        # (1.1 / 1.03) is then subtracted and the result is never negative.
        system_interval = monitor_cfg('system_interval')
        port_interval = monitor_cfg('port_interval')
        self.system_interval = max(max(system_interval, 1) - 1.1, 0)
        self.port_interval = max(max(port_interval, 1) - 1.03, 0)

        # ---- system facts, filled in by the get_* calls below ----
        self.system_version = ''  # system version
        self.cpu_info = ''
        self.cpu_usage = 0.0  # CPU usage
        self.cpu_cores = 0  # number of CPU cores
        self.mem_usage = 0.0  # memory usage
        self.total_mem = 0  # total memory, unit: G
        self.total_mem_100 = 0  # total memory, unit: 100*G
        self.nic = ''  # network card
        self.all_disk = []  # disk number
        self.total_disk = 1  # total disk size, unit: M
        self.total_disk_h = 0  # total disk size, unit: T or G
        self.network_speed = cfg.getServer('nicSpeed')  # bandwidth

        self.get_system_version()
        self.get_cpu_cores()
        self.get_total_mem()
        self.get_system_nic()
        self.get_disks()
        self.get_system_net_speed()
        self.get_total_disk_size()

        # ---- runtime machinery ----
        self.monitor_task = queue.Queue()  # FIFO queue
        # Thread pool; the extra 2 workers are for monitoring the system and
        # for the registration service.
        self.executor = ThreadPoolExecutor(self.thread_pool + 2)
        # influxdb connection
        self.client = influxdb.InfluxDBClient(influx_cfg('host'),
                                              influx_cfg('port'),
                                              influx_cfg('username'),
                                              influx_cfg('password'),
                                              influx_cfg('database'))

        self.FGC = {}  # full gc times
        self.FGC_time = {}  # full gc time
        self.last_cpu_io = []  # recent cpu usage
        self.is_java = {}  # whether is java, 0 or 1

        self.monitor()
예제 #8
0
    def register_agent(self, disk_flag=True):
        """
        Timed task that loops forever. It registers this agent with the master,
        cleans up the ports that stopped monitoring, and checks disk usage.

        The loop sleeps 5 seconds per pass; registration is sent once more than
        8 seconds have elapsed since the last one, and the disk usage is
        re-read once more than 300 seconds have elapsed.

        :param disk_flag: Whether to send email when disk space usage is too high.
        :return: never returns
        """
        url = f'http://{cfg.getMaster("host")}:{cfg.getMaster("port")}/Register'
        header = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Content-Type": "application/json; charset=UTF-8"
        }
        post_data = {
            'host': self.IP,
            'port': cfg.getServer('port'),
            'system': self.system_version,
            'cpu': self.cpu_cores,
            'cpu_usage': self.cpu_usage,
            'nic': self.nic,
            'network_speed': self.network_speed,
            'mem': round(self.total_mem, 2),
            'mem_usage': self.mem_usage,
            'disk_size': self.total_disk_h,
            'disk_usage': self.get_used_disk_rate(),
            'disks': ','.join(self.all_disk)
        }
        # Seconds to wait for the master. Without a timeout an unresponsive
        # master would block this loop indefinitely.
        request_timeout = 10
        start_time = time.time()
        disk_start_time = time.time()

        while True:
            try:
                if time.time() - start_time > 8:  # register
                    post_data['cpu_usage'] = self.cpu_usage
                    post_data['mem_usage'] = self.mem_usage
                    res = requests.post(url=url,
                                        json=post_data,
                                        headers=header,
                                        timeout=request_timeout)
                    logger.info(
                        f"The result of registration is {res.content.decode('unicode_escape')}"
                    )
                    start_time = time.time()
                    if time.strftime('%H:%M') == self.timeSetting:  # clean up
                        logger.debug(
                            'Cleaning up the ports that stopped monitoring.')
                        self.clear_port()

                if time.time() - disk_start_time > 300:
                    disk_usage = self.get_used_disk_rate()
                    if disk_usage:
                        post_data[
                            'disk_usage'] = disk_usage  # disk space usage, without %
                        disk_start_time = time.time()

                        if self.maxDiskUsage < disk_usage:
                            # NOTE(review): disk_usage is compared directly with
                            # self.maxDiskUsage but divided by 100 for display;
                            # confirm the unit returned by get_used_disk_rate().
                            msg = f"The disk space usage is {disk_usage/100:.2f}%, it is too high. Server IP is {self.IP}"
                            logger.warning(msg)
                            if self.isDiskAlert and disk_flag:
                                disk_flag = False  # Set to False to prevent sending email continuously
                                thread = threading.Thread(target=notification,
                                                          args=(msg, ))
                                thread.start()
                        else:
                            disk_flag = True  # usage is back to normal, allow alerts again

                time.sleep(5)

            except Exception:
                # Log and retry shortly; a failed pass must not end the task.
                logger.error(traceback.format_exc())
                time.sleep(1)
예제 #9
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: leeyoshinari

import os
import time
import asyncio
import traceback
from aiohttp import web
from common import get_ip
from logger import logger, cfg
from performance_monitor import PerMon, port_to_pid

# Monitor instance used by the request handlers in this module.
permon = PerMon()
# Host shown to users: the configured host, or the result of get_ip() when none is configured.
HOST = cfg.getServer('host') or get_ip()


async def index(request):
    """
    Home page; basic data about the server is shown when visiting http://ip:port.

    :param request: incoming request (not inspected)
    :return: plain-text response summarising the system information held by permon
    """
    summary = (
        f'The server system version is {permon.system_version}, {permon.cpu_info}, total memory is {permon.total_mem}G, '
        f'the network card is {permon.nic}, bandwidth is {permon.network_speed}Mb/s, {len(permon.all_disk)} disks, '
        f'total size of disks is {permon.total_disk_h}, disks number is {"、".join(permon.all_disk)}. '
        f'If you need to stop the monitoring client, please visit http://{HOST}:{cfg.getServer("port")}/stop'
    )
    return web.Response(body=summary)
예제 #10
0
    def write_system_cpu_mem_and_register_clear(self, is_system):
        """
        Monitor the system's CPU usage, free memory and disk IO.

        The same loop also runs two timed tasks: registering this host with
        the server, and cleaning up expired ports that stopped monitoring.

        :param is_system: unused
        :return: never returns; loops forever
        """
        cpu_flag = True  # whether an e-mail may be sent when CPU is too high
        mem_flag = True  # whether an e-mail may be sent when memory is too low
        echo = True  # whether the cache may be cleared
        disk_flag = True  # whether an e-mail may be sent when disk usage is too high
        # Seconds to wait for the server when registering. Without a timeout
        # an unresponsive server would stall the whole monitoring loop.
        request_timeout = 10

        line = [{
            'measurement': self.IP,
            'tags': {
                'type': 'system'
            },
            'fields': {
                'cpu': 0.0,
                'mem': 0.0,
                'rec': 0.0,
                'trans': 0.0,
                'net': 0.0,
                'tcp': 0,
                'retrans': 0.0
            }
        }]
        for disk in self.all_disk:
            # Two disk name formats have been seen so far, 'sda' and 'sda-1'.
            # influxdb cannot recognise '-' when querying, hence the replace.
            # Other formats are yet to be verified.
            disk_n = disk.replace('-', '')
            line[0]['fields'].update({disk_n: 0.0})
            line[0]['fields'].update({disk_n + '_r': 0.0})
            line[0]['fields'].update({disk_n + '_w': 0.0})

        disk_usage = self.get_used_disk_rate()
        # Parameters used to register this host
        url = f'http://{cfg.getMaster("host")}:{cfg.getMaster("port")}/Register'
        header = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate",
            "Content-Type": "application/json; charset=UTF-8"
        }
        post_data = {
            'host': self.IP,
            'port': cfg.getServer('port'),
            'system': self.system_version,
            'cpu': self.cpu_cores,
            'cpu_usage': 0.0,
            'nic': self.nic,
            'network_speed': self.network_speed,
            'mem': round(self.total_mem, 2),
            'mem_usage': 0.0,
            'disk_size': self.total_disk_h,
            'disk_usage': disk_usage,
            'disks': ','.join(self.all_disk)
        }
        start_time = time.time()
        disk_start_time = time.time()

        while True:
            if time.time() - start_time > 5:  # register this host every 5 seconds
                try:
                    res = requests.post(url=url,
                                        json=post_data,
                                        headers=header,
                                        timeout=request_timeout)
                    logger.info(
                        f"客户端注册结果:{res.content.decode('unicode_escape')}")
                    start_time = time.time()
                    if time.strftime(
                            '%H:%M') == self.timeSetting:  # daily clean-up of expired ports
                        logger.debug('正常清理停止监控的端口')
                        self.clear_port()
                except Exception:
                    # Exception rather than a bare except, so that SystemExit and
                    # KeyboardInterrupt are not swallowed by the loop.
                    logger.error(traceback.format_exc())

            if time.time() - disk_start_time > 300:  # read the disk usage every 5 minutes
                disk_usage = self.get_used_disk_rate()
                if disk_usage:
                    post_data['disk_usage'] = disk_usage  # disk usage, without the % sign
                    disk_start_time = time.time()

                    if self.maxDiskUsage < disk_usage:
                        msg = f"{self.IP} 当前系统磁盘空间使用率为{disk_usage/100:.2f}%,请注意磁盘空间"
                        logger.warning(msg)
                        if self.isDiskAlert and disk_flag:
                            disk_flag = False  # set to False so e-mails are not sent continuously
                            thread = threading.Thread(
                                target=notification,
                                args=(msg, ))  # send the e-mail notification in a thread
                            thread.start()
                    else:
                        disk_flag = True

            if self.is_system:  # start monitoring
                try:
                    res = self.get_system_cpu_io_speed()  # system CPU, memory, disk IO and bandwidth

                    if res['disk'] and res['cpu'] is not None and res[
                            'mem'] is not None:
                        for k, v in res['disk'].items():
                            line[0]['fields'][k] = min(v, 100.0)  # disk IO data, capped at 100.0

                        for k, v in res['disk_r'].items():
                            line[0]['fields'][k] = v

                        for k, v in res['disk_w'].items():
                            line[0]['fields'][k] = v

                        line[0]['fields']['cpu'] = res['cpu']
                        line[0]['fields']['mem'] = res['mem']
                        line[0]['fields']['rec'] = res['rece']
                        line[0]['fields']['trans'] = res['trans']
                        line[0]['fields']['net'] = res['network']
                        line[0]['fields']['tcp'] = res['tcp']
                        line[0]['fields']['retrans'] = res['retrans']
                        self.client.write_points(line)  # write the collected fields to the database
                        logger.info(
                            f"system: CpuAndMem,{res['cpu']},{res['mem']},{res['disk']},{res['disk_r']},{res['disk_w']},"
                            f"{res['rece']},{res['trans']},{res['network']}, {res['tcp']}, {res['retrans']}"
                        )

                        # Drop the oldest sample once more than CPUDuration samples are held.
                        if len(self.last_cpu_io) > self.CPUDuration:
                            self.last_cpu_io.pop(0)

                        self.last_cpu_io.append(res['cpu'])
                        cpu_usage = sum(self.last_cpu_io) / len(
                            self.last_cpu_io)
                        post_data['cpu_usage'] = cpu_usage  # CPU usage, with the % sign
                        post_data['mem_usage'] = 1 - res[
                            'mem'] / self.total_mem  # memory usage, without the % sign

                        if cpu_usage > self.maxCPU:
                            msg = f'{self.IP} 当前CPU平均使用率为{cpu_usage}%,CPU使用率过高'
                            logger.warning(msg)
                            if self.isCPUAlert and cpu_flag:
                                cpu_flag = False  # set to False so e-mails are not sent continuously
                                thread = threading.Thread(
                                    target=notification,
                                    args=(msg, ))  # send the e-mail notification in a thread
                                thread.start()
                        else:
                            cpu_flag = True  # CPU is back to normal, reset the flag

                        if res['mem'] <= self.minMem:
                            msg = f"{self.IP} 当前系统剩余内存为{res['mem']}G,内存过低"
                            logger.warning(msg)
                            if self.isMemAlert and mem_flag:
                                mem_flag = False  # set to False so e-mails are not sent continuously
                                thread = threading.Thread(
                                    target=notification,
                                    args=(msg, ))  # send the e-mail notification in a thread
                                thread.start()

                            if self.echo and echo:
                                echo = False  # set to False so the cache is not cleared continuously
                                thread = threading.Thread(
                                    target=self.clear_cache,
                                    args=())  # clear the cache in a thread
                                thread.start()

                        else:
                            # Memory is back to normal, reset the flags
                            mem_flag = True
                            echo = True

                except Exception:
                    logger.error(traceback.format_exc())

                time.sleep(self.system_interval)
            else:
                time.sleep(3)
예제 #11
0
    def __init__(self):
        """Load the configuration, collect static system information, create the
        task queue, thread pool and InfluxDB client, then call self.monitor()."""
        monitor_cfg = cfg.getMonitor
        influx_cfg = cfg.getInflux

        self.IP = get_ip()
        # A negative configured pool size is treated as 0.
        self.thread_pool = max(cfg.getServer('threadPool'), 0)
        # port, pid, monitoring status, time monitoring started
        self._msg = {'port': [], 'pid': [], 'isRun': [], 'startTime': []}

        # ---- settings from the monitor section of the configuration ----
        self.is_system = monitor_cfg('isMonSystem')  # whether to monitor the server's resources
        self.error_duration = monitor_cfg('errorDuration')  # number of failed command executions
        self.sleepTime = monitor_cfg('sleepTime')
        self.maxCPU = monitor_cfg('maxCPU')
        self.CPUDuration = monitor_cfg('CPUDuration')
        self.isCPUAlert = monitor_cfg('isCPUAlert')
        self.minMem = monitor_cfg('minMem')
        self.isMemAlert = monitor_cfg('isMemAlert')
        self.frequencyFGC = monitor_cfg('frequencyFGC')
        self.isJvmAlert = monitor_cfg('isJvmAlert')
        self.echo = monitor_cfg('echo')
        self.isDiskAlert = monitor_cfg('isDiskAlert')
        self.maxDiskUsage = monitor_cfg('maxDiskUsage') / 100
        self.isTCP = monitor_cfg('isTCP')
        self.timeSetting = monitor_cfg('timeSetting')

        # ---- intervals between executions of the monitoring commands ----
        # Configured values below 1 count as 1. The time spent running the
        # program and writing to the database is then subtracted.
        system_interval = monitor_cfg('system_interval')
        port_interval = monitor_cfg('port_interval')
        self.system_interval = max(max(system_interval, 1) - 1.1, 0)
        self.port_interval = max(port_interval, 1) - 0.02

        # ---- system facts, filled in by the get_* calls below ----
        self.system_version = ''  # system version
        self.cpu_info = ''
        self.cpu_cores = 0  # number of CPU cores
        self.total_mem = 0  # total memory, unit: G
        self.total_mem_100 = 0  # total memory, unit: 100*G; mainly for the memory ratio, to reduce computation
        self.nic = ''  # network card currently used by the system
        self.all_disk = []  # disk numbers
        self.total_disk = 1  # total disk size, unit: M
        self.total_disk_h = 0  # total disk size in human-readable form, unit: T or G
        self.network_speed = 1  # bandwidth of the server's network card

        self.get_system_version()
        self.get_cpu_cores()
        self.get_total_mem()
        self.get_system_nic()
        self.get_disks()
        self.get_system_net_speed()
        self.get_total_disk_size()

        # ---- runtime machinery ----
        self.monitor_task = queue.Queue()  # FIFO queue
        # Thread pool; the extra worker is for monitoring the system.
        self.executor = ThreadPoolExecutor(self.thread_pool + 1)
        # database connection
        self.client = influxdb.InfluxDBClient(influx_cfg('host'),
                                              influx_cfg('port'),
                                              influx_cfg('username'),
                                              influx_cfg('password'),
                                              influx_cfg('database'))

        self.FGC = {}  # number of full gc per port
        self.FGC_time = {}  # time of each full gc per port
        self.last_cpu_io = []  # recent cpu values, about 100s
        self.is_java = {}  # whether the monitored port is a java service, 0 or 1

        self.monitor()