示例#1
0
    def test_get_and_stop_and_kill_session(self):
        """Stop the context's session, then stop and kill it via new handles.

        The session obtained from ``self.ctx`` is stopped first.  Two further
        ``ErSession`` objects are then built from the same session id; the
        first is stopped again and the second is killed.  The test makes no
        assertions, so it passes as long as none of these calls raises.
        """
        live_session = self.ctx.get_session()
        session_id = live_session.get_session_id()

        live_session.stop()

        from eggroll.core.session import ErSession

        # Handle created from the id of the already stopped session.
        reattached = ErSession(session_id)
        reattached.stop()

        # A second fresh handle on the same id, this time killed.
        reattached = ErSession(session_id)
        reattached.kill()
示例#2
0
def check_actual_max_threads():
    """Print memory, swap, disk and system-limit check results per node.

    Creates an ``ErSession``, runs ``getMemInfo`` via ``with_stores`` and
    prints one report for every entry of the result.  The session is killed
    in the ``finally`` clause whether or not the checks succeed.
    """

    def getMemInfo(fn):
        """Gather usage figures and limit checks where this function runs.

        ``fn`` is passed in by ``with_stores`` and is not used here.

        Returns a dict with the memory/swap/disk figures (as produced by
        ``round2`` and percentage strings) plus one boolean per checked
        limit: True when the limit is configured and at least 65535.
        """

        def query_cmd(cmd):
            """Return True if ``cmd`` prints only sufficient numeric limits.

            Lines that are not integers (for example the ``-`` column of a
            commented line in limits.conf) are ignored.  The check fails
            when no numeric line is printed at all, because the limit can
            then not be verified, or when any value is below 65535.
            """
            lines = subprocess.Popen(
                cmd, stdout=subprocess.PIPE,
                shell=True).communicate()[0].decode().strip().split('\n')
            print(lines)
            values = []
            for line in lines:
                try:
                    values.append(int(line))
                except ValueError:
                    # Empty output or non-numeric text: nothing to compare.
                    continue
            if not values:
                return False
            return all(value >= 65535 for value in values)

        mem = psutil.virtual_memory()
        swap_mem = psutil.swap_memory()
        data_disk = psutil.disk_usage('/data')

        mem_info = {
            "MemTotal": round2(mem.total),
            "MemUsed": round2(mem.used),
            "MemUsedPer": str(round(mem.percent)) + '%',
            "SwapTotal": round2(swap_mem.total),
            "SwapUsed": round2(swap_mem.used),
            "SwapUsePer": str(round(swap_mem.percent)) + '%',
            "DiskTotal": round2(data_disk.total),
            "DiskUsed": round2(data_disk.used),
            "DiskPer": str(round(data_disk.percent)) + '%',
        }

        mem_info["/proc/sys/kernel/threads-max"] = query_cmd(
            "cat /proc/sys/kernel/threads-max")
        # /etc/sysctl.conf is checked for two different settings, so each
        # one needs its own key; a shared key would let the second result
        # silently overwrite the first.
        mem_info["/etc/sysctl.conf (kernel.pid_max)"] = query_cmd(
            "grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/kernel/pid_max"] = query_cmd(
            "cat /proc/sys/kernel/pid_max")
        mem_info["/proc/sys/vm/max_map_count"] = query_cmd(
            "cat /proc/sys/vm/max_map_count")

        mem_info["/etc/security/limits.conf"] = query_cmd(
            "cat /etc/security/limits.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd(
            "cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'"
        )
        mem_info["/etc/sysctl.conf (fs.file-max)"] = query_cmd(
            "grep fs.file-max  /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/fs/file-max"] = query_cmd(
            "cat /proc/sys/fs/file-max")

        return mem_info

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            # node[0] identifies the entry, node[1] is getMemInfo's dict.
            info = node[1]
            print_green("==============This is node :" + str(node[0]) +
                        "================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] +
                         ", MemUsed:" + info["MemUsed"] + ", MemUsedPer:" +
                         info["MemUsedPer"])
            print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] +
                         ", SwapUsed:" + info["SwapUsed"] +
                         ", SwapUsePer:" + info["SwapUsePer"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] +
                         ", DiskUsed:" + info["DiskUsed"] + ", DiskPer:" +
                         info["DiskPer"])
            print_green(
                "--------Max user processes and max file count--------")
            # Must list exactly the limit keys filled in by getMemInfo.
            for key in [
                    "/proc/sys/kernel/threads-max",
                    "/etc/sysctl.conf (kernel.pid_max)",
                    "/proc/sys/kernel/pid_max", "/proc/sys/vm/max_map_count",
                    "/etc/security/limits.conf",
                    "/etc/security/limits.d/80-nofile.conf",
                    "/etc/sysctl.conf (fs.file-max)", "/proc/sys/fs/file-max"
            ]:
                if info[key]:
                    print_green("[OK] " + key + " is ok.")
                else:
                    print_red("[ERROR] please check " + key +
                              ", no less than 65535.")
            print("\n")
    finally:
        session.kill()
示例#3
0
文件: env_check.py 项目: xkazm/FATE
def check_actual_max_threads():
    """Print resource usage, system limits and thread counts per node.

    Creates an ``ErSession``, runs ``getMemInfo`` via ``with_stores`` and
    prints one report for every entry of the result: memory/swap/disk
    usage, kernel and file limits (each must be at least 65535), the
    egg_pair thread count against the configured capacity and, when a
    rollsite process was found, its memory usage.  The session is killed in
    the ``finally`` clause whether or not the checks succeed.
    """

    def to_int(text):
        """Return ``int(text)``, or None when ``text`` is not an integer."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    def getMemInfo(fn):
        """Collect the raw figures where this function runs.

        ``fn`` is passed in by ``with_stores`` and is not used here.
        Returns a dict of strings (command output, ``round2`` results and
        percentages); ``RollsiteUsedPercent`` is the integer 0 when no
        rollsite process was found.
        """

        def query_cmd(cmd):
            """Run ``cmd`` in a shell and return the first line of stdout."""
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n')
            return p[0]

        def get_host_ip():
            """Return the local address used for a UDP socket to 8.8.8.8."""
            # The context manager closes the socket on every path; the
            # previous try/finally referenced the socket even when its
            # creation had failed.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(('8.8.8.8', 80))
                return s.getsockname()[0]

        mem = psutil.virtual_memory()
        swap_mem = psutil.swap_memory()
        data_disk = psutil.disk_usage('/data')

        mem_info = {}
        mem_info["Ip"] = get_host_ip()
        mem_info["MemTotal"] = round2(mem.total)
        mem_info["MemUsed"] = round2(mem.used)
        mem_info["MemUsedPCT"] = str(round(mem.percent)) + '%'

        mem_info["SwapTotal"] = round2(swap_mem.total)
        mem_info["SwapUsed"] = round2(swap_mem.used)
        mem_info["SwapUsePCT"] = str(round(swap_mem.percent)) + '%'

        mem_info["DiskTotal"] = round2(data_disk.total)
        mem_info["DiskUsed"] = round2(data_disk.used)
        mem_info["DiskUsedPCT"] = str(round(data_disk.percent)) + '%'

        mem_info["/proc/sys/kernel/threads-max"] = query_cmd("cat /proc/sys/kernel/threads-max")
        # /etc/sysctl.conf is read for two different settings, so each one
        # gets its own key; a shared key would let the second value
        # silently overwrite the first.
        mem_info["/etc/sysctl.conf (kernel.pid_max)"] = query_cmd("grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/kernel/pid_max"] = query_cmd("cat /proc/sys/kernel/pid_max")
        mem_info["/proc/sys/vm/max_map_count"] = query_cmd("cat /proc/sys/vm/max_map_count")

        mem_info["/etc/security/limits.conf"] = query_cmd("cat /etc/security/limits.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd("cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/sysctl.conf (fs.file-max)"] = query_cmd("grep fs.file-max  /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/fs/file-max"] = query_cmd("cat /proc/sys/fs/file-max")

        mem_info["CurrentUseProcesses"] = query_cmd("pstree -p `ps -e |grep egg_pair |awk '{print $1}'` |wc -l")
        mem_info["NodeProcessors"] = query_cmd("grep eggroll.session.processors.per.node ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")
        mem_info["PoolSize"] = query_cmd("grep eggroll.rollpair.eggpair.server.executor.pool.max.size ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")

        rollsite_pid = query_cmd("ps aux | grep ${EGGROLL_HOME} | grep com.webank.eggroll.rollsite.Proxy | grep -v grep | awk '{print $2}'")
        if rollsite_pid:
            rollsite_used_memory = psutil.Process(int(rollsite_pid)).memory_info().rss
            # Read inside a with-block so the file is closed even if the
            # parsing below raises.
            with open(sys.path[1] + '/../../../conf/eggroll.properties') as myfile:
                properties = myfile.read()
            jvm_options = re.findall(r"(?<=MaxHeapSize=).*?(?=G)", properties)
            if jvm_options:
                rollsite_total_memory = int(jvm_options[0]) * 1024 * 1024 * 1024
            else:
                # No MaxHeapSize entry: fall back to total physical memory.
                rollsite_total_memory = mem.total

            # NOTE(review): the denominator is four times the heap size;
            # the reason for the factor is not visible here - confirm.
            mem_info["RollsiteUsedPercent"] = '{:.2%}'.format(rollsite_used_memory / (rollsite_total_memory * 4))
        else:
            mem_info["RollsiteUsedPercent"] = 0

        return mem_info

    session = ErSession(options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            # node[0] identifies the entry, node[1] is getMemInfo's dict.
            info = node[1]
            print_green("==============This is node " + str(node[0]) + ":" + info["Ip"] + "===========================================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] + "G, MemUsed:" + info["MemUsed"] + "G, MemUsedPCT:" + info["MemUsedPCT"])
            if float(info["SwapTotal"]) < 128:
                print_red("[ERROR] The swap memory is:" + info["SwapTotal"] + "G, no less than 128G.")
            else:
                print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] + "G, SwapUsed:" + info["SwapUsed"] + "G, SwapUsePCT:" + info["SwapUsePCT"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] + "G, DiskUsed:" + info["DiskUsed"] + "G, DiskUsedPCT:" + info["DiskUsedPCT"])
            print_green("--------------Max user processes and max file count----------------------------------------")
            # Must list exactly the limit keys filled in by getMemInfo.
            for key in ["/proc/sys/kernel/threads-max", "/etc/sysctl.conf (kernel.pid_max)", "/proc/sys/kernel/pid_max", "/proc/sys/vm/max_map_count", "/etc/security/limits.conf", "/etc/security/limits.d/80-nofile.conf", "/etc/sysctl.conf (fs.file-max)", "/proc/sys/fs/file-max"]:
                raw_value = info[key]
                limit = to_int(raw_value)
                # "no less than 65535" means exactly 65535 is acceptable;
                # empty or non-numeric output is reported, not raised.
                if limit is not None and limit >= 65535:
                    print_green("[OK] " + key + " = " + raw_value)
                else:
                    print_red("[ERROR] please check " + key + " = " + raw_value + ", no less than 65535.")
            print_green("--------------Thread count check-----------------------------------------------------------")
            pool_size = to_int(info["PoolSize"])
            if pool_size is None:
                # Pool size not configured: assume 500.
                pool_size = 500
            thread_count = to_int(info["CurrentUseProcesses"])
            node_processors = to_int(info["NodeProcessors"])
            if thread_count is None or node_processors is None:
                print_red("[ERROR] cannot read the thread count or eggroll.session.processors.per.node, please check ${EGGROLL_HOME}/conf/eggroll.properties.")
            elif thread_count < node_processors * pool_size:
                print_green("[OK] The thread count = %s, the total processes = %s * %s = %i" % (info["CurrentUseProcesses"], info["NodeProcessors"], pool_size, node_processors * pool_size))
            else:
                print_red("[ERROR] The thread count = %s, the total processes = %s * %s = %i. eggroll.rollpair.eggpair.server.executor.pool.max.size is not enough, turn it up." % (info["CurrentUseProcesses"], info["NodeProcessors"], pool_size, node_processors * pool_size))
            if info["RollsiteUsedPercent"] != 0:
                print_green("----------Rollsite memory use percent--------------------------------------------------")
                print_yellow("[WARNING] rollsite memory use: " + info["RollsiteUsedPercent"])
            print("\n")
    finally:
        session.kill()
示例#4
0
def check_actual_max_threads():
    """Print route, service and fate_flow job status for every node.

    Creates an ``ErSession``, runs ``getMemInfo`` via ``with_stores`` and
    prints one report for every entry of the result: the default exchange
    route from route_table.json, process count and memory of each FATE
    service, and per-job egg_pair process count and memory.  The session is
    killed in the ``finally`` clause whether or not the checks succeed.
    """

    def getMemInfo(fn):
        """Collect route, service and job information where this runs.

        ``fn`` is passed in by ``with_stores`` and is not used here.
        Returns a dict; ``job_thread`` and ``job_mem`` map each running job
        id to the corresponding command output.
        """

        def query_cmd(cmd):
            """Run ``cmd`` in a shell and return the first line of stdout."""
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE,
                shell=True).communicate()[0].decode().strip().split('\n')
            return p[0]

        def get_host_ip():
            """Return the local address used for a UDP socket to 8.8.8.8."""
            # The context manager closes the socket on every path; the
            # previous try/finally referenced the socket even when its
            # creation had failed.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(('8.8.8.8', 80))
                return s.getsockname()[0]

        fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py"
        mem_info = {}
        mem_info["Ip"] = get_host_ip()
        eggroll_home = query_cmd("echo $EGGROLL_HOME")
        route_file = eggroll_home + "/conf/route_table.json"
        try:
            with open(route_file, encoding='utf-8') as f:
                mem_info["route_table"] = json.load(f)
        except (OSError, ValueError):
            # Missing/unreadable file or invalid JSON: return an empty
            # table so the caller prints its "route is not configured"
            # error instead of the whole check failing.
            mem_info["route_table"] = {}
        mem_info["services"] = [
            'ClusterManagerBootstrap', 'NodeManagerBootstrap', 'rollsite',
            'fate_flow_server.py', 'fateboard', 'mysql'
        ]
        mem_info["job_run"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))
        mem_info["job_wait"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))
        # Space-separated ids of the running jobs on a single line.
        mem_info["jobs"] = query_cmd(
            "array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}"
            % (fate_flow_client))
        mem_info["job_thread"] = {}
        mem_info["job_mem"] = {}
        # Split the id line into ids (iterating the string itself would
        # yield single characters) and keep one result per job.
        for job_id in mem_info["jobs"].split():
            mem_info["job_thread"][job_id] = query_cmd(
                "ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %
                (job_id))
            # "grep -v grep" keeps the pipeline's own processes out of the
            # sum, matching the other queries.
            mem_info["job_mem"][job_id] = query_cmd(
                "ps aux |grep egg_pair |grep -v grep |grep %s |awk '{sum+=$6};END {print sum}'"
                % (job_id))
        mem_info["server_mem"] = {}
        mem_info["thread"] = {}
        for service in mem_info["services"]:
            mem_info["thread"][service] = query_cmd(
                "ps -ef |grep %s |grep -v grep |wc -l" % (service))
            mem_info["server_mem"][service] = str(
                query_cmd(
                    "ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'"
                    % (service)))
        return mem_info

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            # node[0] identifies the entry, node[1] is getMemInfo's dict.
            info = node[1]
            print_green("==============This is node " + str(node[0]) + ":" +
                        info["Ip"] +
                        "===========================================")
            print_green(
                "-------------default route check-------------------------------------------------------"
            )
            try:
                default_route = info["route_table"]['route_table'][
                    'default']['default'][0]
                ip = default_route['ip']
                port = default_route['port']
            except (KeyError, IndexError, TypeError):
                # Any missing level, an empty endpoint list or an
                # unexpected JSON shape counts as "not configured".
                print_red(
                    "[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!"
                )
            else:
                print_green("[OK] eggroll route configured!")
                print_green("exchange ip:{}, exchange port:{}".format(
                    ip, port))

            print_green(
                "--------------fate service check-------------------------------------------------------"
            )
            for server in info["services"]:
                if int(info["thread"][server]) > 0:
                    print_green(
                        "[OK] the " + server.ljust(23) +
                        " service is running , number of processes is : " +
                        str(info["thread"][server]) + "; used memory : " +
                        str(info["server_mem"][server]) + "KB.")
                else:
                    print_yellow(
                        "[WARNING] the " + server +
                        " service not running, please check service status.")

            print_green(
                "--------------fate_flow jobs process and mem info check--------------------------------------------------"
            )
            # job_run is -1 when fate_flow_client.py does not exist.
            if int(info["job_run"]) == -1:
                print_red(
                    "[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!"
                )
            else:
                print_green("[OK] Number of tasks running is " +
                            info["job_run"])
                print_green("[OK] Number of tasks waiting is " +
                            info["job_wait"])
                if int(info["job_run"]) > 0:
                    for job_id in info["jobs"].split():
                        print_green("[OK] running task job_id : " + job_id +
                                    ", number of egg_pair processes is : " +
                                    str(info["job_thread"][job_id]) +
                                    "; used memory : " +
                                    str(info["job_mem"][job_id]) + "KB.")

            print("\n")
    finally:
        session.kill()