def test_get_and_stop_and_kill_session(self):
    """Stop the context's session, then stop and kill it via fresh handles.

    The session obtained from ``self.ctx`` is stopped first.  Two new
    ``ErSession`` objects are then built from the same session id: the
    first is stopped again, the second is killed.
    """
    live_session = self.ctx.get_session()
    session_id = live_session.get_session_id()
    live_session.stop()

    from eggroll.core.session import ErSession

    # Each operation uses its own handle constructed from the id alone.
    ErSession(session_id).stop()
    ErSession(session_id).kill()
def get_debug_test_context(is_standalone=False,
                           manager_port=4670,
                           egg_port=20001,
                           transfer_port=20002,
                           session_id='testing'):
    """Build a RollPairContext wired to hand-specified local processors.

    Args:
        is_standalone: when true, the session deploy mode option is set to
            ``"standalone"``.
        manager_port: port used for both the cluster-manager and
            node-manager options, and for the roll processor's command
            endpoint.
        egg_port: command-endpoint port of the egg processor.
        transfer_port: transfer-service port, also used for the egg
            processor's transfer endpoint.
        session_id: id passed to ``ErSession``.

    Returns:
        A ``RollPairContext`` wrapping the newly created session.
    """
    host = "127.0.0.1"
    node_id = 2

    options = {}
    if is_standalone:
        options[SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE] = "standalone"
    options.update({
        TransferConfKeys.CONFKEY_TRANSFER_SERVICE_HOST: host,
        TransferConfKeys.CONFKEY_TRANSFER_SERVICE_PORT: str(transfer_port),
        ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT: str(manager_port),
        NodeManagerConfKeys.CONFKEY_NODE_MANAGER_PORT: str(manager_port),
    })

    egg_processor = ErProcessor(
        id=1,
        server_node_id=node_id,
        processor_type=ProcessorTypes.EGG_PAIR,
        status=ProcessorStatus.RUNNING,
        command_endpoint=ErEndpoint(host, egg_port),
        transfer_endpoint=ErEndpoint(host, transfer_port))
    # NOTE(review): the roll processor reuses id=1, same as the egg
    # processor -- confirm this is intended.
    roll_processor = ErProcessor(
        id=1,
        server_node_id=node_id,
        processor_type=ProcessorTypes.ROLL_PAIR_MASTER,
        status=ProcessorStatus.RUNNING,
        command_endpoint=ErEndpoint(host, manager_port))

    debug_session = ErSession(session_id,
                              processors=[egg_processor, roll_processor],
                              options=options)
    return RollPairContext(debug_session)
def __init__(self, roll_site_session_id, rp_ctx: RollPairContext, options: dict = None):
    """Set up the roll-site context from a session id, a context and options.

    Args:
        roll_site_session_id: id of the roll-site session; also used, with
            a ``"_push"`` suffix, as the id of the optional push session.
        rp_ctx: the RollPairContext whose session receives the exit tasks.
        options: configuration dict.  ``"self_role"``, ``"self_party_id"``
            and ``"proxy_endpoint"`` are read by subscript, so they must
            be present.  ``"proxy_endpoint"`` is either a ``"host:port"``
            string or an ``ErEndpoint``.

    Raises:
        ValueError: if ``"proxy_endpoint"`` is neither ``str`` nor
            ``ErEndpoint``.
    """
    options = {} if options is None else options

    self.roll_site_session_id = roll_site_session_id
    self.rp_ctx = rp_ctx

    push_enabled = RollSiteConfKeys.EGGROLL_ROLLSITE_PUSH_SESSION_ENABLED.get_with(options)
    self.push_session_enabled = push_enabled
    if not push_enabled:
        self._push_session = None
        self._push_rp_ctx = None
    else:
        # create session for push roll_pair and object
        self._push_session = ErSession(
            session_id=roll_site_session_id + "_push",
            options=rp_ctx.get_session().get_all_options())
        self._push_rp_ctx = RollPairContext(session=self._push_session)
        L.info(f"push_session={self._push_session.get_session_id()} enabled")

        def stop_push_session():
            # Looks up the attribute at call time, i.e. when the exit task runs.
            self._push_session.stop()

    self.role = options["self_role"]
    self.party_id = str(options["self_party_id"])
    self._options = options

    self._registered_comm_types = {}
    self.register_comm_type('grpc', RollSiteGrpc)

    endpoint = options["proxy_endpoint"]
    if isinstance(endpoint, str):
        pieces = endpoint.split(':')
        self.proxy_endpoint = ErEndpoint(host=pieces[0].strip(),
                                         port=int(pieces[1].strip()))
    elif isinstance(endpoint, ErEndpoint):
        self.proxy_endpoint = endpoint
    else:
        raise ValueError("endpoint only support str and ErEndpoint type")

    deploy_mode = RollSiteConfKeys.EGGROLL_ROLLSITE_DEPLOY_MODE.get_with(options)
    self.is_standalone = deploy_mode == "standalone"
    # if self.is_standalone:
    #     self.stub = None
    # else:
    #     channel = self.grpc_channel_factory.create_channel(self.proxy_endpoint)
    #     self.stub = proxy_pb2_grpc.DataTransferServiceStub(channel)

    self.pushing_latch = CountDownLatch(0)
    # Exit tasks are registered in this order: wait for pushes, then stop
    # the push session (when one exists).
    self.rp_ctx.get_session().add_exit_task(self._wait_push_complete)
    if self.push_session_enabled:
        self.rp_ctx.get_session().add_exit_task(stop_push_session)

    overall_timeout = RollSiteConfKeys.EGGROLL_ROLLSITE_PUSH_OVERALL_TIMEOUT_SEC.get_with(options)
    self._wait_push_exit_timeout = int(overall_timeout)

    L.info(f"inited RollSiteContext: {self.__dict__}")
def test_init_cluster(self):
    """Create a session from explicit deploy options and round-trip one key.

    All paths are derived from the ``EGGROLL_HOME`` environment variable,
    which must be set (``KeyError`` otherwise).  After the session and its
    ``RollPairContext`` are created, ``"k1" -> "v1"`` is stored in the
    ``("ns1", "n21")`` store and read back.
    """
    base_dir = os.environ['EGGROLL_HOME']

    options = {
        # Previously ``os.environ['EGGROLL_HOME'] / venv``: dividing a str
        # by an undefined name, which failed before any session was made.
        DeployConfKeys.CONFKEY_DEPLOY_ROLLPAIR_VENV_PATH: f'{base_dir}/venv',
        DeployConfKeys.CONFKEY_DEPLOY_ROLLPAIR_DATA_DIR_PATH: '/tmp/eggroll',
        ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST: 'localhost',
        ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT: '4670',
        DeployConfKeys.CONFKEY_DEPLOY_ROLLPAIR_PYTHON_PATH: f'{base_dir}/python',
        DeployConfKeys.CONFKEY_DEPLOY_ROLLPAIR_EGGPAIR_PATH: f'{base_dir}/python/eggroll/roll_pair/egg_pair.py',
        DeployConfKeys.CONFKEY_DEPLOY_JVM_MAINCLASS: 'com.webank.eggroll.rollpair.Main',
        DeployConfKeys.CONFKEY_DEPLOY_JVM_CLASSPATH: f'{base_dir}/jvm/roll_pair/target/lib/*:{base_dir}/jvm/roll_pair/target/eggroll-roll-pair-2.0.jar:{base_dir}/jvm/roll_pair/main/resources',
        SessionConfKeys.CONFKEY_SESSION_ID: 'testing',
        SessionConfKeys.CONFKEY_SESSION_PROCESSORS_PER_NODE: '1',
    }

    session = ErSession(session_id='test_init', options=options)
    context = RollPairContext(session)

    context.load("ns1", "n21").put("k1", "v1")
    print(context.load("ns1", "n21").get("k1"))
def get_standalone_context():
    """Create a standalone-mode ``ErSession``, print its id and return it.

    Note that, despite the name, the returned object is the session itself
    and not a context wrapping it.
    """
    standalone_session = ErSession(options={
        SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE: DeployModes.STANDALONE,
    })
    print(standalone_session.get_session_id())
    return standalone_session
def get_standalone_context(options=None):
    """Create a standalone-mode session and return a RollPairContext for it.

    Args:
        options: optional session options.  The deploy mode is always
            forced to ``DeployModes.STANDALONE``.  The caller's dict is
            copied rather than modified, so it can safely be reused (for
            example to build a non-standalone session afterwards).

    Returns:
        A ``RollPairContext`` wrapping the new session.  The session id is
        printed as a side effect.
    """
    # Work on a copy: writing the deploy mode into the caller's dict would
    # leak the standalone setting into every later use of that dict.
    effective_options = {} if options is None else dict(options)
    effective_options[SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE] = DeployModes.STANDALONE

    session = ErSession(options=effective_options)
    print(session.get_session_id())
    return RollPairContext(session)
def get_cluster_context(options=None):
    """Create a session from ``options`` and return a RollPairContext for it.

    Args:
        options: optional session options.  When it contains a
            ``'session_id'`` entry, that value is used as the session id;
            otherwise ``None`` is passed as the id.

    Returns:
        A ``RollPairContext`` wrapping the new session.  The session id is
        printed as a side effect.
    """
    options = {} if options is None else options
    requested_id = options.get('session_id')

    cluster_session = ErSession(session_id=requested_id, options=options)
    print(cluster_session.get_session_id())
    return RollPairContext(cluster_session)
# See the License for the specific language governing permissions and # limitations under the License. from eggroll.core.session import ErSession from eggroll.roll_paillier_tensor.roll_paillier_tensor import RptContext from eggroll.roll_pair.roll_pair import RollPairContext import roll_paillier_tensor as rpt_engine import unittest import pandas as pd from eggroll.core.io.kv_adapter import RocksdbSortedKvAdapter #mat = pd.read_csv("/data/home/qijunhuang/czn/code/Python_C_Paillier/pData/testMat_mpi.csv").values session = ErSession(options={"eggroll.deploy.mode": "standalone"}) rptc = RptContext(RollPairContext(session)) mat = pd.read_csv("/data/czn/data/testGemmMat.csv").values vec = pd.read_csv("/data/czn/data/testGemmVec.csv").values #test lr brest_G = pd.read_csv("/data/czn/data/breast_a_egr.csv").values brest_H = pd.read_csv("/data/czn/data/breast_b_egr.csv").values brest_Y = pd.read_csv("/data/czn/data/breast_b_y_egr.csv").values #mini brest_G_mini = pd.read_csv("/data/czn/data/breast_a_egr_mini.csv").values brest_H_mini = pd.read_csv("/data/czn/data/breast_b_egr_mini.csv").values brest_Y_mini = pd.read_csv("/data/czn/data/breast_b_y_egr_mini.csv").values brest_G_py = pd.read_csv("/data/czn/data/breast_a_egr_py.csv").values
def check_actual_max_threads():
    """Collect resource and limit information per store and print a report.

    A session is created with ``args.nodes`` processors per node,
    ``getMemInfo`` is run through ``with_stores`` and, for every returned
    ``(id, info)`` pair, a coloured report is printed: memory / swap /
    disk usage, kernel and file-descriptor limits (each expected to be
    greater than 65535), a thread-count check against the configured
    executor pool size, and the rollsite memory usage when a rollsite
    process was found.  The session is always killed at the end.
    """

    def getMemInfo(fn):
        """Gather the figures for the node this function executes on.

        ``fn`` is whatever ``with_stores`` passes in; it is not used.
        Helpers are deliberately nested so the function stays
        self-contained (presumably it is shipped to worker processes --
        confirm against ``with_stores``).
        """

        def query_cmd(cmd):
            # First line of the command's stdout, outer whitespace removed.
            stdout = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0]
            return stdout.decode().strip().split('\n')[0]

        def get_host_ip():
            # Local address the OS selects for a UDP socket "connected" to
            # 8.8.8.8.  The ``with`` block closes the socket and, unlike
            # the former try/finally, does not hit an unbound name when
            # socket creation itself fails.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(('8.8.8.8', 80))
                return s.getsockname()[0]

        mem = psutil.virtual_memory()
        swap_mem = psutil.swap_memory()
        data_disk = psutil.disk_usage('/data')

        mem_info = {
            "Ip": get_host_ip(),
            "MemTotal": round2(mem.total),
            "MemUsed": round2(mem.used),
            "MemUsedPCT": str(round(mem.percent)) + '%',
            "SwapTotal": round2(swap_mem.total),
            "SwapUsed": round2(swap_mem.used),
            "SwapUsePCT": str(round(swap_mem.percent)) + '%',
            "DiskTotal": round2(data_disk.total),
            "DiskUsed": round2(data_disk.used),
            "DiskUsedPCT": str(round(data_disk.percent)) + '%',
        }

        mem_info["/proc/sys/kernel/threads-max"] = query_cmd("cat /proc/sys/kernel/threads-max")
        # The two /etc/sysctl.conf settings used to share one dict key, so
        # the kernel.pid_max value was overwritten by fs.file-max and the
        # latter was reported twice.  Each now has its own key.
        mem_info["/etc/sysctl.conf (kernel.pid_max)"] = query_cmd("grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/kernel/pid_max"] = query_cmd("cat /proc/sys/kernel/pid_max")
        mem_info["/proc/sys/vm/max_map_count"] = query_cmd("cat /proc/sys/vm/max_map_count")
        mem_info["/etc/security/limits.conf"] = query_cmd("cat /etc/security/limits.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd("cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/sysctl.conf (fs.file-max)"] = query_cmd("grep fs.file-max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/fs/file-max"] = query_cmd("cat /proc/sys/fs/file-max")

        mem_info["CurrentUseProcesses"] = query_cmd("pstree -p `ps -e |grep egg_pair |awk '{print $1}'` |wc -l")
        mem_info["NodeProcessors"] = query_cmd("grep eggroll.session.processors.per.node ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")
        mem_info["PoolSize"] = query_cmd("grep eggroll.rollpair.eggpair.server.executor.pool.max.size ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")

        rollsite_pid = query_cmd("ps aux | grep ${EGGROLL_HOME} | grep com.webank.eggroll.rollsite.Proxy | grep -v grep | awk '{print $2}'")
        if rollsite_pid:
            rollsite_used_memory = psutil.Process(int(rollsite_pid)).memory_info().rss
            # ``with`` guarantees the handle is closed even if parsing fails.
            with open(sys.path[1] + '/../../../conf/eggroll.properties') as properties_file:
                properties = properties_file.read()
            jvm_options = re.findall(r"(?<=MaxHeapSize=).*?(?=G)", properties)
            if jvm_options:
                rollsite_total_memory = int(jvm_options[0]) * 1024 * 1024 * 1024
            else:
                # No MaxHeapSize found: fall back to total physical memory.
                rollsite_total_memory = mem.total
            # NOTE(review): the denominator is four times the total; the
            # reason for the factor is not visible here -- confirm.
            mem_info["RollsiteUsedPercent"] = '{:.2%}'.format(rollsite_used_memory / (rollsite_total_memory * 4))
        else:
            # 0 (an int, not a string) marks "no rollsite process found".
            mem_info["RollsiteUsedPercent"] = 0
        return mem_info

    limit_keys = [
        "/proc/sys/kernel/threads-max",
        "/etc/sysctl.conf (kernel.pid_max)",
        "/proc/sys/kernel/pid_max",
        "/proc/sys/vm/max_map_count",
        "/etc/security/limits.conf",
        "/etc/security/limits.d/80-nofile.conf",
        "/etc/sysctl.conf (fs.file-max)",
        "/proc/sys/fs/file-max",
    ]

    session = ErSession(options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            node_id, info = node[0], node[1]
            print_green("==============This is node " + str(node_id) + ":" + info["Ip"] + "===========================================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] + "G, MemUsed:" + info["MemUsed"] + "G, MemUsedPCT:" + info["MemUsedPCT"])
            if float(info["SwapTotal"]) < 128:
                print_red("[ERROR] The swap memory is:" + info["SwapTotal"] + "G, no less than 128G.")
            else:
                print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] + "G, SwapUsed:" + info["SwapUsed"] + "G, SwapUsePCT:" + info["SwapUsePCT"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] + "G, DiskUsed:" + info["DiskUsed"] + "G, DiskUsedPCT:" + info["DiskUsedPCT"])

            print_green("--------------Max user processes and max file count----------------------------------------")
            for key in limit_keys:
                value = info[key]
                # A missing file or setting yields an empty or non-numeric
                # string; report it as a failed check instead of letting
                # int() abort the whole report.
                try:
                    limit_ok = int(value) > 65535
                except ValueError:
                    limit_ok = False
                if limit_ok:
                    print_green("[OK] " + key + " = " + value)
                else:
                    print_red("[ERROR] please check " + key + " = " + value + ", no less than 65535.")

            print_green("--------------Thread count check-----------------------------------------------------------")
            pool_size = info["PoolSize"]
            if len(pool_size) == 0:
                # Pool size not configured: 500 is used for the comparison.
                pool_size = 500
            total_threads = int(info["NodeProcessors"]) * int(pool_size)
            summary = "The thread count = %s, the total processes = %s * %s = %i" % (
                info["CurrentUseProcesses"], info["NodeProcessors"], pool_size, total_threads)
            if int(info["CurrentUseProcesses"]) < total_threads:
                print_green("[OK] " + summary)
            else:
                print_red("[ERROR] " + summary + ". eggroll.rollpair.eggpair.server.executor.pool.max.size is not enough, turn it up.")

            if info["RollsiteUsedPercent"] != 0:
                print_green("----------Rollsite memory use percent--------------------------------------------------")
                print_yellow("[WARNING] rollsite memory use: " + info["RollsiteUsedPercent"])
            print("\n")
    finally:
        session.kill()
def test_init(self):
    """Open a standalone session and print the value stored under "k1".

    The value is read from the ``("ns1", "n21")`` store.  Nothing is
    written here; the corresponding ``put`` is left disabled below.
    """
    standalone_options = {"eggroll.deploy.mode": "standalone"}
    rp_context = RollPairContext(ErSession(options=standalone_options))
    # Alternative kept from the original: session = ErSession()
    # Disabled seed step: rp_context.load("ns1", "n21").put("k1", "v1")
    print(rp_context.load("ns1", "n21").get("k1"))
def check_actual_max_threads():
    """Check memory usage and kernel/file limits per store and print a report.

    A session is created with ``args.nodes`` processors per node,
    ``getMemInfo`` is run through ``with_stores`` and, for every returned
    ``(id, info)`` pair, memory / swap / disk usage is printed followed by
    an OK / ERROR line for each limit.  The session is always killed at
    the end.
    """

    def getMemInfo(fn):
        """Gather the figures for the node this function executes on.

        ``fn`` is whatever ``with_stores`` passes in; it is not used.
        Each limit entry is a bool: ``True`` when every value the command
        printed is at least 65535.
        """

        def query_cmd(cmd):
            stdout = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, shell=True).communicate()[0]
            lines = stdout.decode().strip().split('\n')
            print(lines)
            # A missing file or setting produces empty or non-numeric
            # output.  That used to raise ValueError and abort the whole
            # check; it now counts as a failed limit.
            try:
                return all(int(line) >= 65535 for line in lines)
            except ValueError:
                return False

        mem = psutil.virtual_memory()
        swap_mem = psutil.swap_memory()
        data_disk = psutil.disk_usage('/data')

        mem_info = {
            "MemTotal": round2(mem.total),
            "MemUsed": round2(mem.used),
            "MemUsedPer": str(round(mem.percent)) + '%',
            "SwapTotal": round2(swap_mem.total),
            "SwapUsed": round2(swap_mem.used),
            "SwapUsePer": str(round(swap_mem.percent)) + '%',
            "DiskTotal": round2(data_disk.total),
            "DiskUsed": round2(data_disk.used),
            "DiskPer": str(round(data_disk.percent)) + '%',
        }

        mem_info["/proc/sys/kernel/threads-max"] = query_cmd(
            "cat /proc/sys/kernel/threads-max")
        # The two /etc/sysctl.conf settings used to share one dict key, so
        # the kernel.pid_max result was overwritten by fs.file-max.
        mem_info["/etc/sysctl.conf (kernel.pid_max)"] = query_cmd(
            "grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/kernel/pid_max"] = query_cmd(
            "cat /proc/sys/kernel/pid_max")
        mem_info["/proc/sys/vm/max_map_count"] = query_cmd(
            "cat /proc/sys/vm/max_map_count")
        mem_info["/etc/security/limits.conf"] = query_cmd(
            "cat /etc/security/limits.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd(
            "cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/sysctl.conf (fs.file-max)"] = query_cmd(
            "grep fs.file-max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/fs/file-max"] = query_cmd(
            "cat /proc/sys/fs/file-max")
        return mem_info

    limit_keys = [
        "/proc/sys/kernel/threads-max",
        "/etc/sysctl.conf (kernel.pid_max)",
        "/proc/sys/kernel/pid_max",
        "/proc/sys/vm/max_map_count",
        "/etc/security/limits.conf",
        "/etc/security/limits.d/80-nofile.conf",
        "/etc/sysctl.conf (fs.file-max)",
        "/proc/sys/fs/file-max",
    ]

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            node_id, info = node[0], node[1]
            print_green("==============This is node :" + str(node_id) +
                        "================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] +
                         ", MemUsed:" + info["MemUsed"] +
                         ", MemUsedPer:" + info["MemUsedPer"])
            print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] +
                         ", SwapUsed:" + info["SwapUsed"] +
                         ", SwapUsePer:" + info["SwapUsePer"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] +
                         ", DiskUsed:" + info["DiskUsed"] +
                         ", DiskPer:" + info["DiskPer"])
            print_green(
                "--------Max user processes and max file count--------")
            for key in limit_keys:
                if info[key]:
                    print_green("[OK] " + key + " is ok.")
                else:
                    print_red("[ERROR] please check " + key +
                              ", no less than 65535.")
            print("\n")
    finally:
        session.kill()
def check_actual_max_threads():
    """Report route configuration, service status and running jobs per store.

    A session is created with ``args.nodes`` processors per node,
    ``getMemInfo`` is run through ``with_stores`` and, for every returned
    ``(id, info)`` pair, three sections are printed: the default route of
    the route table, process count and memory for each listed service,
    and per-job egg_pair process count and memory for running fate_flow
    jobs.  The session is always killed at the end.
    """

    def getMemInfo(fn):
        """Gather the figures for the node this function executes on.

        ``fn`` is whatever ``with_stores`` passes in; it is not used.
        """

        def query_cmd(cmd):
            # First line of the command's stdout, outer whitespace removed.
            stdout = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, shell=True).communicate()[0]
            return stdout.decode().strip().split('\n')[0]

        def get_host_ip():
            # Local address the OS selects for a UDP socket "connected" to
            # 8.8.8.8.  The ``with`` block closes the socket and, unlike
            # the former try/finally, does not hit an unbound name when
            # socket creation itself fails.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(('8.8.8.8', 80))
                return s.getsockname()[0]

        fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py"
        mem_info = {}
        mem_info["Ip"] = get_host_ip()

        eggroll_home = query_cmd("echo $EGGROLL_HOME")
        route_file = eggroll_home + "/conf/route_table.json"
        # The file used to be opened without ever being closed.
        with open(route_file, encoding='utf-8') as f:
            mem_info["route_table"] = json.load(f)

        mem_info["services"] = [
            'ClusterManagerBootstrap', 'NodeManagerBootstrap', 'rollsite',
            'fate_flow_server.py', 'fateboard', 'mysql'
        ]

        # Both counts are "-1" when the fate_flow client script is missing.
        mem_info["job_run"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))
        mem_info["job_wait"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))

        # Space-separated ids of the running jobs, as one string.
        mem_info["jobs"] = query_cmd(
            "array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}"
            % (fate_flow_client))

        # Previously this loop iterated the *characters* of the id string
        # and overwrote a single value each time.  Results are now stored
        # per job id.
        mem_info["job_thread"] = {}
        mem_info["job_mem"] = {}
        for job_id in mem_info["jobs"].split():
            mem_info["job_thread"][job_id] = query_cmd(
                "ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %
                (job_id))
            mem_info["job_mem"][job_id] = query_cmd(
                "ps aux |grep egg_pair |grep %s |awk '{sum+=$6};END {print sum}'"
                % (job_id))

        mem_info["server_mem"] = {}
        mem_info["thread"] = {}
        for service in mem_info["services"]:
            mem_info["thread"][service] = query_cmd(
                "ps -ef |grep %s |grep -v grep |wc -l" % (service))
            mem_info["server_mem"][service] = str(
                query_cmd(
                    "ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'"
                    % (service)))
        return mem_info

    route_error = "[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!"

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            node_id, info = node[0], node[1]
            print_green("==============This is node " + str(node_id) + ":" + info["Ip"] + "===========================================")

            print_green(
                "-------------default route check-------------------------------------------------------"
            )
            route_table_dict = info["route_table"]
            if 'default' not in route_table_dict['route_table']:
                print_red(route_error)
            else:
                try:
                    default_route = route_table_dict['route_table']['default']['default'][0]
                    ip = default_route['ip']
                    port = default_route['port']
                except KeyError:
                    print_red(route_error)
                else:
                    print_green("[OK] eggroll route configured!")
                    print_green("exchange ip:{}, exchange port:{}".format(ip, port))

            print_green(
                "--------------fate service check-------------------------------------------------------"
            )
            for server in info["services"]:
                if int(info["thread"][server]) > 0:
                    print_green(
                        "[OK] the " + server.ljust(23) +
                        " service is running , number of processes is : " +
                        str(info["thread"][server]) + "; used memory : " +
                        str(info["server_mem"][server]) + "KB.")
                else:
                    print_yellow(
                        "[WARNING] the " + server +
                        " service not running, please check service status.")

            print_green(
                "--------------fate_flow jobs process and mem info check--------------------------------------------------"
            )
            if int(info["job_run"]) == -1:
                print_red(
                    "[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!"
                )
            else:
                print_green("[OK] Number of tasks running is " + info["job_run"])
                print_green("[OK] Number of tasks waiting is " + info["job_wait"])
                if int(info["job_run"]) > 0:
                    # Same whitespace split as on the collecting side, so
                    # every id has matching per-job entries.
                    for job_id in info["jobs"].split():
                        print_green("[OK] running task job_id : " + job_id +
                                    ", number of egg_pair processes is : " +
                                    str(info["job_thread"][job_id]) +
                                    "; used memory : " +
                                    str(info["job_mem"][job_id]) + "KB.")
            print("\n")
    finally:
        session.kill()
def get_cluster_context(options=None):
    """Create an ``ErSession`` from ``options``, print its id and return it.

    Args:
        options: optional session options; an empty dict is used when
            omitted.

    Note that, despite the name, the returned object is the session itself
    and not a context wrapping it.
    """
    cluster_session = ErSession(options={} if options is None else options)
    print(cluster_session.get_session_id())
    return cluster_session