예제 #1
0
    def __init__(self,
                 roll_site_session_id,
                 rp_ctx: RollPairContext,
                 options: dict = None):
        """Set up a roll site context on top of an existing roll pair context.

        Args:
            roll_site_session_id: id of the roll site session; it is also the
                prefix of the optional push session id (``<id>_push``).
            rp_ctx: roll pair context whose session receives the exit tasks
                registered here.
            options: configuration dict. ``"self_role"``, ``"self_party_id"``
                and ``"proxy_endpoint"`` are read with plain indexing, so they
                must be present. ``None`` is treated as an empty dict.

        Raises:
            KeyError: if one of the required options is missing.
            ValueError: if ``"proxy_endpoint"`` is neither a ``str`` nor an
                ``ErEndpoint``.
        """
        options = {} if options is None else options
        self.roll_site_session_id = roll_site_session_id
        self.rp_ctx = rp_ctx

        push_enabled_key = RollSiteConfKeys.EGGROLL_ROLLSITE_PUSH_SESSION_ENABLED
        self.push_session_enabled = push_enabled_key.get_with(options)
        if not self.push_session_enabled:
            self._push_session = None
            self._push_rp_ctx = None
        else:
            # A separate session is created for pushing roll pairs and
            # objects; it reuses the options of the main session.
            push_session_id = roll_site_session_id + "_push"
            main_session_options = rp_ctx.get_session().get_all_options()
            self._push_session = ErSession(session_id=push_session_id,
                                           options=main_session_options)
            self._push_rp_ctx = RollPairContext(session=self._push_session)
            L.info(
                f"push_session={self._push_session.get_session_id()} enabled")

            def stop_push_session():
                self._push_session.stop()

        self.role = options["self_role"]
        self.party_id = str(options["self_party_id"])
        self._options = options

        # The registry must exist before register_comm_type() writes to it.
        self._registered_comm_types = dict()
        self.register_comm_type('grpc', RollSiteGrpc)

        endpoint = options["proxy_endpoint"]
        if isinstance(endpoint, ErEndpoint):
            self.proxy_endpoint = endpoint
        elif isinstance(endpoint, str):
            # Expected form is "host:port"; only the first two fields are used.
            pieces = endpoint.split(':')
            host = pieces[0].strip()
            port = int(pieces[1].strip())
            self.proxy_endpoint = ErEndpoint(host=host, port=port)
        else:
            raise ValueError("endpoint only support str and ErEndpoint type")

        deploy_mode = RollSiteConfKeys.EGGROLL_ROLLSITE_DEPLOY_MODE.get_with(
            options)
        self.is_standalone = deploy_mode == "standalone"
        # NOTE: a gRPC stub used to be created here for non-standalone mode;
        # that code path is disabled.

        self.pushing_latch = CountDownLatch(0)
        # Exit tasks are added in this order: wait for pending pushes first,
        # then stop the push session (when there is one).
        self.rp_ctx.get_session().add_exit_task(self._wait_push_complete)
        if self.push_session_enabled:
            self.rp_ctx.get_session().add_exit_task(stop_push_session)
        overall_timeout = RollSiteConfKeys.EGGROLL_ROLLSITE_PUSH_OVERALL_TIMEOUT_SEC.get_with(
            options)
        self._wait_push_exit_timeout = int(overall_timeout)

        L.info(f"inited RollSiteContext: {self.__dict__}")
예제 #2
0
def get_standalone_context():
    """Create an ErSession in standalone deploy mode and return it.

    The new session's id is printed to stdout. Despite the function name,
    the returned object is the session itself, not a roll pair context.
    """
    session = ErSession(options={
        SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE: DeployModes.STANDALONE,
    })
    print(session.get_session_id())
    return session
예제 #3
0
def get_standalone_context(options=None):
    """Create a standalone-mode session and wrap it in a RollPairContext.

    Args:
        options: optional session options. The deploy mode entry is always
            set to ``DeployModes.STANDALONE``. The mapping passed in is
            copied first, so the caller's object is left untouched.

    Returns:
        A ``RollPairContext`` built on the new session. The session id is
        printed to stdout.
    """
    # Work on a copy: forcing the deploy mode must not leak into a dict the
    # caller may reuse for other (non-standalone) sessions.
    session_options = {} if options is None else dict(options)
    session_options[SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE] = DeployModes.STANDALONE

    session = ErSession(options=session_options)
    print(session.get_session_id())
    return RollPairContext(session)
예제 #4
0
 def __init__(self, session: ErSession):
     """Bind this context to ``session`` and install the default settings.

     The default store type is ``StoreTypes.ROLLPAIR_LMDB`` and the default
     serdes is ``SerdesTypes.PICKLE``. ``self.context_gc`` is registered as
     an exit task of the session.
     """
     self.__session = session
     self.session_id = session.get_session_id()

     # Store defaults.
     self.default_store_type = StoreTypes.ROLLPAIR_LMDB
     self.default_store_serdes = SerdesTypes.PICKLE

     deploy_mode_key = SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE
     self.deploy_mode = session.get_option(deploy_mode_key)
     self.__session_meta = session.get_session_meta()

     # context_gc runs when the session exits.
     session.add_exit_task(self.context_gc)
     self.rpc_gc_enable = True
     # Created last so the recorder receives a context whose other
     # attributes are already set.
     self.gc_recorder = GcRecorder(self)
예제 #5
0
def get_cluster_context(options=None):
    """Create a session from ``options`` and return a RollPairContext on it.

    Args:
        options: optional session options. When it carries a
            ``'session_id'`` entry that value is used as the session id;
            otherwise ``None`` is passed as the session id.

    Returns:
        A ``RollPairContext`` wrapping the new session. The session id is
        printed to stdout.
    """
    options = {} if options is None else options

    requested_id = options.get('session_id')
    session = ErSession(session_id=requested_id, options=options)
    print(session.get_session_id())
    return RollPairContext(session)
예제 #6
0
 def __init__(self, session: ErSession):
     """Bind this context to an ACTIVE session.

     The default store type is looked up by name on ``StoreTypes``, using the
     value configured under
     ``RollPairConfKeys.EGGROLL_ROLLPAIR_DEFAULT_STORE_TYPE``.

     Raises:
         Exception: if the session's status is not ``SessionStatus.ACTIVE``.
         ValueError: if the configured store type name does not resolve to a
             truthy attribute of ``StoreTypes``.
     """
     if session.get_session_meta()._status != SessionStatus.ACTIVE:
         raise Exception(f"session_id={session.get_session_id()} is not ACTIVE. current status={session.get_session_meta()._status}")

     self.__session = session
     self.session_id = session.get_session_id()

     store_type_key = RollPairConfKeys.EGGROLL_ROLLPAIR_DEFAULT_STORE_TYPE
     default_store_type_str = store_type_key.get_with(session.get_all_options())
     # Unknown names resolve to None and are rejected right below.
     self.default_store_type = getattr(StoreTypes, default_store_type_str, None)
     if not self.default_store_type:
         raise ValueError(f'store type "{default_store_type_str}" not found for roll pair')

     self.default_store_serdes = SerdesTypes.PICKLE
     deploy_mode_key = SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE
     self.deploy_mode = session.get_option(deploy_mode_key)
     self.__session_meta = session.get_session_meta()

     # context_gc runs when the session exits.
     session.add_exit_task(self.context_gc)
     self.rpc_gc_enable = True
     self.gc_recorder = GcRecorder(self)
     self.__command_client = CommandClient()
예제 #7
0
def get_debug_test_context(is_standalone=False, manager_port=4670, egg_port=20001, transfer_port=20002, session_id='testing'):
    """Build a RollPairContext wired to locally running debug processes.

    One egg pair processor and one roll pair master processor are described,
    both on 127.0.0.1, and handed to a new ``ErSession``.

    Args:
        is_standalone: when true, the session deploy mode option is set to
            ``"standalone"``.
        manager_port: port used for the cluster manager and node manager
            options and for the roll processor's command endpoint.
        egg_port: command port of the egg processor.
        transfer_port: port used for the transfer service option and for the
            egg processor's transfer endpoint.
        session_id: id passed to ``ErSession``.

    Returns:
        A ``RollPairContext`` on the new session.
    """
    local_host = "127.0.0.1"
    node_id = 2

    options = {}
    if is_standalone:
        options[SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE] = "standalone"
    # Port options are stored as strings.
    options.update({
        TransferConfKeys.CONFKEY_TRANSFER_SERVICE_HOST: local_host,
        TransferConfKeys.CONFKEY_TRANSFER_SERVICE_PORT: str(transfer_port),
        ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT: str(manager_port),
        NodeManagerConfKeys.CONFKEY_NODE_MANAGER_PORT: str(manager_port),
    })

    egg_processor = ErProcessor(
        id=1,
        server_node_id=node_id,
        processor_type=ProcessorTypes.EGG_PAIR,
        status=ProcessorStatus.RUNNING,
        command_endpoint=ErEndpoint(local_host, egg_port),
        transfer_endpoint=ErEndpoint(local_host, transfer_port))

    roll_processor = ErProcessor(
        id=1,
        server_node_id=node_id,
        processor_type=ProcessorTypes.ROLL_PAIR_MASTER,
        status=ProcessorStatus.RUNNING,
        command_endpoint=ErEndpoint(local_host, manager_port))

    session = ErSession(session_id,
                        processors=[egg_processor, roll_processor],
                        options=options)
    return RollPairContext(session)
예제 #8
0
 def __init__(self, session: ErSession):
     """Bind this context to an ACTIVE session and install default settings.

     The default store type is ``StoreTypes.ROLLPAIR_LMDB`` and the default
     serdes is ``SerdesTypes.PICKLE``. ``self.context_gc`` is registered as
     an exit task of the session.

     Raises:
         Exception: if the session's status is not ``SessionStatus.ACTIVE``.
     """
     session_is_active = session.get_session_meta()._status == SessionStatus.ACTIVE
     if not session_is_active:
         raise Exception(
             f"session:{session.get_session_id()} is not available, init first!"
         )

     self.__session = session
     self.session_id = session.get_session_id()

     # Store defaults.
     self.default_store_type = StoreTypes.ROLLPAIR_LMDB
     self.default_store_serdes = SerdesTypes.PICKLE

     deploy_mode_key = SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE
     self.deploy_mode = session.get_option(deploy_mode_key)
     self.__session_meta = session.get_session_meta()

     # context_gc runs when the session exits.
     session.add_exit_task(self.context_gc)
     self.rpc_gc_enable = True
     self.gc_recorder = GcRecorder(self)
     self.__command_client = CommandClient()
예제 #9
0
    def test_init_cluster(self):
        """Start a cluster session from explicit deploy options and round-trip one key.

        Every path is derived from the ``EGGROLL_HOME`` environment variable
        (a ``KeyError`` is raised when it is unset). After the session is
        created, ``"k1" -> "v1"`` is written to store ``ns1/n21`` and the
        value read back is printed.
        """
        base_dir = os.environ['EGGROLL_HOME']

        options = {}
        # The environment value is a plain str, so the path is built by
        # formatting rather than with the "/" operator.
        # NOTE(review): assumes the virtualenv lives at <EGGROLL_HOME>/venv;
        # confirm against the deployment layout.
        options[DeployConfKeys.CONFKEY_DEPLOY_ROLLPAIR_VENV_PATH] = f'{base_dir}/venv'
        options[DeployConfKeys.
                CONFKEY_DEPLOY_ROLLPAIR_DATA_DIR_PATH] = '/tmp/eggroll'
        options[
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST] = 'localhost'
        options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = '4670'

        options[DeployConfKeys.
                CONFKEY_DEPLOY_ROLLPAIR_PYTHON_PATH] = f'{base_dir}/python'
        options[
            DeployConfKeys.
            CONFKEY_DEPLOY_ROLLPAIR_EGGPAIR_PATH] = f'{base_dir}/python/eggroll/roll_pair/egg_pair.py'
        options[
            DeployConfKeys.
            CONFKEY_DEPLOY_JVM_MAINCLASS] = 'com.webank.eggroll.rollpair.Main'
        options[
            DeployConfKeys.
            CONFKEY_DEPLOY_JVM_CLASSPATH] = f'{base_dir}/jvm/roll_pair/target/lib/*:{base_dir}/jvm/roll_pair/target/eggroll-roll-pair-2.0.jar:{base_dir}/jvm/roll_pair/main/resources'
        options[SessionConfKeys.CONFKEY_SESSION_ID] = 'testing'
        options[SessionConfKeys.CONFKEY_SESSION_PROCESSORS_PER_NODE] = '1'

        session = ErSession(session_id='test_init', options=options)
        context = RollPairContext(session)

        context.load("ns1", "n21").put("k1", "v1")
        print(context.load("ns1", "n21").get("k1"))
예제 #10
0
    def test_get_and_stop_and_kill_session(self):
        """Stop the live session, then stop and kill it again through fresh handles.

        After the context's own session is stopped, two new ``ErSession``
        objects are created for the same id: ``stop()`` is called on the
        first and ``kill()`` on the second.
        """
        live_session = self.ctx.get_session()
        session_id = live_session.get_session_id()

        live_session.stop()

        from eggroll.core.session import ErSession
        # A new handle is created for each call on the already-stopped session.
        for shutdown_method in ("stop", "kill"):
            dead_session = ErSession(session_id)
            getattr(dead_session, shutdown_method)()
예제 #11
0
    def __init__(self, session: ErSession):
        """Bind this context to an ACTIVE session and publish its eggs to the stores.

        Besides the usual defaults, this loads (creating it if missing) a
        ``ROLLPAIR_CACHE`` roll pair named after the session id in namespace
        ``er_session_meta``, with one partition per egg, and runs a task on
        its stores that records the session's eggs under the runtime storage
        key ``"__eggs"``.

        Raises:
            Exception: if the session's status is not ``SessionStatus.ACTIVE``.
            ValueError: if the configured default store type name does not
                resolve to a truthy attribute of ``StoreTypes``.
        """
        if session.get_session_meta()._status != SessionStatus.ACTIVE:
            raise Exception(
                f"session_id={session.get_session_id()} is not ACTIVE. current status={session.get_session_meta()._status}"
            )
        self.__session = session
        self.session_id = session.get_session_id()
        default_store_type_str = RollPairConfKeys.EGGROLL_ROLLPAIR_DEFAULT_STORE_TYPE.get_with(
            session.get_all_options())
        # Unknown names resolve to None and are rejected right below.
        self.default_store_type = getattr(StoreTypes, default_store_type_str,
                                          None)
        if not self.default_store_type:
            raise ValueError(
                f'store type "{default_store_type_str}" not found for roll pair'
            )
        # The configured value is stored as-is. A former follow-up check
        # re-tested default_store_type (already validated above) and so could
        # never fire; it was removed rather than guessed into a new
        # validation rule for in_memory_output.
        self.in_memory_output = RollPairConfKeys.EGGROLL_ROLLPAIR_IN_MEMORY_OUTPUT.get_with(
            session.get_all_options())

        self.default_store_serdes = SerdesTypes.PICKLE
        self.__session_meta = session.get_session_meta()
        self.__session.add_exit_task(self.context_gc)
        self.rpc_gc_enable = True
        self.gc_recorder = GcRecorder(self)
        self.__command_client = CommandClient()

        self.session_default_rp = self.load(name=self.session_id,
                                            namespace='er_session_meta',
                                            options={
                                                'total_partitions':
                                                session.get_eggs_count(),
                                                'store_type':
                                                StoreTypes.ROLLPAIR_CACHE,
                                                'create_if_missing':
                                                True
                                            })
        eggs = session.get_eggs()

        def _broadcast_eggs(task: ErTask):
            # Runs through with_stores(); stores the captured eggs under the
            # "__eggs" runtime storage key.
            from eggroll.core.utils import add_runtime_storage
            _input = task._inputs[0]
            add_runtime_storage("__eggs", eggs)
            L.debug(f"runtime_storage={get_runtime_storage('__eggs')}")

        self.session_default_rp.with_stores(func=_broadcast_eggs)
예제 #12
0
파일: env_check.py 프로젝트: xkazm/FATE
def check_actual_max_threads():
    """Collect resource and limit figures from every store and print a report.

    A session is created with ``args.nodes`` processors per node, a small
    roll pair is parallelized over ``args.partitions`` partitions, and
    ``getMemInfo`` is run through ``with_stores``. For each result the
    memory, swap and disk usage, the process/file limits, the egg_pair thread
    count and (when a rollsite process was found) the rollsite memory ratio
    are printed. The session is always killed at the end.
    """
    def getMemInfo(fn):
        """Gather the figures of the local node.

        ``fn`` is the argument supplied by ``with_stores``; it is not used.
        Returns a dict of display-ready values keyed by label.
        """
        def query_cmd(cmd):
            # Only the first line of the command's stdout is kept.
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0].decode().strip().split('\n')
            return p[0]

        def get_host_ip():
            # getsockname() on a socket connected towards 8.8.8.8 reports the
            # local address used for that route. The with-block closes the
            # socket and, unlike a try/finally around the constructor, cannot
            # hit an unbound name when socket creation itself fails.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(('8.8.8.8', 80))
                return s.getsockname()[0]

        mem = psutil.virtual_memory()
        mem_total = round2(mem.total)
        mem_used = round2(mem.used)
        mem_used_per = str(round(mem.percent)) + '%'

        swap_mem = psutil.swap_memory()
        swap_total = round2(swap_mem.total)
        swap_used = round2(swap_mem.used)
        swap_use_per = str(round(swap_mem.percent)) + '%'

        data_disk = psutil.disk_usage('/data')
        disk_total = round2(data_disk.total)
        disk_used = round2(data_disk.used)
        disk_per = str(round(data_disk.percent)) + '%'

        mem_info = {}
        mem_info["Ip"] = get_host_ip()
        mem_info["MemTotal"] = mem_total
        mem_info["MemUsed"] = mem_used
        mem_info["MemUsedPCT"] = mem_used_per

        mem_info["SwapTotal"] = swap_total
        mem_info["SwapUsed"] = swap_used
        mem_info["SwapUsePCT"] = swap_use_per

        mem_info["DiskTotal"] = disk_total
        mem_info["DiskUsed"] = disk_used
        mem_info["DiskUsedPCT"] = disk_per

        mem_info["/proc/sys/kernel/threads-max"] = query_cmd("cat /proc/sys/kernel/threads-max")
        # Two different settings are read from /etc/sysctl.conf; each gets
        # its own key so the second no longer overwrites the first.
        mem_info["/etc/sysctl.conf kernel.pid_max"] = query_cmd("grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/kernel/pid_max"] = query_cmd("cat /proc/sys/kernel/pid_max")
        mem_info["/proc/sys/vm/max_map_count"] = query_cmd("cat /proc/sys/vm/max_map_count")

        mem_info["/etc/security/limits.conf"] = query_cmd("cat /etc/security/limits.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd("cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/sysctl.conf fs.file-max"] = query_cmd("grep fs.file-max  /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/fs/file-max"] = query_cmd("cat /proc/sys/fs/file-max")

        mem_info["CurrentUseProcesses"] = query_cmd("pstree -p `ps -e |grep egg_pair |awk '{print $1}'` |wc -l")
        mem_info["NodeProcessors"] = query_cmd("grep eggroll.session.processors.per.node ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")
        mem_info["PoolSize"] = query_cmd("grep eggroll.rollpair.eggpair.server.executor.pool.max.size ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")

        rollsite_pid = query_cmd("ps aux | grep ${EGGROLL_HOME} | grep com.webank.eggroll.rollsite.Proxy | grep -v grep | awk '{print $2}'")
        if rollsite_pid:
            rollsite_used_memory = psutil.Process(int(rollsite_pid)).memory_info().rss
            # The with-block closes the file even when reading fails.
            with open(sys.path[1] + '/../../../conf/eggroll.properties') as myfile:
                properties = myfile.read()
            # Heap size in G taken from a "MaxHeapSize=<n>G" entry; the total
            # physical memory is used when no such entry exists.
            jvm_options = re.findall(r"(?<=MaxHeapSize=).*?(?=G)", properties)
            if jvm_options:
                rollsite_total_memory = int(jvm_options[0]) * 1024 * 1024 * 1024
            else:
                rollsite_total_memory = mem.total

            # NOTE(review): the reason for the factor 4 in the denominator is
            # not visible here - confirm.
            mem_info["RollsiteUsedPercent"] = '{:.2%}'.format(rollsite_used_memory / (rollsite_total_memory * 4))
        else:
            mem_info["RollsiteUsedPercent"] = 0

        return mem_info

    session = ErSession(options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            info = node[1]
            print_green("==============This is node " + str(node[0]) + ":" + info["Ip"] + "===========================================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] + "G, MemUsed:" + info["MemUsed"] + "G, MemUsedPCT:" + info["MemUsedPCT"])
            if float(info["SwapTotal"]) < 128:
                print_red("[ERROR] The swap memory is:" + info["SwapTotal"] + "G, no less than 128G.")
            else:
                print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] + "G, SwapUsed:" + info["SwapUsed"] + "G, SwapUsePCT:" + info["SwapUsePCT"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] + "G, DiskUsed:" + info["DiskUsed"] + "G, DiskUsedPCT:" + info["DiskUsedPCT"])
            print_green("--------------Max user processes and max file count----------------------------------------")
            for key in ["/proc/sys/kernel/threads-max", "/etc/sysctl.conf kernel.pid_max", "/proc/sys/kernel/pid_max", "/proc/sys/vm/max_map_count", "/etc/security/limits.conf", "/etc/security/limits.d/80-nofile.conf", "/etc/sysctl.conf fs.file-max", "/proc/sys/fs/file-max"]:
                value = info[key]
                # An empty or non-numeric value (setting or file missing) is
                # reported as a failed check instead of aborting the report.
                try:
                    meets_minimum = int(value) >= 65535
                except ValueError:
                    meets_minimum = False
                if meets_minimum:
                    print_green("[OK] " + key + " = " + value)
                else:
                    print_red("[ERROR] please check " + key + " = " + value + ", no less than 65535.")
            print_green("--------------Thread count check-----------------------------------------------------------")
            if len(info["PoolSize"]) == 0:
                # Value used when the pool size is not set in the properties file.
                info["PoolSize"] = 500
            thread_capacity = int(info["NodeProcessors"]) * int(info["PoolSize"])
            if int(info["CurrentUseProcesses"]) < thread_capacity:
                print_green("[OK] The thread count = %s, the total processes = %s * %s = %i" % (info["CurrentUseProcesses"], info["NodeProcessors"], info["PoolSize"], thread_capacity))
            else:
                print_red("[ERROR] The thread count = %s, the total processes = %s * %s = %i. eggroll.rollpair.eggpair.server.executor.pool.max.size is not enough, turn it up." % (info["CurrentUseProcesses"], info["NodeProcessors"], info["PoolSize"], thread_capacity))
            if info["RollsiteUsedPercent"] != 0:
                print_green("----------Rollsite memory use percent--------------------------------------------------")
                print_yellow("[WARNING] rollsite memory use: " + info["RollsiteUsedPercent"])
            print("\n")
    finally:
        session.kill()
예제 #13
0
class RollSiteContext:
    """Context for roll site communication on top of a roll pair context.

    It keeps the roll site session id, the local role and party id, the proxy
    endpoint and a registry of communication implementations, and it creates
    ``RollSite`` objects through :meth:`load`.
    """

    # Channel factory shared by all instances.
    grpc_channel_factory = GrpcChannelFactory()

    def __init__(self,
                 roll_site_session_id,
                 rp_ctx: RollPairContext,
                 options: dict = None):
        """Initialise the context.

        Args:
            roll_site_session_id: id of the roll site session; it is also the
                prefix of the optional push session id (``<id>_push``).
            rp_ctx: roll pair context whose session receives the exit tasks
                registered here.
            options: configuration dict. ``"self_role"``, ``"self_party_id"``
                and ``"proxy_endpoint"`` are read with plain indexing, so
                they must be present. ``None`` is treated as an empty dict.

        Raises:
            KeyError: if one of the required options is missing.
            ValueError: if ``"proxy_endpoint"`` is neither a ``str`` nor an
                ``ErEndpoint``.
        """
        options = {} if options is None else options
        self.roll_site_session_id = roll_site_session_id
        self.rp_ctx = rp_ctx

        push_enabled_key = RollSiteConfKeys.EGGROLL_ROLLSITE_PUSH_SESSION_ENABLED
        self.push_session_enabled = push_enabled_key.get_with(options)
        if not self.push_session_enabled:
            self._push_session = None
            self._push_rp_ctx = None
        else:
            # A separate session is created for pushing roll pairs and
            # objects; it reuses the options of the main session.
            push_session_id = roll_site_session_id + "_push"
            main_session_options = rp_ctx.get_session().get_all_options()
            self._push_session = ErSession(session_id=push_session_id,
                                           options=main_session_options)
            self._push_rp_ctx = RollPairContext(session=self._push_session)
            L.info(
                f"push_session={self._push_session.get_session_id()} enabled")

            def stop_push_session():
                self._push_session.stop()

        self.role = options["self_role"]
        self.party_id = str(options["self_party_id"])
        self._options = options

        # The registry must exist before register_comm_type() writes to it.
        self._registered_comm_types = dict()
        self.register_comm_type('grpc', RollSiteGrpc)

        endpoint = options["proxy_endpoint"]
        if isinstance(endpoint, ErEndpoint):
            self.proxy_endpoint = endpoint
        elif isinstance(endpoint, str):
            # Expected form is "host:port"; only the first two fields are used.
            pieces = endpoint.split(':')
            host = pieces[0].strip()
            port = int(pieces[1].strip())
            self.proxy_endpoint = ErEndpoint(host=host, port=port)
        else:
            raise ValueError("endpoint only support str and ErEndpoint type")

        deploy_mode = RollSiteConfKeys.EGGROLL_ROLLSITE_DEPLOY_MODE.get_with(
            options)
        self.is_standalone = deploy_mode == "standalone"
        # NOTE: a gRPC stub used to be created here for non-standalone mode;
        # that code path is disabled.

        self.pushing_latch = CountDownLatch(0)
        # Exit tasks are added in this order: wait for pending pushes first,
        # then stop the push session (when there is one).
        self.rp_ctx.get_session().add_exit_task(self._wait_push_complete)
        if self.push_session_enabled:
            self.rp_ctx.get_session().add_exit_task(stop_push_session)
        overall_timeout = RollSiteConfKeys.EGGROLL_ROLLSITE_PUSH_OVERALL_TIMEOUT_SEC.get_with(
            options)
        self._wait_push_exit_timeout = int(overall_timeout)

        L.info(f"inited RollSiteContext: {self.__dict__}")

    def _wait_push_complete(self):
        """Exit task: wait on the pushing latch and log if pushes are left over.

        Waits up to ``self._wait_push_exit_timeout`` and logs an error when
        the count returned by the latch is not zero.
        """
        session_id = self.rp_ctx.get_session().get_session_id()
        L.info(f"running roll site exit func for er session={session_id},"
               f" roll site session id={self.roll_site_session_id}")
        residual_count = self.pushing_latch.await_latch(
            self._wait_push_exit_timeout)
        if residual_count == 0:
            return
        L.error(
            f"exit session when not finish push: "
            f"residual_count={residual_count}, timeout={self._wait_push_exit_timeout}"
        )

    def load(self, name: str, tag: str, options: dict = None):
        """Create a ``RollSite`` for ``name``/``tag``.

        The context options are copied and then overlaid with ``options``,
        so per-call entries take precedence.
        """
        merged_options = self._options.copy()
        if options is not None:
            merged_options.update(options)
        return RollSite(name, tag, self, options=merged_options)

    def register_comm_type(self, name, clazz):
        """Register ``clazz`` as the implementation of comm type ``name``."""
        self._registered_comm_types[name] = clazz

    def get_comm_impl(self, name):
        """Return the implementation registered for ``name``.

        Raises:
            ValueError: if no implementation is registered under ``name``.
        """
        if name not in self._registered_comm_types:
            raise ValueError(f'comm_type={name} is not registered')
        return self._registered_comm_types[name]
예제 #14
0
파일: server_check.py 프로젝트: xkazm/FATE
def check_actual_max_threads():
    """Collect service, route and job figures from every store and print a report.

    A session is created with ``args.nodes`` processors per node, a small
    roll pair is parallelized over ``args.partitions`` partitions, and
    ``getMemInfo`` is run through ``with_stores``. For each result the
    default route, the process count and memory of each listed service, and
    the per-job egg_pair process count and memory are printed. The session is
    always killed at the end.
    """
    def getMemInfo(fn):
        """Gather the figures of the local node.

        ``fn`` is the argument supplied by ``with_stores``; it is not used.
        Returns a dict holding the parsed route table, per-service and
        per-job command outputs.
        """
        def query_cmd(cmd):
            # Only the first line of the command's stdout is kept.
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE,
                shell=True).communicate()[0].decode().strip().split('\n')
            return p[0]

        def get_host_ip():
            # getsockname() on a socket connected towards 8.8.8.8 reports the
            # local address used for that route. The with-block closes the
            # socket and cannot hit an unbound name when socket creation
            # itself fails.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(('8.8.8.8', 80))
                return s.getsockname()[0]

        fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py"
        mem_info = {}
        mem_info["Ip"] = get_host_ip()
        eggroll_home = query_cmd("echo $EGGROLL_HOME")
        route_file = eggroll_home + "/conf/route_table.json"
        # The with-block closes the route file once it has been parsed.
        with open(route_file, encoding='utf-8') as f:
            mem_info["route_table"] = json.load(f)
        mem_info["services"] = [
            'ClusterManagerBootstrap', 'NodeManagerBootstrap', 'rollsite',
            'fate_flow_server.py', 'fateboard', 'mysql'
        ]
        # Both job counters yield -1 when the client script does not exist.
        mem_info["job_run"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))
        mem_info["job_wait"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))
        # Space-separated ids of the running jobs, as echoed by the shell.
        mem_info["jobs"] = query_cmd(
            "array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}"
            % (fate_flow_client))
        # Figures are kept per job id. The id string has to be split first:
        # iterating it directly would walk over single characters.
        mem_info["job_thread"] = {}
        mem_info["job_mem"] = {}
        for job_id in mem_info["jobs"].split():
            mem_info["job_thread"][job_id] = query_cmd(
                "ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %
                (job_id))
            mem_info["job_mem"][job_id] = query_cmd(
                "ps aux |grep egg_pair |grep %s |awk '{sum+=$6};END {print sum}'"
                % (job_id))
        mem_info["server_mem"] = {}
        mem_info["thread"] = {}
        for service in mem_info["services"]:
            mem_info["thread"][service] = query_cmd(
                "ps -ef |grep %s |grep -v grep |wc -l" % (service))
            mem_info["server_mem"][service] = str(
                query_cmd(
                    "ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'"
                    % (service)))
        return mem_info

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        route_error = "[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!"
        for node in result:
            info = node[1]
            print_green("==============This is node " + str(node[0]) + ":" +
                        info["Ip"] +
                        "===========================================")
            print_green(
                "-------------default route check-------------------------------------------------------"
            )
            # A file without a "route_table" section is handled like one
            # without a default route.
            routes = info["route_table"].get('route_table', {})
            if 'default' not in routes:
                print_red(route_error)
            else:
                try:
                    default_route = routes['default']['default'][0]
                    ip = default_route['ip']
                    port = default_route['port']
                    print_green("[OK] eggroll route configured!")
                    print_green("exchange ip:{}, exchange port:{}".format(
                        ip, port))
                except (KeyError, IndexError):
                    # IndexError covers an empty default route list.
                    print_red(route_error)

            print_green(
                "--------------fate service check-------------------------------------------------------"
            )
            for server in info["services"]:
                if int(info["thread"][server]) > 0:
                    print_green(
                        "[OK] the " + server.ljust(23) +
                        " service is running , number of processes is : " +
                        str(info["thread"][server]) + "; used memory : " +
                        str(info["server_mem"][server]) + "KB.")
                else:
                    print_yellow(
                        "[WARNING] the " + server +
                        " service not running, please check service status.")

            print_green(
                "--------------fate_flow jobs process and mem info check--------------------------------------------------"
            )
            if int(info["job_run"]) == -1:
                print_red(
                    "[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!"
                )
            else:
                print_green("[OK] Number of tasks running is " +
                            info["job_run"])
                print_green("[OK] Number of tasks waiting is " +
                            info["job_wait"])
                if int(info["job_run"]) > 0:
                    for job_id in info["jobs"].split():
                        print_green("[OK] running task job_id : " + job_id +
                                    ", number of egg_pair processes is : " +
                                    str(info["job_thread"][job_id]) +
                                    "; used memory : " +
                                    str(info["job_mem"][job_id]) + "KB.")

            print("\n")
    finally:
        session.kill()
예제 #15
0
#  See the License for the specific language governing permissions and
#  limitations under the License.

from eggroll.core.session import ErSession
from eggroll.roll_paillier_tensor.roll_paillier_tensor import RptContext
from eggroll.roll_pair.roll_pair import RollPairContext

import roll_paillier_tensor as rpt_engine
import unittest
import pandas as pd
from eggroll.core.io.kv_adapter import RocksdbSortedKvAdapter

#mat = pd.read_csv("/data/home/qijunhuang/czn/code/Python_C_Paillier/pData/testMat_mpi.csv").values


def _read_csv_values(path):
    """Return the ``.values`` array of the CSV file at ``path``."""
    return pd.read_csv(path).values


# Standalone session shared by the Paillier tensor context below.
session = ErSession(options={"eggroll.deploy.mode": "standalone"})
rptc = RptContext(RollPairContext(session))


# GEMM test inputs.
mat = _read_csv_values("/data/czn/data/testGemmMat.csv")
vec = _read_csv_values("/data/czn/data/testGemmVec.csv")

# Inputs for the LR test.
brest_G = _read_csv_values("/data/czn/data/breast_a_egr.csv")
brest_H = _read_csv_values("/data/czn/data/breast_b_egr.csv")
brest_Y = _read_csv_values("/data/czn/data/breast_b_y_egr.csv")

# "mini" variants of the same inputs.
brest_G_mini = _read_csv_values("/data/czn/data/breast_a_egr_mini.csv")
brest_H_mini = _read_csv_values("/data/czn/data/breast_b_egr_mini.csv")
brest_Y_mini = _read_csv_values("/data/czn/data/breast_b_y_egr_mini.csv")
brest_G_py = _read_csv_values("/data/czn/data/breast_a_egr_py.csv")
예제 #16
0
def get_cluster_context(options=None):
    """Create an ErSession from ``options`` and return it.

    ``None`` is treated as an empty options dict. The new session's id is
    printed to stdout. Despite the function name, the returned object is the
    session itself, not a roll pair context.
    """
    session = ErSession(options={} if options is None else options)
    print(session.get_session_id())
    return session
예제 #17
0
 def test_init(self):
     """Open a standalone session and print the value stored under "k1" in ns1/n21."""
     standalone_options = {"eggroll.deploy.mode": "standalone"}
     context = RollPairContext(ErSession(options=standalone_options))
     # Nothing is written here; the lookup prints whatever the store holds.
     table = context.load("ns1", "n21")
     print(table.get("k1"))
예제 #18
0
def check_actual_max_threads():
    """Check memory, swap, disk and process/file limits on every store and print a report.

    A session is created with ``args.nodes`` processors per node, a small
    roll pair is parallelized over ``args.partitions`` partitions, and
    ``getMemInfo`` is run through ``with_stores``. For each result the usage
    figures are printed, followed by an OK/ERROR line per limit. The session
    is always killed at the end.
    """
    def getMemInfo(fn):
        """Gather the figures of the local node.

        ``fn`` is the argument supplied by ``with_stores``; it is not used.
        Returns a dict with display-ready usage values and one boolean per
        checked limit.
        """
        def query_cmd(cmd):
            """Return True when every output line of ``cmd`` is an integer >= 65535.

            Empty or non-numeric output (for example a missing file or
            setting) counts as a failed check instead of raising.
            """
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE,
                shell=True).communicate()[0].decode().strip().split('\n')
            print(p)
            try:
                return all(int(i) >= 65535 for i in p)
            except ValueError:
                return False

        mem = psutil.virtual_memory()
        mem_total = round2(mem.total)
        mem_used = round2(mem.used)
        mem_used_per = str(round(mem.percent)) + '%'

        swap_mem = psutil.swap_memory()
        swap_total = round2(swap_mem.total)
        swap_used = round2(swap_mem.used)
        swap_use_per = str(round(swap_mem.percent)) + '%'

        data_disk = psutil.disk_usage('/data')
        disk_total = round2(data_disk.total)
        disk_used = round2(data_disk.used)
        disk_per = str(round(data_disk.percent)) + '%'

        mem_info = {}
        mem_info["MemTotal"] = mem_total
        mem_info["MemUsed"] = mem_used
        mem_info["MemUsedPer"] = mem_used_per

        mem_info["SwapTotal"] = swap_total
        mem_info["SwapUsed"] = swap_used
        mem_info["SwapUsePer"] = swap_use_per

        mem_info["DiskTotal"] = disk_total
        mem_info["DiskUsed"] = disk_used
        mem_info["DiskPer"] = disk_per

        mem_info["/proc/sys/kernel/threads-max"] = query_cmd(
            "cat /proc/sys/kernel/threads-max")
        # Two different settings are read from /etc/sysctl.conf; each gets
        # its own key so the second no longer overwrites the first.
        mem_info["/etc/sysctl.conf kernel.pid_max"] = query_cmd(
            "grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/kernel/pid_max"] = query_cmd(
            "cat /proc/sys/kernel/pid_max")
        mem_info["/proc/sys/vm/max_map_count"] = query_cmd(
            "cat /proc/sys/vm/max_map_count")

        mem_info["/etc/security/limits.conf"] = query_cmd(
            "cat /etc/security/limits.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd(
            "cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'"
        )
        mem_info["/etc/sysctl.conf fs.file-max"] = query_cmd(
            "grep fs.file-max  /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/fs/file-max"] = query_cmd(
            "cat /proc/sys/fs/file-max")

        return mem_info

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            info = node[1]
            print_green("==============This is node :" + str(node[0]) +
                        "================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] +
                         ", MemUsed:" + info["MemUsed"] + ", MemUsedPer:" +
                         info["MemUsedPer"])
            print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] +
                         ", SwapUsed:" + info["SwapUsed"] +
                         ", SwapUsePer:" + info["SwapUsePer"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] +
                         ", DiskUsed:" + info["DiskUsed"] + ", DiskPer:" +
                         info["DiskPer"])
            print_green(
                "--------Max user processes and max file count--------")
            for key in [
                    "/proc/sys/kernel/threads-max",
                    "/etc/sysctl.conf kernel.pid_max",
                    "/proc/sys/kernel/pid_max", "/proc/sys/vm/max_map_count",
                    "/etc/security/limits.conf",
                    "/etc/security/limits.d/80-nofile.conf",
                    "/etc/sysctl.conf fs.file-max", "/proc/sys/fs/file-max"
            ]:
                if info[key]:
                    print_green("[OK] " + key + " is ok.")
                else:
                    print_red("[ERROR] please check " + key +
                              ", no less than 65535.")
            print("\n")
    finally:
        session.kill()