Exemplo n.º 1
0
    def __init__(self,
                 config,
                 train_data,
                 model_creator,
                 loss_creator=None,
                 train_resize_batch_num=None,
                 eval_metrics_creator=None,
                 test_data=None,
                 validation_metrics_creator=None,
                 num_workers=1,
                 num_servers=None,
                 runner_cores=None):
        """
        Start MXNetRunner actors on Ray and call setup_distributed on each of them.

        :param config: stored as self.config and forwarded to every runner.
        :param train_data: stored as self.train_data and forwarded to every runner.
        :param model_creator: stored and forwarded to every runner.
        :param loss_creator: stored and forwarded to every runner.
        :param train_resize_batch_num: stored and forwarded to every runner.
        :param eval_metrics_creator: stored and forwarded to every runner.
        :param test_data: stored as self.test_data and forwarded to every runner.
        :param validation_metrics_creator: stored and forwarded to every runner.
        :param num_workers: number of worker actors to start.
        :param num_servers: number of server actors to start. A falsy value means
               "use the same number as num_workers".
        :param runner_cores: when truthy, the num_cpus requested for every actor; in that
               case each actor also requests one unit of a custom Ray resource.
        """
        # Plain bookkeeping of the constructor arguments.
        self.config = config
        self.model_creator = model_creator
        self.loss_creator = loss_creator
        self.eval_metrics_creator = eval_metrics_creator
        self.validation_metrics_creator = validation_metrics_creator
        self.train_data = train_data
        self.test_data = test_data
        self.train_resize_batch_num = train_resize_batch_num
        self.num_workers = num_workers
        self.num_servers = num_servers or num_workers

        # Build the two actor classes. With runner_cores given, workers and servers request
        # distinct dummy custom resources (_mxnet_worker / _mxnet_server) so that one worker
        # and one server can be placed on a node for better performance.
        if runner_cores:
            worker_cls = ray.remote(num_cpus=runner_cores,
                                    resources={"_mxnet_worker": 1})(MXNetRunner)
            server_cls = ray.remote(num_cpus=runner_cores,
                                    resources={"_mxnet_server": 1})(MXNetRunner)
        else:
            worker_cls = ray.remote(MXNetRunner)
            server_cls = ray.remote(MXNetRunner)

        # Runner order matters below: all workers first, then all servers.
        worker_actors = [worker_cls.remote() for _ in range(self.num_workers)]
        server_actors = [server_cls.remote() for _ in range(self.num_servers)]
        self.runners = worker_actors + server_actors

        # Ask every actor for its node IP and a free port; in this method the answers
        # are only written to the root logger.
        node_ips = ray.get([actor.get_node_ip.remote() for actor in self.runners])
        free_ports = ray.get([actor.find_free_port.remote() for actor in self.runners])
        root_logger = logging.getLogger()
        root_logger.info(node_ips)
        root_logger.info(free_ports)

        # Settings shared by every role; the root URI/port come from the local process.
        base_env = {
            "DMLC_PS_ROOT_URI": str(get_host_ip()),
            "DMLC_PS_ROOT_PORT": str(find_free_port()),
            "DMLC_NUM_SERVER": str(self.num_servers),
            "DMLC_NUM_WORKER": str(self.num_workers),
        }
        # One private env dict per runner, in the same order as self.runners.
        role_envs = [dict(base_env, DMLC_ROLE="worker") for _ in range(self.num_workers)]
        role_envs += [dict(base_env, DMLC_ROLE="server") for _ in range(self.num_servers)]

        # The scheduler runs as a local child process. It needs the full system
        # environment (to run bash) plus the DMLC settings.
        scheduler_env = os.environ.copy()
        scheduler_env.update(base_env, DMLC_ROLE="scheduler")
        # TODO: Need to kill this process manually?
        subprocess.Popen("python -c 'import mxnet'",
                         shell=True,
                         env=scheduler_env)

        # Block until every runner has finished its distributed setup.
        setup_calls = []
        for actor, actor_env in zip(self.runners, role_envs):
            setup_calls.append(
                actor.setup_distributed.remote(
                    actor_env, self.config, self.train_data, self.model_creator,
                    self.loss_creator, self.validation_metrics_creator,
                    self.test_data, self.train_resize_batch_num,
                    self.eval_metrics_creator))
        ray.get(setup_calls)
Exemplo n.º 2
0
    def __init__(self,
                 config,
                 model_creator,
                 loss_creator=None,
                 eval_metrics_creator=None,
                 validation_metrics_creator=None,
                 num_workers=None,
                 num_servers=None,
                 runner_cores=None):
        """
        Start MXNetRunner actors on Ray and call setup_distributed on each of them.

        :param config: dict that must contain the keys "optimizer", "optimizer_params" and
               "log_interval". None is treated as an empty dict, which then fails the
               required-key check. Forwarded to every runner.
        :param model_creator: stored and forwarded to every runner.
        :param loss_creator: stored and forwarded to every runner.
        :param eval_metrics_creator: stored and forwarded to every runner.
        :param validation_metrics_creator: stored and forwarded to every runner.
        :param num_workers: number of worker actors to start. A falsy value means
               "use RayContext.get().num_ray_nodes".
        :param num_servers: number of server actors to start. A falsy value means
               "use the same number as num_workers".
        :param runner_cores: when truthy, the num_cpus requested for every actor; in that
               case each actor also requests one unit of a custom Ray resource.
        :raises AssertionError: if config is not a dict or lacks a required key.
        """
        ray_ctx = RayContext.get()
        if not num_workers:
            num_workers = ray_ctx.num_ray_nodes

        # Normalise first and validate the normalised value. Checking the raw argument
        # instead would make the None -> {} fallback dead code and report a None config
        # as "config must be a dict" rather than naming the missing key.
        # The checks stay as asserts so callers keep seeing AssertionError.
        self.config = {} if config is None else config
        assert isinstance(self.config, dict), "config must be a dict"
        for param in ["optimizer", "optimizer_params", "log_interval"]:
            assert param in self.config, param + " must be specified in config"

        self.model_creator = model_creator
        self.loss_creator = loss_creator
        self.validation_metrics_creator = validation_metrics_creator
        self.eval_metrics_creator = eval_metrics_creator
        self.num_workers = num_workers
        self.num_servers = num_servers if num_servers else self.num_workers

        # Generate actor class
        # Add a dummy custom resource: _mxnet_worker and _mxnet_server to diff worker from server
        # if runner_cores is specified so that we can place one worker and one server on a node
        # for better performance.
        Worker = ray.remote(num_cpus=runner_cores, resources={"_mxnet_worker": 1})(MXNetRunner) \
            if runner_cores else ray.remote(MXNetRunner)
        Server = ray.remote(num_cpus=runner_cores, resources={"_mxnet_server": 1})(MXNetRunner) \
            if runner_cores else ray.remote(MXNetRunner)

        # Start runners: workers followed by servers
        self.workers = [Worker.remote() for _ in range(self.num_workers)]
        self.servers = [Server.remote() for _ in range(self.num_servers)]
        self.runners = self.workers + self.servers

        # Settings shared by every role; the root URI/port come from the local process.
        env = {
            "DMLC_PS_ROOT_URI": str(get_host_ip()),
            "DMLC_PS_ROOT_PORT": str(find_free_port()),
            "DMLC_NUM_SERVER": str(self.num_servers),
            "DMLC_NUM_WORKER": str(self.num_workers),
        }
        # One private env dict per runner, in the same order as self.runners.
        envs = []
        for _ in range(self.num_workers):
            current_env = env.copy()
            current_env['DMLC_ROLE'] = 'worker'
            envs.append(current_env)
        for _ in range(self.num_servers):
            current_env = env.copy()
            current_env['DMLC_ROLE'] = 'server'
            envs.append(current_env)

        # The scheduler runs as a local child process with the DMLC settings layered on
        # top of the system environment.
        env['DMLC_ROLE'] = 'scheduler'
        modified_env = os.environ.copy()
        modified_env.update(env)
        # Need to contain system env to run bash
        # TODO: Need to kill this process manually?
        subprocess.Popen("python -c 'import mxnet'",
                         shell=True,
                         env=modified_env)

        # Block until every runner has finished its distributed setup.
        ray.get([
            runner.setup_distributed.remote(envs[i], self.config,
                                            self.model_creator,
                                            self.loss_creator,
                                            self.validation_metrics_creator,
                                            self.eval_metrics_creator)
            for i, runner in enumerate(self.runners)
        ])
Exemplo n.º 3
0
    def __init__(self,
                 config,
                 train_data,
                 model_creator,
                 loss_creator=None,
                 train_resize_batch_num=None,
                 eval_metrics_creator=None,
                 test_data=None,
                 validation_metrics_creator=None,
                 num_workers=1,
                 num_servers=None,
                 runner_cores=None):
        """
        Prepare the data shards, start MXNetRunner actors on Ray and call
        setup_distributed on each of them.

        :param config: stored as self.config and forwarded to every runner.
        :param train_data: training data. SparkXShards input is repartitioned to num_workers
               and converted with to_ray(); RayXShards input is repartitioned when its
               partition count differs from num_workers. Other types are passed through.
        :param model_creator: stored and forwarded to every runner.
        :param loss_creator: stored and forwarded to every runner.
        :param train_resize_batch_num: stored and forwarded to every runner.
        :param eval_metrics_creator: stored and forwarded to every runner.
        :param test_data: optional test data. When truthy, it must be of the same XShards
               kind as train_data and receives the same treatment.
        :param validation_metrics_creator: stored and forwarded to every runner.
        :param num_workers: number of worker actors to start.
        :param num_servers: number of server actors to start. A falsy value means
               "use the same number as num_workers".
        :param runner_cores: when truthy, the num_cpus requested for every actor; in that
               case each actor also requests one unit of a custom Ray resource.
        :raises AssertionError: if test_data is truthy but its XShards kind does not match
                train_data.
        """
        self.num_workers = num_workers
        self.num_servers = num_servers or num_workers
        self.config = config
        self.model_creator = model_creator
        self.loss_creator = loss_creator
        self.eval_metrics_creator = eval_metrics_creator
        self.validation_metrics_creator = validation_metrics_creator
        self.train_resize_batch_num = train_resize_batch_num

        from zoo.orca.data import RayXShards, SparkXShards

        # Step 1: Spark-backed shards are repartitioned and moved over to Ray.
        if isinstance(train_data, SparkXShards):
            train_data = train_data.repartition(self.num_workers).to_ray()
            if test_data:
                assert isinstance(test_data, SparkXShards)
                test_data = test_data.repartition(self.num_workers).to_ray()

        # Step 2: Ray-backed shards (including the ones just converted) should have one
        # partition per worker.
        # NOTE(review): the return value of repartition is discarded here, so this relies
        # on RayXShards.repartition working in place -- confirm against its implementation.
        if isinstance(train_data, RayXShards):
            if train_data.num_partitions() != self.num_workers:
                train_data.repartition(self.num_workers)
            if test_data:
                assert isinstance(test_data, RayXShards)
                if test_data.num_partitions() != self.num_workers:
                    test_data.repartition(self.num_workers)

        self.train_data = train_data
        self.test_data = test_data

        # Build the two actor classes. With runner_cores given, workers and servers request
        # distinct dummy custom resources (_mxnet_worker / _mxnet_server) so that one worker
        # and one server can be placed on a node for better performance.
        if runner_cores:
            worker_cls = ray.remote(num_cpus=runner_cores,
                                    resources={"_mxnet_worker": 1})(MXNetRunner)
            server_cls = ray.remote(num_cpus=runner_cores,
                                    resources={"_mxnet_server": 1})(MXNetRunner)
        else:
            worker_cls = ray.remote(MXNetRunner)
            server_cls = ray.remote(MXNetRunner)

        self.workers = [worker_cls.remote() for _ in range(self.num_workers)]
        self.servers = [server_cls.remote() for _ in range(self.num_servers)]

        # With Ray-backed training data, the worker list is replaced by whatever
        # colocate_actors returns for it.
        if isinstance(self.train_data, RayXShards):
            self.workers = self.train_data.colocate_actors(self.workers)
        # Runner order matters below: all workers first, then all servers.
        self.runners = self.workers + self.servers

        # Settings shared by every role; the root URI/port come from the local process.
        shared_env = {
            "DMLC_PS_ROOT_URI": str(get_host_ip()),
            "DMLC_PS_ROOT_PORT": str(find_free_port()),
            "DMLC_NUM_SERVER": str(self.num_servers),
            "DMLC_NUM_WORKER": str(self.num_workers),
        }
        # One private env dict per runner: worker entries first, then server entries.
        role_envs = []
        for role, count in (("worker", self.num_workers), ("server", self.num_servers)):
            role_envs.extend(dict(shared_env, DMLC_ROLE=role) for _ in range(count))

        # The scheduler runs as a local child process. It needs the full system
        # environment (to run bash) plus the DMLC settings.
        scheduler_env = os.environ.copy()
        scheduler_env.update(shared_env)
        scheduler_env["DMLC_ROLE"] = "scheduler"
        # TODO: Need to kill this process manually?
        subprocess.Popen("python -c 'import mxnet'",
                         shell=True,
                         env=scheduler_env)

        # Block until every runner has finished its distributed setup.
        pending_setups = [
            actor.setup_distributed.remote(
                role_envs[idx], self.config, self.train_data, self.model_creator,
                self.loss_creator, self.validation_metrics_creator,
                self.test_data, self.train_resize_batch_num,
                self.eval_metrics_creator)
            for idx, actor in enumerate(self.runners)
        ]
        ray.get(pending_setups)