Example #1
    def __init__(self, hosts=None, processes_per_node=1, env=None):
        driver_ip = get_node_ip()
        if hosts is None:  # Single node
            self.hosts = [driver_ip]
        elif hosts == "all":  # All executor nodes in the cluster

            def get_ip(iter):
                yield get_node_ip()

            from bigdl.dllib.utils.common import get_node_and_core_number
            from bigdl.orca import OrcaContext
            sc = OrcaContext.get_spark_context()
            node_num, core_num = get_node_and_core_number()
            total_cores = node_num * core_num
            self.hosts = list(
                set(
                    sc.range(0, total_cores,
                             numSlices=total_cores).barrier().mapPartitions(
                                 get_ip).collect()))
        else:  # User-specified hosts, assumed to contain no duplicates
            assert isinstance(hosts, list)
            self.hosts = hosts

        self.master = self.hosts[0]
        print("Master: ", self.master)
        self.remote_hosts = []
        for host in self.hosts:
            if host != driver_ip:
                self.remote_hosts.append(host)
        print("Remote hosts: ", self.remote_hosts)
        print("Hosts: ", self.hosts)
        self.processes_per_node = processes_per_node
        self.env = env if env else {}
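
As a companion note (not from the original source): the driver/remote-host bookkeeping above only needs a way to resolve the local machine's IP. A minimal, self-contained stand-in is sketched below; the UDP-socket trick is a common approximation and is not BigDL's actual get_node_ip implementation, the extra host addresses are placeholders, and an outbound network route is assumed.

import socket

def local_ip():
    # Learn the outbound interface's IP by "connecting" a UDP socket;
    # no packet is actually sent for a UDP connect.
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(("8.8.8.8", 80))
        return s.getsockname()[0]
    finally:
        s.close()

driver_ip = local_ip()
hosts = [driver_ip, "10.0.0.2", "10.0.0.3"]  # placeholder host list
remote_hosts = [h for h in hosts if h != driver_ip]
print("Master:", hosts[0])
print("Remote hosts:", remote_hosts)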
Example #2
        def _start_raylets(iter):
            from bigdl.dllib.utils.utils import get_node_ip
            current_ip = get_node_ip()
            master_ip = redis_address.split(":")[0]
            do_start = True
            process_info = None
            base_path = tempfile.gettempdir()
            ray_master_flag_path = os.path.join(base_path, self.ray_master_flag)
            # If there is already a ray master on this node, we need to start one less raylet.
            if current_ip == master_ip:
                ray_master_lock_path = os.path.join(base_path, self.ray_master_lock)
                with filelock.FileLock(ray_master_lock_path):
                    if not os.path.exists(ray_master_flag_path):
                        os.mknod(ray_master_flag_path)
                        do_start = False
            if do_start:
                raylet_lock_path = os.path.join(base_path, self.raylet_lock)
                with filelock.FileLock(raylet_lock_path):
                    process_info = self._start_ray_node(
                        command=RayServiceFuncGenerator._get_raylet_command(
                            redis_address=redis_address,
                            ray_exec=self.ray_exec,
                            password=self.password,
                            ray_node_cpu_cores=self.ray_node_cpu_cores,
                            labels=self.labels,
                            object_store_memory=self.object_store_memory,
                            extra_params=self.extra_params),
                        tag="raylet")
                    kill_redundant_log_monitors(redis_address=redis_address)
            # Cannot remove ray_master_flag at the end of this task since no barrier is guaranteed.

            yield process_info
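
The guard above combines a file lock with a flag file so that only one task per node performs the one-time work. A standalone sketch of that pattern (assumes the filelock package is installed; the file names and action are illustrative, not from the original):

import os
import tempfile
import filelock

FLAG_PATH = os.path.join(tempfile.gettempdir(), "demo_once.flag")  # illustrative names
LOCK_PATH = os.path.join(tempfile.gettempdir(), "demo_once.lock")

def run_once_per_node(action):
    # Serialize concurrent callers on this node; only the first one acts.
    with filelock.FileLock(LOCK_PATH):
        if not os.path.exists(FLAG_PATH):
            open(FLAG_PATH, "a").close()  # portable alternative to os.mknod
            action()
            return True
    return False

print(run_once_per_node(lambda: print("one-time setup")))  # True
print(run_once_per_node(lambda: print("one-time setup")))  # False, already done
os.remove(FLAG_PATH)  # clean up so the demo can be rerun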
Example #3
    def shutdown_plasma(self):
        for host in self.hosts:
            if host != get_node_ip():
                p = subprocess.Popen(
                    ["ssh", "root@{}".format(host), "pkill plasma"])
            else:
                p = subprocess.Popen(["pkill", "plasma"])
            os.waitpid(p.pid, 0)
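
A generalized sketch of the local-versus-remote dispatch used by shutdown_plasma (not from the original; it assumes passwordless root SSH for remote hosts and substitutes a hostname-based IP lookup for get_node_ip):

import socket
import subprocess

def run_on_hosts(hosts, command, local_ip):
    procs = []
    for host in hosts:
        if host == local_ip:
            procs.append(subprocess.Popen(command.split()))
        else:
            procs.append(subprocess.Popen(["ssh", "root@{}".format(host), command]))
    for p in procs:
        p.wait()  # block until every host has finished

my_ip = socket.gethostbyname(socket.gethostname())
run_on_hosts([my_ip], "echo hello", my_ip)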
Example #4
    def init_spark_on_k8s(self,
                          master,
                          container_image,
                          num_executors,
                          executor_cores,
                          executor_memory="2g",
                          driver_memory="1g",
                          driver_cores=4,
                          extra_executor_memory_for_ray=None,
                          extra_python_lib=None,
                          conf=None,
                          jars=None,
                          python_location=None):
        print("Initializing SparkContext for k8s-client mode")
        if "PYSPARK_PYTHON" not in os.environ:
            os.environ["PYSPARK_PYTHON"] = \
                python_location if python_location else detect_python_location()
        python_env = "/".join(os.environ["PYSPARK_PYTHON"].split("/")[:-2])

        submit_args = "--master " + master + " --deploy-mode client"
        submit_args = submit_args + gen_submit_args(
            driver_cores, driver_memory, num_executors, executor_cores,
            executor_memory, extra_python_lib, jars)

        conf = enrich_conf_for_spark(conf, driver_cores, driver_memory, num_executors,
                                     executor_cores, executor_memory, extra_executor_memory_for_ray)
        py_version = ".".join(platform.python_version().split(".")[0:2])
        preload_so = python_env + "/lib/libpython" + py_version + "m.so"
        ld_path = python_env + "/lib:" + python_env + "/lib/python" +\
            py_version + "/lib-dynload"
        if "spark.executor.extraLibraryPath" in conf:
            ld_path = "{}:{}".format(ld_path, conf["spark.executor.extraLibraryPath"])
        conf.update({"spark.cores.max": num_executors * executor_cores,
                     "spark.executorEnv.PYTHONHOME": python_env,
                     "spark.executor.extraLibraryPath": ld_path,
                     "spark.executorEnv.LD_PRELOAD": preload_so,
                     "spark.kubernetes.container.image": container_image})
        if "spark.driver.host" not in conf:
            conf["spark.driver.host"] = get_node_ip()
        if "spark.driver.port" not in conf:
            conf["spark.driver.port"] = random.randint(10000, 65535)
        if "BIGDL_CLASSPATH" in os.environ:
            zoo_bigdl_jar_path = os.environ["BIGDL_CLASSPATH"]
        else:
            zoo_bigdl_jar_path = get_zoo_bigdl_classpath_on_driver()
        if "spark.executor.extraClassPath" in conf:
            conf["spark.executor.extraClassPath"] = "{}:{}".format(
                zoo_bigdl_jar_path, conf["spark.executor.extraClassPath"])
        else:
            conf["spark.executor.extraClassPath"] = zoo_bigdl_jar_path

        sc = self.create_sc(submit_args, conf)
        return sc
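
The interpreter-related paths above can be reproduced in isolation. A minimal sketch (using the current interpreter instead of PYSPARK_PYTHON; POSIX-style paths are assumed, and the "m" ABI suffix in libpythonX.Ym.so only applies to Python 3.7 and earlier):

import platform
import sys

python_env = "/".join(sys.executable.split("/")[:-2])              # e.g. /opt/conda/envs/py37
py_version = ".".join(platform.python_version().split(".")[0:2])   # e.g. "3.7"
preload_so = "{}/lib/libpython{}m.so".format(python_env, py_version)
ld_path = "{0}/lib:{0}/lib/python{1}/lib-dynload".format(python_env, py_version)
print(preload_so)
print(ld_path)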
Example #5
    def _start_ray_master(index, iter):
        from bigdl.dllib.utils.utils import get_node_ip
        process_info = None
        if index == 0:
            print("partition id is : {}".format(index))
            current_ip = get_node_ip()
            print("master address {}".format(current_ip))
            redis_address = "{}:{}".format(current_ip, self.redis_port)
            process_info = self._start_ray_node(command=self._gen_master_command(),
                                                tag="ray-master")
            process_info.master_addr = redis_address
        yield process_info
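
This function is meant to run under mapPartitionsWithIndex so that only partition 0 starts the Ray master. A stripped-down, runnable illustration of that pattern (a local SparkContext is assumed; nothing Ray-related is launched):

from pyspark import SparkContext

def act_on_partition_zero(index, iterator):
    # Only the first partition performs the one-off action.
    yield "master-started" if index == 0 else None

if __name__ == "__main__":
    sc = SparkContext(master="local[2]", appName="partition-zero-sketch")
    out = sc.range(0, 2, numSlices=2).mapPartitionsWithIndex(act_on_partition_zero).collect()
    print(out)  # ['master-started', None]
    sc.stop()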
Example #6
    def f(index, iterator):
        import pyarrow.plasma as plasma
        client = plasma.connect(address)
        part_size = 1000000  # TODO: Make subpartition size configurable?
        buffer = []
        sub_index = 0
        for record in iterator:
            if len(buffer) == part_size:
                res_buffer = process_records(buffer)
                object_id = client.put(res_buffer)
                buffer = [record]
                yield index, sub_index, part_size, object_id, get_node_ip()
                sub_index += 1
            else:
                buffer.append(record)
        remain_size = len(buffer)
        if remain_size > 0:
            res_buffer = process_records(buffer)
            object_id = client.put(res_buffer)
            buffer = []
            client.disconnect()
            yield index, sub_index, remain_size, object_id, get_node_ip()
        else:
            client.disconnect()
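
Stripped of Plasma, the buffering logic above is plain fixed-size chunking with a final flush for the leftover records. An equivalent standalone sketch (process_records is replaced by list() to keep it runnable; chunk boundaries match the code above):

def chunked(iterator, part_size=3):
    buffer = []
    sub_index = 0
    for record in iterator:
        buffer.append(record)
        if len(buffer) == part_size:
            yield sub_index, len(buffer), list(buffer)
            buffer = []
            sub_index += 1
    if buffer:  # flush the final, possibly smaller, chunk
        yield sub_index, len(buffer), list(buffer)

for sub_index, size, chunk in chunked(range(8), part_size=3):
    print(sub_index, size, chunk)  # (0, 3, [0, 1, 2]), (1, 3, [3, 4, 5]), (2, 2, [6, 7])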
Example #7
        def _start_ray_services(iter):
            from pyspark import BarrierTaskContext
            from bigdl.dllib.utils.utils import get_node_ip
            tc = BarrierTaskContext.get()
            current_ip = get_node_ip()
            print("current address {}".format(current_ip))
            print("master address {}".format(master_ip))
            redis_address = "{}:{}".format(master_ip, self.redis_port)
            process_info = None
            base_path = tempfile.gettempdir()
            ray_master_flag_path = os.path.join(base_path, self.ray_master_flag)
            if current_ip == master_ip:  # Start the ray master.
                # It is possible that multiple executors are on one node. In this case,
                # the first executor that gets the lock would be the master and it would
                # create a flag to indicate the master has initialized.
                # The flag file is removed when ray start processes finish so that this
                # won't affect other programs.
                ray_master_lock_path = os.path.join(base_path, self.ray_master_lock)
                with filelock.FileLock(ray_master_lock_path):
                    if not os.path.exists(ray_master_flag_path):
                        print("partition id is : {}".format(tc.partitionId()))
                        process_info = self._start_ray_node(command=self._gen_master_command(),
                                                            tag="ray-master")
                        process_info.master_addr = redis_address
                        os.mknod(ray_master_flag_path)

            tc.barrier()
            if not process_info:  # Start raylets.
                # Add a lock to avoid starting multiple raylets on one node at the same time.
                # See this issue: https://github.com/ray-project/ray/issues/10154
                raylet_lock_path = os.path.join(base_path, self.raylet_lock)
                with filelock.FileLock(raylet_lock_path):
                    print("partition id is : {}".format(tc.partitionId()))
                    process_info = self._start_ray_node(
                        command=RayServiceFuncGenerator._get_raylet_command(
                            redis_address=redis_address,
                            ray_exec=self.ray_exec,
                            password=self.password,
                            ray_node_cpu_cores=self.ray_node_cpu_cores,
                            labels=self.labels,
                            object_store_memory=self.object_store_memory,
                            extra_params=self.extra_params),
                        tag="raylet")
                    kill_redundant_log_monitors(redis_address=redis_address)

            if os.path.exists(ray_master_flag_path):
                os.remove(ray_master_flag_path)
            yield process_info
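
The master/raylet split above relies on Spark's barrier execution mode: every task reaches tc.barrier() before any raylet starts, so the Redis address is guaranteed to exist. A minimal, Ray-free sketch of that coordination (a local SparkContext is assumed):

from pyspark import SparkContext, BarrierTaskContext

def coordinated(_):
    tc = BarrierTaskContext.get()
    pid = tc.partitionId()
    # ... partition 0 would start the "master" here ...
    tc.barrier()  # wait until every task has reached this point
    # ... the other partitions would start "workers" here ...
    yield pid

if __name__ == "__main__":
    sc = SparkContext(master="local[4]", appName="barrier-sketch")
    print(sc.range(0, 4, numSlices=4).barrier().mapPartitions(coordinated).collect())
    sc.stop()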
Example #8
    def launch_plasma(self, object_store_memory="2g"):
        import atexit
        atexit.register(self.shutdown_plasma)
        # TODO: Or can use spark to launch plasma
        from bigdl.orca.ray.utils import resource_to_bytes
        self.plasma_path = "/".join(
            sys.executable.split("/")[:-1] + ["plasma_store"])
        self.object_store_memory = resource_to_bytes(object_store_memory)
        self.object_store_address = "/tmp/analytics_zoo_plasma"
        command = "{} -m {} -s {}".format(self.plasma_path,
                                          self.object_store_memory,
                                          self.object_store_address)
        for host in self.hosts:
            if host != get_node_ip():
                p = subprocess.Popen(["ssh", "root@{}".format(host), command])
            else:
                p = subprocess.Popen(command.split())
            print("Plasma launched on {}".format(host))
        return self.object_store_address
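
resource_to_bytes is imported but not shown in the excerpt. An illustrative helper in the same spirit (my own sketch, not the BigDL implementation) converts a "2g"/"512m"-style string into the byte count passed to plasma_store -m:

def to_bytes(resource_str):
    units = {"k": 1024, "m": 1024 ** 2, "g": 1024 ** 3}
    s = resource_str.strip().lower()
    if s and s[-1] in units:
        return int(float(s[:-1]) * units[s[-1]])
    return int(s)  # already a plain byte count

print(to_bytes("2g"))    # 2147483648
print(to_bytes("512m"))  # 536870912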
Example #9
    def __init__(self,
                 meta_data,
                 object_store_address,
                 workers_per_node=1,
                 batch_size=1):
        import pyarrow.plasma as plasma
        self.client = plasma.connect(object_store_address)
        print("Connected to plasma")

        # All the subpartitions on this node
        all_data = [
            subpartition for subpartition in meta_data
            if subpartition[4] == get_node_ip()
        ]
        rank = int(os.environ.get("PMI_RANK", 0))
        print("Global rank: ", rank)
        # rank = int(os.environ.get("PMIX_RANK", 0))  # For OpenMPI
        local_rank = rank % workers_per_node
        print("Local rank: ", local_rank)
        data_splits = list(chunks(all_data, len(all_data) // workers_per_node))
        worker_data = data_splits[local_rank]
        if len(data_splits) == (workers_per_node +
                                1):  # Can't evenly split among workers
            remain_data = data_splits[-1]
            if local_rank < len(remain_data):
                worker_data += [remain_data[local_rank]]
        self.object_ids = [subpartition[3] for subpartition in worker_data]
        self.sizes = [subpartition[2] for subpartition in worker_data]
        print("Data size for worker: ", sum(self.sizes))
        self.batch_size = batch_size
        offsets = []
        for i in self.sizes:
            if len(offsets) == 0:
                offsets.append(i)
            else:
                offsets.append(offsets[-1] + i)
        self.offsets = offsets
        self.current_index = 0  # Index of the object_id whose data is currently loaded
        self.load_from_plasma(self.current_index)
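
The chunks helper used above is not shown in the excerpt. A common definition (assumed here, not necessarily BigDL's) splits a list into fixed-size pieces, which also explains the workers_per_node + 1 branch: with 10 subpartitions and 4 workers, the chunk size is 10 // 4 = 2 and a fifth leftover chunk appears.

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

all_data = list(range(10))  # stand-in for the subpartition metadata
workers_per_node = 4
splits = list(chunks(all_data, len(all_data) // workers_per_node))
print(splits)  # [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]] -> 5 == workers_per_node + 1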
Example #10
    def init_spark_standalone(self,
                              num_executors,
                              executor_cores,
                              executor_memory="2g",
                              driver_cores=4,
                              driver_memory="1g",
                              master=None,
                              extra_executor_memory_for_ray=None,
                              extra_python_lib=None,
                              conf=None,
                              jars=None,
                              python_location=None,
                              enable_numa_binding=False):
        import subprocess
        import pyspark
        from bigdl.dllib.utils.utils import get_node_ip

        if "PYSPARK_PYTHON" not in os.environ:
            os.environ["PYSPARK_PYTHON"] = \
                python_location if python_location else detect_python_location()
        if not master:
            pyspark_home = os.path.abspath(pyspark.__file__ + "/../")
            zoo_standalone_home = os.path.abspath(__file__ + "/../../share/bin/standalone")
            node_ip = get_node_ip()
            SparkRunner.standalone_env = {
                "SPARK_HOME": pyspark_home,
                "ZOO_STANDALONE_HOME": zoo_standalone_home,
                # If this is not set, the master address defaults to the hostname instead of the IP.
                "SPARK_MASTER_HOST": node_ip}
            if 'JAVA_HOME' in os.environ:
                SparkRunner.standalone_env["JAVA_HOME"] = os.environ["JAVA_HOME"]
            # The scripts installed via pip don't have execute permission,
            # so grant it first.
            pro = subprocess.Popen(["chmod", "-R", "+x", "{}/sbin".format(zoo_standalone_home)])
            os.waitpid(pro.pid, 0)
            # Start master
            start_master_pro = subprocess.Popen(
                "{}/sbin/start-master.sh".format(zoo_standalone_home),
                shell=True, env=SparkRunner.standalone_env)
            _, status = os.waitpid(start_master_pro.pid, 0)
            if status != 0:
                raise RuntimeError("starting master failed")
            master = "spark://{}:7077".format(node_ip)  # 7077 is the default port
            # Start worker
            if enable_numa_binding:
                worker_script = "start-worker-with-numactl.sh"
                SparkRunner.standalone_env["SPARK_WORKER_INSTANCES"] = str(num_executors)
            else:
                worker_script = "start-worker.sh"
            start_worker_pro = subprocess.Popen(
                "{}/sbin/{} {}".format(zoo_standalone_home, worker_script, master),
                shell=True, env=SparkRunner.standalone_env)
            _, status = os.waitpid(start_worker_pro.pid, 0)
            if status != 0:
                raise RuntimeError("starting worker failed")
        else:  # A Spark standalone cluster has already been started by the user.
            assert master.startswith("spark://"), \
                "Please input a valid master address for your Spark standalone cluster: " \
                "spark://master:port"

        # Start pyspark-shell
        submit_args = "--master " + master
        submit_args = submit_args + gen_submit_args(
            driver_cores, driver_memory, num_executors, executor_cores,
            executor_memory, extra_python_lib, jars)

        conf = enrich_conf_for_spark(conf, driver_cores, driver_memory, num_executors,
                                     executor_cores, executor_memory, extra_executor_memory_for_ray)
        conf.update({
            "spark.cores.max": num_executors * executor_cores,
            "spark.executorEnv.PYTHONHOME": "/".join(detect_python_location().split("/")[:-2])
        })
        zoo_bigdl_jar_path = ":".join(list(get_zoo_bigdl_classpath_on_driver()))
        if "spark.executor.extraClassPath" in conf:
            conf["spark.executor.extraClassPath"] = "{}:{}".format(
                zoo_bigdl_jar_path, conf["spark.executor.extraClassPath"])
        else:
            conf["spark.executor.extraClassPath"] = zoo_bigdl_jar_path

        sc = self.create_sc(submit_args, conf)
        return sc
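
gen_submit_args is not shown in the excerpt. A hypothetical approximation of what such a helper could assemble from the method arguments (the flags are standard spark-submit options; the real helper may differ):

def gen_submit_args_sketch(driver_cores, driver_memory, num_executors,
                           executor_cores, executor_memory,
                           extra_python_lib=None, jars=None):
    args = (" --driver-cores {} --driver-memory {}"
            " --num-executors {} --executor-cores {}"
            " --executor-memory {}").format(
        driver_cores, driver_memory, num_executors, executor_cores, executor_memory)
    if extra_python_lib:
        args += " --py-files {}".format(extra_python_lib)
    if jars:
        args += " --jars {}".format(jars)
    return args

print("--master spark://localhost:7077" + gen_submit_args_sketch(4, "1g", 2, 4, "2g"))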
Example #11
    def info_fn(iter):
        from bigdl.dllib.utils.utils import get_node_ip
        yield get_node_ip()
Example #12

import os
import argparse
import cloudpickle
from bigdl.dllib.utils.utils import get_node_ip

print("Worker on {} with global rank {}".format(get_node_ip(), os.environ.get("PMI_RANK", 0)))

parser = argparse.ArgumentParser()
parser.add_argument('--pkl_path', type=str, default="",
                    help='The directory of the pkl files for mpi training.')
args = parser.parse_args()
pkl_path = args.pkl_path

with open("{}/saved_mpi_estimator.pkl".format(pkl_path), "rb") as f:
    model_creator, optimizer_creator, loss_creator, metrics, \
        scheduler_creator, config, init_func = cloudpickle.load(f)

with open("{}/mpi_train_data.pkl".format(pkl_path), "rb") as f:
    train_data_creator, epochs, batch_size, validation_data_creator,\
        validate_batch_size, train_func, validate_func, train_batches, \
        validate_batches, validate_steps = cloudpickle.load(f)
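
For context (not part of the original file): the pickle this worker loads could be produced on the driver side with cloudpickle. A minimal sketch with placeholder creator functions, mirroring the 7-element tuple unpacked above; the output path and config values are illustrative:

import os
import tempfile
import cloudpickle

def model_creator(config):
    return None  # placeholder

def optimizer_creator(model, config):
    return None  # placeholder

def loss_creator(config):
    return None  # placeholder

state = (model_creator, optimizer_creator, loss_creator,
         [],             # metrics
         None,           # scheduler_creator
         {"lr": 0.01},   # config (hypothetical)
         None)           # init_func

pkl_path = tempfile.gettempdir()  # illustrative; the worker reads --pkl_path
with open(os.path.join(pkl_path, "saved_mpi_estimator.pkl"), "wb") as f:
    cloudpickle.dump(state, f)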
Example #13
    def get_ip(iter):
        yield get_node_ip()