示例#1
0
    def __init__(self, ray_ctx, verbose=None, start_timeout=None):
        """Create one Horovod worker actor per Ray node and build its Gloo env.

        The constructor launches the remote workers, collects their hostnames,
        starts a rendezvous server on the driver, selects a network interface
        returned by ``_find_common_network_interface`` and prepares one
        environment dict per worker in ``self.per_worker_envs``.

        Args:
            ray_ctx: Context object providing ``ray_node_cpu_cores`` and
                ``num_ray_nodes``.
            verbose: When truthy the Horovod settings are created with
                ``verbose=2``; otherwise with ``verbose=0``.
            start_timeout: Value handed to ``timeout.Timeout`` (presumably
                seconds -- TODO confirm against Horovod). When ``None`` it is
                read from the ``HOROVOD_START_TIMEOUT`` environment variable,
                falling back to ``30``.
        """
        self.cores_per_node = ray_ctx.ray_node_cpu_cores
        self.num_nodes = ray_ctx.num_ray_nodes
        self.worker_class = make_horovod_worker(self.cores_per_node)
        self.remote_workers = [self.worker_class.remote() for _ in range(self.num_nodes)]

        hosts = ray.get([worker.hostname.remote() for worker in self.remote_workers])
        hosts_spec, name_rank_to_id, host_to_size = _hosts_to_hosts_spec(hosts)
        joined_hosts = ",".join(hosts_spec)
        self.host_alloc_plan = _allocate(joined_hosts, self.num_nodes)
        # Keep the server on the instance instead of in a local: otherwise the
        # handle is lost when __init__ returns and the server cannot be stopped.
        self.global_rendezv = RendezvousServer(True)
        global_rendezv_port = self.global_rendezv.start_server(self.host_alloc_plan)

        if start_timeout is None:
            start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

        tmout = timeout.Timeout(start_timeout,
                                message='Timed out waiting for {activity}. Please '
                                        'check connectivity between servers. You '
                                        'may need to increase the --start-timeout '
                                        'parameter if you have too many servers.')

        all_host_names = list(host_to_size)

        settings = hvd_settings.Settings(verbose=2 if verbose else 0,
                                         key=secret.make_secret_key(),
                                         timeout=tmout,
                                         num_hosts=len(all_host_names),
                                         num_proc=self.num_nodes,
                                         hosts=joined_hosts)

        common_intfs = _find_common_network_interface(host_to_size, name_rank_to_id,
                                                      self.remote_workers, settings)
        # Use the first interface reported as common to all workers.
        iface = list(common_intfs)[0]
        driver_ip = _get_driver_ip([iface])

        common_envs = {
            "HOROVOD_GLOO_RENDEZVOUS_ADDR": driver_ip,
            "HOROVOD_GLOO_RENDEZVOUS_PORT": str(global_rendezv_port),
            "HOROVOD_CONTROLLER": "gloo",
            "HOROVOD_CPU_OPERATIONS": "gloo",
            "HOROVOD_GLOO_IFACE": iface,
            "PYTHONUNBUFFERED": '1',
        }

        # Forward every HOROVOD* variable set on the driver; because this runs
        # after the defaults above are filled in, driver values take precedence.
        for key, value in os.environ.items():
            if key.startswith("HOROVOD"):
                common_envs[key] = value

        # todo support other Horovod envs
        self.per_worker_envs = [common_envs.copy() for _ in range(self.num_nodes)]
        for alloc_info in self.host_alloc_plan:
            # (hostname, local_rank) identifies the worker the slot belongs to.
            key = (alloc_info.hostname, alloc_info.local_rank)
            local_envs = self.per_worker_envs[name_rank_to_id[key]]
            local_envs["HOROVOD_RANK"] = str(alloc_info.rank)
            local_envs["HOROVOD_SIZE"] = str(alloc_info.size)
            local_envs["HOROVOD_LOCAL_RANK"] = str(alloc_info.local_rank)
            local_envs["HOROVOD_LOCAL_SIZE"] = str(alloc_info.local_size)
            local_envs["HOROVOD_CROSS_RANK"] = str(alloc_info.cross_rank)
            local_envs["HOROVOD_CROSS_SIZE"] = str(alloc_info.cross_size)
示例#2
0
    def __init__(self,
                 ray_ctx,
                 worker_cls=None,
                 worker_param=None,
                 workers_per_node=1):
        """Create the Horovod worker actors on Ray and prepare their Gloo env.

        The constructor creates ``num_ray_nodes * workers_per_node`` remote
        workers, starts a rendezvous server on the driver, builds one
        environment dict per worker in ``self.per_worker_envs``, asks every
        worker to set its Gloo interface and finally runs a small callable on
        all workers.

        Args:
            ray_ctx: Context object providing ``ray_node_cpu_cores`` and
                ``num_ray_nodes``.
            worker_cls: Passed to ``make_worker`` together with
                ``HorovodWorker`` to obtain the actor class.
            worker_param: Keyword arguments forwarded to every worker
                constructor; ``None`` means no arguments.
            workers_per_node: Number of workers created per Ray node. The CPU
                cores of a node are split between them with floor division.
        """
        from horovod.run.gloo_run import RendezvousServer, _allocate

        self.cores_per_node = ray_ctx.ray_node_cpu_cores // workers_per_node
        self.num_nodes = ray_ctx.num_ray_nodes * workers_per_node
        if worker_param is None:
            worker_param = {}
        worker_cls = make_worker(worker_cls, HorovodWorker)
        self.worker_class = ray.remote(
            num_cpus=self.cores_per_node)(worker_cls)
        self.remote_workers = [
            self.worker_class.remote(**worker_param)
            for _ in range(self.num_nodes)
        ]
        hosts = ray.get(
            [worker.ip_addr.remote() for worker in self.remote_workers])
        hosts_spec, name_rank_to_id, host_to_size = _hosts_to_hosts_spec(hosts)
        self.host_alloc_plan = _allocate(",".join(hosts_spec), self.num_nodes)
        # Keep the server on the instance instead of in a local: otherwise the
        # handle is lost when __init__ returns and the server cannot be stopped.
        self.global_rendezv = RendezvousServer(True)
        global_rendezv_port = self.global_rendezv.start_server(
            self.host_alloc_plan)

        driver_ip = ray.services.get_node_ip_address()

        common_envs = {
            "HOROVOD_GLOO_RENDEZVOUS_ADDR": driver_ip,
            "HOROVOD_GLOO_RENDEZVOUS_PORT": str(global_rendezv_port),
            "HOROVOD_CONTROLLER": "gloo",
            "HOROVOD_CPU_OPERATIONS": "gloo",
            "PYTHONUNBUFFERED": '1',
            "OMP_NUM_THREADS": str(self.cores_per_node)
        }

        # Forward every HOROVOD* variable set on the driver; because this runs
        # after the defaults above are filled in, driver values take precedence.
        for key, value in os.environ.items():
            if key.startswith("HOROVOD"):
                common_envs[key] = value

        # todo support other Horovod envs
        self.per_worker_envs = [
            common_envs.copy() for _ in range(self.num_nodes)
        ]
        for alloc_info in self.host_alloc_plan:
            # (hostname, local_rank) identifies the worker the slot belongs to.
            key = (alloc_info.hostname, alloc_info.local_rank)
            local_envs = self.per_worker_envs[name_rank_to_id[key]]
            local_envs["HOROVOD_RANK"] = str(alloc_info.rank)
            local_envs["HOROVOD_SIZE"] = str(alloc_info.size)
            local_envs["HOROVOD_LOCAL_RANK"] = str(alloc_info.local_rank)
            local_envs["HOROVOD_LOCAL_SIZE"] = str(alloc_info.local_size)
            local_envs["HOROVOD_CROSS_RANK"] = str(alloc_info.cross_rank)
            local_envs["HOROVOD_CROSS_SIZE"] = str(alloc_info.cross_size)

        ray.get(
            [worker.set_gloo_iface.remote() for worker in self.remote_workers])
        self.run(lambda: print("horovod worker initialized"))
示例#3
0
class HorovodRayRunner:
    """Run callables on a group of Horovod worker actors hosted by Ray.

    Building a runner creates the worker actors, starts a rendezvous server on
    the driver and prepares one environment dict per worker; ``run`` then
    invokes a callable on every worker together with that worker's env.
    """

    # todo check whether horovod is built with gloo
    def __init__(self, ray_ctx, worker_cls=None, worker_param=None,
                 workers_per_node=1):
        """Create the workers, start the rendezvous server and build the envs.

        Args:
            ray_ctx: Context object providing ``ray_node_cpu_cores`` and
                ``num_ray_nodes``.
            worker_cls: Passed to ``make_worker`` together with
                ``HorovodWorker`` to obtain the actor class.
            worker_param: Keyword arguments forwarded to every worker
                constructor; ``None`` means no arguments.
            workers_per_node: Number of workers created per Ray node. The CPU
                cores of a node are split between them with floor division.

        Raises:
            RuntimeError: If the detected Horovod version is 0.x with a minor
                version below 19.
        """
        self.cores_per_node = ray_ctx.ray_node_cpu_cores // workers_per_node
        self.num_nodes = ray_ctx.num_ray_nodes * workers_per_node
        actor_kwargs = {} if worker_param is None else worker_param

        actor_cls = make_worker(worker_cls, HorovodWorker)
        self.worker_class = ray.remote(num_cpus=self.cores_per_node)(actor_cls)
        self.remote_workers = [
            self.worker_class.remote(**actor_kwargs)
            for _ in range(self.num_nodes)
        ]

        ip_requests = [actor.ip_addr.remote() for actor in self.remote_workers]
        worker_ips = ray.get(ip_requests)
        hosts_spec, name_rank_to_id, _ = _hosts_to_hosts_spec(worker_ips)

        major, minor, _patch, version_str = get_horovod_version()

        if major == 0 and minor < 19:
            raise RuntimeError(f"We only support horovod versions newer "
                               f"than 0.19.0, but got {version_str}")

        if (major, minor) == (0, 19):
            # Horovod 0.19 keeps the launcher under ``horovod.run`` and starts
            # the server with the allocation plan in a single call.
            from horovod.run.gloo_run import RendezvousServer, _allocate
            self.host_alloc_plan = _allocate(",".join(hosts_spec),
                                             self.num_nodes)
            self.global_rendezv = RendezvousServer(True)
            rendezvous_port = self.global_rendezv.start_server(
                self.host_alloc_plan)
        else:
            # Other versions use ``horovod.runner``: the server is started
            # first and then initialised with the allocation plan.
            from horovod.runner.gloo_run import RendezvousServer, parse_hosts, get_host_assignments
            parsed_hosts = parse_hosts(",".join(hosts_spec))
            self.host_alloc_plan = get_host_assignments(parsed_hosts,
                                                        self.num_nodes)
            self.global_rendezv = RendezvousServer(True)
            rendezvous_port = self.global_rendezv.start()
            self.global_rendezv.init(self.host_alloc_plan)

        driver_ip = ray._private.services.get_node_ip_address()

        shared_env = {
            "HOROVOD_GLOO_RENDEZVOUS_ADDR": driver_ip,
            "HOROVOD_GLOO_RENDEZVOUS_PORT": str(rendezvous_port),
            "HOROVOD_CONTROLLER": "gloo",
            "HOROVOD_CPU_OPERATIONS": "gloo",
            "PYTHONUNBUFFERED": '1',
            "OMP_NUM_THREADS": str(self.cores_per_node)
        }

        # HOROVOD* variables set on the driver are copied last, so they
        # replace the defaults filled in above.
        shared_env.update(
            (name, value) for name, value in os.environ.items()
            if name.startswith("HOROVOD"))

        # todo support other Horovod envs
        self.per_worker_envs = [
            dict(shared_env) for _ in range(self.num_nodes)
        ]
        for slot in self.host_alloc_plan:
            # (hostname, local_rank) identifies the worker the slot belongs to.
            worker_id = name_rank_to_id[(slot.hostname, slot.local_rank)]
            self.per_worker_envs[worker_id].update({
                "HOROVOD_HOSTNAME": str(slot.hostname),
                "HOROVOD_RANK": str(slot.rank),
                "HOROVOD_SIZE": str(slot.size),
                "HOROVOD_LOCAL_RANK": str(slot.local_rank),
                "HOROVOD_LOCAL_SIZE": str(slot.local_size),
                "HOROVOD_CROSS_RANK": str(slot.cross_rank),
                "HOROVOD_CROSS_SIZE": str(slot.cross_size),
            })

        iface_requests = [
            actor.set_gloo_iface.remote() for actor in self.remote_workers
        ]
        ray.get(iface_requests)
        self.run(lambda: print("horovod worker initialized"))

    def run(self, func):
        """Invoke ``func`` on every worker and return the gathered results.

        Each worker's ``run`` is called remotely with that worker's entry of
        ``self.per_worker_envs`` and ``func``; the results are collected with
        ``ray.get`` in worker order.
        """
        pending = []
        for idx in range(self.num_nodes):
            worker = self.remote_workers[idx]
            pending.append(worker.run.remote(self.per_worker_envs[idx], func))
        return ray.get(pending)