def __init__(self, ray_ctx, verbose=None, start_timeout=None):
    """Create one Horovod worker per Ray node and prepare its Gloo environment.

    Args:
        ray_ctx: Context object; ``ray_node_cpu_cores`` and ``num_ray_nodes``
            are read from it.
        verbose: When truthy the Horovod settings use verbosity 2,
            otherwise 0.
        start_timeout: Value handed to ``timeout.Timeout`` (presumably
            seconds -- confirm against Horovod). When ``None`` it is read
            from the ``HOROVOD_START_TIMEOUT`` environment variable,
            falling back to ``30``.

    Raises:
        RuntimeError: If the workers share no common network interface.
    """
    self.cores_per_node = ray_ctx.ray_node_cpu_cores
    self.num_nodes = ray_ctx.num_ray_nodes
    self.worker_class = make_horovod_worker(self.cores_per_node)
    self.remote_workers = [
        self.worker_class.remote() for _ in range(self.num_nodes)
    ]

    hosts = ray.get(
        [worker.hostname.remote() for worker in self.remote_workers])
    hosts_spec, name_rank_to_id, host_to_size = _hosts_to_hosts_spec(hosts)
    hosts_arg = ",".join(hosts_spec)
    self.host_alloc_plan = _allocate(hosts_arg, self.num_nodes)

    # Keep the server reachable from the runner instead of a throw-away
    # local, so it can be inspected or stopped later.
    self.global_rendezv = RendezvousServer(True)
    global_rendezv_port = self.global_rendezv.start_server(
        self.host_alloc_plan)

    if start_timeout is None:
        start_timeout = int(os.getenv('HOROVOD_START_TIMEOUT', '30'))

    tmout = timeout.Timeout(
        start_timeout,
        message='Timed out waiting for {activity}. Please '
                'check connectivity between servers. You '
                'may need to increase the --start-timeout '
                'parameter if you have too many servers.')

    settings = hvd_settings.Settings(
        verbose=2 if verbose else 0,
        key=secret.make_secret_key(),
        timeout=tmout,
        num_hosts=len(host_to_size),
        num_proc=self.num_nodes,
        hosts=hosts_arg)

    common_intfs = list(_find_common_network_interface(
        host_to_size, name_rank_to_id, self.remote_workers, settings))
    if not common_intfs:
        # Fail with an actionable message rather than a bare IndexError.
        raise RuntimeError(
            "Unable to find a network interface shared by all Horovod "
            "workers; check connectivity between the hosts.")
    iface = common_intfs[0]
    driver_ip = _get_driver_ip([iface])

    common_envs = {
        "HOROVOD_GLOO_RENDEZVOUS_ADDR": driver_ip,
        "HOROVOD_GLOO_RENDEZVOUS_PORT": str(global_rendezv_port),
        "HOROVOD_CONTROLLER": "gloo",
        "HOROVOD_CPU_OPERATIONS": "gloo",
        "HOROVOD_GLOO_IFACE": iface,
        "PYTHONUNBUFFERED": '1',
    }

    # HOROVOD* variables set on the driver override the defaults above.
    common_envs.update({
        name: value for name, value in os.environ.items()
        if name.startswith("HOROVOD")
    })

    # todo support other Horovod envs
    self.per_worker_envs = [
        common_envs.copy() for _ in range(self.num_nodes)
    ]

    for alloc_info in self.host_alloc_plan:
        # Map the (hostname, local_rank) slot back to the worker index.
        key = (alloc_info.hostname, alloc_info.local_rank)
        local_envs = self.per_worker_envs[name_rank_to_id[key]]
        local_envs["HOROVOD_RANK"] = str(alloc_info.rank)
        local_envs["HOROVOD_SIZE"] = str(alloc_info.size)
        local_envs["HOROVOD_LOCAL_RANK"] = str(alloc_info.local_rank)
        local_envs["HOROVOD_LOCAL_SIZE"] = str(alloc_info.local_size)
        local_envs["HOROVOD_CROSS_RANK"] = str(alloc_info.cross_rank)
        local_envs["HOROVOD_CROSS_SIZE"] = str(alloc_info.cross_size)
def __init__(self, ray_ctx, worker_cls=None, worker_param=None,
             workers_per_node=1):
    """Launch Horovod workers as Ray actors and prepare their Gloo environments.

    Args:
        ray_ctx: Context object; ``ray_node_cpu_cores`` and ``num_ray_nodes``
            are read from it.
        worker_cls: Passed to ``make_worker`` together with
            ``HorovodWorker`` to obtain the actor class.
        worker_param: Keyword arguments forwarded to every worker
            constructor; ``None`` means no arguments.
        workers_per_node: Number of workers started per Ray node. The node's
            CPU cores are divided evenly (integer division) among them.

    Raises:
        ValueError: If ``workers_per_node`` is smaller than 1.
    """
    if workers_per_node < 1:
        # Otherwise a zero triggers ZeroDivisionError and a negative value
        # produces a negative worker count further down.
        raise ValueError(
            "workers_per_node must be a positive integer, "
            "but got {}".format(workers_per_node))

    from horovod.run.gloo_run import RendezvousServer, _allocate

    self.cores_per_node = ray_ctx.ray_node_cpu_cores // workers_per_node
    self.num_nodes = ray_ctx.num_ray_nodes * workers_per_node
    if worker_param is None:
        worker_param = {}

    worker_cls = make_worker(worker_cls, HorovodWorker)
    self.worker_class = ray.remote(
        num_cpus=self.cores_per_node)(worker_cls)
    self.remote_workers = [
        self.worker_class.remote(**worker_param)
        for _ in range(self.num_nodes)
    ]

    hosts = ray.get(
        [worker.ip_addr.remote() for worker in self.remote_workers])
    hosts_spec, name_rank_to_id, host_to_size = _hosts_to_hosts_spec(hosts)
    self.host_alloc_plan = _allocate(",".join(hosts_spec), self.num_nodes)

    # Keep the server reachable from the runner instead of a throw-away
    # local, so it can be inspected or stopped later.
    self.global_rendezv = RendezvousServer(True)
    global_rendezv_port = self.global_rendezv.start_server(
        self.host_alloc_plan)
    driver_ip = ray.services.get_node_ip_address()

    common_envs = {
        "HOROVOD_GLOO_RENDEZVOUS_ADDR": driver_ip,
        "HOROVOD_GLOO_RENDEZVOUS_PORT": str(global_rendezv_port),
        "HOROVOD_CONTROLLER": "gloo",
        "HOROVOD_CPU_OPERATIONS": "gloo",
        "PYTHONUNBUFFERED": '1',
        "OMP_NUM_THREADS": str(self.cores_per_node)
    }

    # HOROVOD* variables set on the driver override the defaults above.
    common_envs.update({
        name: value for name, value in os.environ.items()
        if name.startswith("HOROVOD")
    })

    # todo support other Horovod envs
    self.per_worker_envs = [
        common_envs.copy() for _ in range(self.num_nodes)
    ]

    for alloc_info in self.host_alloc_plan:
        # Map the (hostname, local_rank) slot back to the worker index.
        key = (alloc_info.hostname, alloc_info.local_rank)
        local_envs = self.per_worker_envs[name_rank_to_id[key]]
        local_envs["HOROVOD_RANK"] = str(alloc_info.rank)
        local_envs["HOROVOD_SIZE"] = str(alloc_info.size)
        local_envs["HOROVOD_LOCAL_RANK"] = str(alloc_info.local_rank)
        local_envs["HOROVOD_LOCAL_SIZE"] = str(alloc_info.local_size)
        local_envs["HOROVOD_CROSS_RANK"] = str(alloc_info.cross_rank)
        local_envs["HOROVOD_CROSS_SIZE"] = str(alloc_info.cross_size)

    ray.get(
        [worker.set_gloo_iface.remote() for worker in self.remote_workers])
    # Run a trivial function on every worker once the environments are set.
    self.run(lambda: print("horovod worker initialized"))
class HorovodRayRunner:
    """Run functions on a group of Horovod workers hosted as Ray actors.

    The constructor creates the actors, starts a Gloo rendezvous server on
    the driver and computes one environment-variable dict per worker;
    ``run`` then executes a function on every worker with that environment.
    """

    # todo check whether horovod is built with gloo
    def __init__(self, ray_ctx, worker_cls=None, worker_param=None,
                 workers_per_node=1):
        """Create the workers and prepare their Horovod/Gloo environments.

        Args:
            ray_ctx: Context object; ``ray_node_cpu_cores`` and
                ``num_ray_nodes`` are read from it.
            worker_cls: Passed to ``make_worker`` together with
                ``HorovodWorker`` to obtain the actor class.
            worker_param: Keyword arguments forwarded to every worker
                constructor; ``None`` means no arguments.
            workers_per_node: Number of workers started per Ray node. The
                node's CPU cores are divided evenly (integer division)
                among them.

        Raises:
            ValueError: If ``workers_per_node`` is smaller than 1.
            RuntimeError: If the installed Horovod is older than 0.19.
        """
        if workers_per_node < 1:
            # Otherwise a zero triggers ZeroDivisionError and a negative
            # value produces a negative worker count further down.
            raise ValueError(
                "workers_per_node must be a positive integer, "
                "but got {}".format(workers_per_node))

        self.cores_per_node = ray_ctx.ray_node_cpu_cores // workers_per_node
        self.num_nodes = ray_ctx.num_ray_nodes * workers_per_node
        if worker_param is None:
            worker_param = {}

        worker_cls = make_worker(worker_cls, HorovodWorker)
        self.worker_class = ray.remote(
            num_cpus=self.cores_per_node)(worker_cls)
        self.remote_workers = [
            self.worker_class.remote(**worker_param)
            for _ in range(self.num_nodes)
        ]

        hosts = ray.get(
            [worker.ip_addr.remote() for worker in self.remote_workers])
        hosts_spec, name_rank_to_id, host_to_size = _hosts_to_hosts_spec(hosts)

        global_rendezv_port = self._start_rendezvous(hosts_spec)
        driver_ip = ray._private.services.get_node_ip_address()

        common_envs = {
            "HOROVOD_GLOO_RENDEZVOUS_ADDR": driver_ip,
            "HOROVOD_GLOO_RENDEZVOUS_PORT": str(global_rendezv_port),
            "HOROVOD_CONTROLLER": "gloo",
            "HOROVOD_CPU_OPERATIONS": "gloo",
            "PYTHONUNBUFFERED": '1',
            "OMP_NUM_THREADS": str(self.cores_per_node)
        }

        # HOROVOD* variables set on the driver override the defaults above.
        common_envs.update({
            name: value for name, value in os.environ.items()
            if name.startswith("HOROVOD")
        })

        # todo support other Horovod envs
        self.per_worker_envs = self._build_worker_envs(
            common_envs, self.host_alloc_plan, name_rank_to_id,
            self.num_nodes)

        ray.get(
            [worker.set_gloo_iface.remote() for worker in self.remote_workers])
        # Run a trivial function on every worker once the environments
        # are set.
        self.run(lambda: print("horovod worker initialized"))

    def _start_rendezvous(self, hosts_spec):
        """Start the rendezvous server for the installed Horovod; return its port.

        Sets ``self.host_alloc_plan`` and ``self.global_rendezv``.
        """
        major, minor, _patch, version_str = get_horovod_version()
        if major == 0 and minor < 19:
            raise RuntimeError("We only support horovod versions newer "
                               f"than 0.19.0, but got {version_str}")

        hosts_arg = ",".join(hosts_spec)
        if major == 0 and minor == 19:
            # 0.19 exposes the launcher under ``horovod.run``.
            from horovod.run.gloo_run import RendezvousServer, _allocate
            self.host_alloc_plan = _allocate(hosts_arg, self.num_nodes)
            self.global_rendezv = RendezvousServer(True)
            return self.global_rendezv.start_server(self.host_alloc_plan)

        # Later versions expose it under ``horovod.runner`` and split the
        # server start-up into ``start`` followed by ``init``.
        from horovod.runner.gloo_run import (
            RendezvousServer, parse_hosts, get_host_assignments)
        self.host_alloc_plan = get_host_assignments(
            parse_hosts(hosts_arg), self.num_nodes)
        self.global_rendezv = RendezvousServer(True)
        port = self.global_rendezv.start()
        self.global_rendezv.init(self.host_alloc_plan)
        return port

    @staticmethod
    def _build_worker_envs(common_envs, host_alloc_plan, name_rank_to_id,
                           num_workers):
        """Return one env dict per worker: ``common_envs`` plus its rank info."""
        envs = [common_envs.copy() for _ in range(num_workers)]
        for alloc_info in host_alloc_plan:
            # Map the (hostname, local_rank) slot back to the worker index.
            key = (alloc_info.hostname, alloc_info.local_rank)
            envs[name_rank_to_id[key]].update({
                "HOROVOD_HOSTNAME": str(alloc_info.hostname),
                "HOROVOD_RANK": str(alloc_info.rank),
                "HOROVOD_SIZE": str(alloc_info.size),
                "HOROVOD_LOCAL_RANK": str(alloc_info.local_rank),
                "HOROVOD_LOCAL_SIZE": str(alloc_info.local_size),
                "HOROVOD_CROSS_RANK": str(alloc_info.cross_rank),
                "HOROVOD_CROSS_SIZE": str(alloc_info.cross_size),
            })
        return envs

    def run(self, func):
        """Call ``func`` on every worker with its environment; return all results.

        Results are returned as a list in the order of ``self.remote_workers``.
        """
        return ray.get([
            worker.run.remote(worker_env, func)
            for worker, worker_env in zip(self.remote_workers,
                                          self.per_worker_envs)
        ])