def on_start(self, worker_group: WorkerGroup, backend_config: HorovodConfig):
    """Prepare every worker's environment for a Horovod run.

    The setup happens in four stages:

    1. ``init_env_vars`` is run on each worker with its rank, the total
       number of workers and the id of the node it lives on.
    2. A ``Coordinator`` is created from ``backend_config``; every worker's
       hostname, node id and rank are registered with it, and the per-rank
       environment returned by ``finalize_registration`` is pushed to the
       matching worker through ``update_env_vars``.
    3. The coordinator establishes the rendezvous and returns the
       environment variables describing it.
    4. One worker per distinct node is used to detect network interfaces;
       the resulting variables are merged into the rendezvous environment
       and applied on all workers.

    Args:
        worker_group: The workers to configure.
        backend_config: Horovod settings, handed to ``Coordinator`` and to
            ``detect_nics``.
    """
    # TODO(matt): Implement placement group strategies in BackendExecutor.

    world_size = len(worker_group)

    # Initialize workers with Horovod environment variables, and block
    # until every worker is done so that failures surface here.
    init_refs = [
        worker_group.execute_single_async(
            rank,
            init_env_vars,
            rank,
            world_size,
            worker_group.workers[rank].metadata.node_id,
        )
        for rank in range(world_size)
    ]
    ray.get(init_refs)

    # Use the Horovod Ray Coordinator, with backend_config as its settings.
    self.coordinator = Coordinator(backend_config)

    # Register each worker with the coordinator. The position of a worker
    # in ``worker_group.workers`` is used as its rank.
    node_ids = []
    for rank, worker in enumerate(worker_group.workers):
        metadata = worker.metadata
        node_ids.append(metadata.node_id)
        self.coordinator.register(metadata.hostname, metadata.node_id, rank)
    env_vars_by_rank = self.coordinator.finalize_registration()

    # Hand each rank the environment variables the coordinator computed
    # for it.
    update_refs = [
        worker_group.execute_single_async(rank, update_env_vars, rank_env_vars)
        for rank, rank_env_vars in env_vars_by_rank.items()
    ]
    ray.get(update_refs)

    coordinator_envs = self.coordinator.establish_rendezvous()

    # Get one worker from each host/node: the first worker found on it.
    node_workers = [
        HorovodWorkerWrapper(worker_group.workers[node_ids.index(node_id)])
        for node_id in set(node_ids)
    ]
    assert len(node_workers) == len(self.coordinator.hostnames)

    nics = detect_nics(
        backend_config,
        all_host_names=list(self.coordinator.hostnames),
        node_workers=node_workers,
    )
    coordinator_envs.update(nics_to_env_var(nics))

    # Apply the rendezvous + NIC environment on every worker.
    worker_group.execute(update_env_vars, coordinator_envs)
def on_start(self, worker_group: WorkerGroup, backend_config: TorchConfig):
    """Set up the torch distributed process group on every worker.

    The communication backend is ``backend_config.backend`` when it is set;
    otherwise ``"nccl"`` is used if the workers have GPUs and ``"gloo"`` if
    they do not. Worker 0 supplies the master address and port through
    ``get_address_and_port``. Each worker then runs
    ``setup_torch_process_group`` with its index as the world rank.

    Args:
        worker_group: The workers to configure.
        backend_config: Supplies ``backend``, ``init_method`` and
            ``timeout_s``.

    Raises:
        RuntimeError: If ``dist.is_available()`` returns False.
        ValueError: If ``backend_config.init_method`` is neither ``"env"``
            nor ``"tcp"``.
    """
    if not dist.is_available():
        raise RuntimeError("Distributed torch is not available.")

    # Set the appropriate training backend.
    backend = backend_config.backend
    if backend is None:
        backend = "nccl" if worker_group.num_gpus_per_worker > 0 else "gloo"

    master_addr, master_port = worker_group.execute_single(0, get_address_and_port)

    init_method = backend_config.init_method
    if init_method == "env":

        def set_env_vars(addr, port):
            os.environ["MASTER_ADDR"] = addr
            os.environ["MASTER_PORT"] = str(port)

        # With env:// the master location is read from the environment of
        # each worker process, so export it everywhere first.
        worker_group.execute(set_env_vars, addr=master_addr, port=master_port)
        url = "env://"
    elif init_method == "tcp":
        url = f"tcp://{master_addr}:{master_port}"
    else:
        raise ValueError(
            f"The provided init_method ("
            f"{backend_config.init_method}) is not supported. Must "
            f"be either 'env' or 'tcp'."
        )

    world_size = len(worker_group)
    # Launch setup on all workers at once, then wait so that any failure
    # is raised here.
    pending = [
        worker_group.execute_single_async(
            rank,
            setup_torch_process_group,
            backend=backend,
            world_rank=rank,
            world_size=world_size,
            init_method=url,
            timeout_s=backend_config.timeout_s,
        )
        for rank in range(world_size)
    ]
    ray.get(pending)
def on_start(self, worker_group: WorkerGroup, backend_config: TensorflowConfig):
    """Configure the distributed TensorFlow environment on every worker.

    Every worker reports an ``address:port`` string for itself. Each worker
    then runs ``setup_tensorflow_environment`` with the collected strings
    and its own index.

    Args:
        worker_group: The workers to configure.
        backend_config: Not read by this method.
    """

    # Compute URL for initializing distributed setup.
    def get_url():
        host, port = get_address_and_port()
        return f"{host}:{port}"

    worker_urls = worker_group.execute(get_url)

    # Collect the setup tasks and wait on them in order to throw errors on
    # failure.
    pending = [
        worker_group.execute_single_async(
            index,
            setup_tensorflow_environment,
            worker_addresses=worker_urls,
            index=index,
        )
        for index in range(len(worker_group))
    ]
    ray.get(pending)