def clean_up():
    """Kill the named store actors for a fixed set of communicator keys.

    For each hard-coded group name, keys are built for three device lists
    (collective communicators) and for every ordered rank pair ``src < dst``
    below ``max_world_size`` (send/recv communicators). Each key is suffixed
    with ``"@" + name``, mapped to a store name, and the actor registered
    under that name is killed if the lookup succeeds.
    """
    base_names = ["default", "test", "123?34!", "default2", "random"]
    base_names += [str(n) for n in range(10)]
    max_world_size = 4

    keys = []
    for name in base_names:
        # Collective communicator keys, one per device list.
        for device_list in ([0], [0, 1], [1, 0]):
            comm_key = _get_comm_key_from_devices(device_list)
            keys.append(comm_key + "@" + name)
        # Point-to-point keys for every pair with src < dst.
        for src in range(max_world_size):
            for dst in range(src + 1, max_world_size):
                p2p_key = _get_comm_key_send_recv(src, 0, dst, 0)
                keys.append(p2p_key + "@" + name)

    for group_key in keys:
        store_name = get_nccl_store_name(group_key)
        try:
            handle = ray.get_actor(store_name)
        except ValueError:
            # Lookup failed for this name; nothing to kill.
            continue
        if not handle:
            continue
        logger.debug("Killing actor with group_key: '{}' and store: '{}'."
                     .format(group_key, store_name))
        ray.kill(handle)
def meet(self, timeout_s=180):
    """Meet at the named actor store.

    Resolves the store name from ``self._group_name`` and repeatedly looks
    up the actor registered under that name, sleeping one second between
    attempts, until the lookup succeeds or ``timeout_s`` seconds elapse.
    At least one lookup is always attempted.

    Args:
        timeout_s: timeout in seconds; must be positive.

    Returns:
        None. On success the actor handle is stored in ``self._store``.

    Raises:
        ValueError: if ``timeout_s`` is not positive.
        RuntimeError: if the store could not be found before the timeout.
    """
    if timeout_s <= 0:
        raise ValueError("The 'timeout' argument must be positive. "
                         "Got '{}'.".format(timeout_s))
    self._store_name = get_nccl_store_name(self._group_name)

    # Use a monotonic clock: wall-clock time can jump (NTP, manual changes),
    # which would shorten or extend the effective timeout.
    start_time = time.monotonic()
    # Track this call's result in a local so that a handle left over from an
    # earlier call cannot mask a timeout, and so the failure check does not
    # depend on ``self._store`` having been initialised elsewhere.
    store = None
    while True:
        try:
            logger.debug("Trying to meet at the store '{}'".format(
                self._store_name))
            store = ray.get_actor(self._store_name)
        except ValueError:
            logger.debug("Failed to meet at the store '{}'."
                         "Trying again...".format(self._store_name))
            time.sleep(1)
            if time.monotonic() - start_time >= timeout_s:
                break
            continue
        logger.debug("Successful rendezvous!")
        break

    if not store:
        raise RuntimeError("Unable to meet other processes "
                           "at the rendezvous store.")
    self._store = store
def create_collective_group(self, backend, world_size, rank, group_name):
    """The entry to create new collective groups in the manager.

    Puts the registration and the group information into the manager
    metadata (``self._name_group_map`` and ``self._group_name_map``) as
    well.

    Args:
        backend: value convertible to ``types.Backend``.
        world_size: passed through to ``NCCLGroup``.
        rank: passed through to ``NCCLGroup``; rank 0 additionally creates
            the named store actor and publishes the NCCL unique ID in it.
        group_name: name under which the group is registered.

    Returns:
        The group object registered under ``group_name``.

    Raises:
        NotImplementedError: if the backend is MPI.
    """
    backend = types.Backend(backend)
    if backend == types.Backend.MPI:
        raise NotImplementedError()
    elif backend == types.Backend.NCCL:
        # create the ncclUniqueID
        if rank == 0:
            # availability has been checked before entering here.
            group_uid = nccl_util.get_nccl_unique_id()
            store_name = get_nccl_store_name(group_name)
            # Avoid a potential circular dependency in ray/actor.py
            from ray.util.collective.util import NCCLUniqueIDStore
            store = NCCLUniqueIDStore.options(
                name=store_name, lifetime="detached").remote(store_name)
            # ray.get (rather than ray.wait) so that a failure while
            # storing the ID is raised here instead of being silently
            # dropped, which would leave the other ranks without an ID.
            ray.get([store.set_id.remote(group_uid)])

        logger.debug("creating NCCL group: '{}'".format(group_name))
        g = NCCLGroup(world_size, rank, group_name)
        self._name_group_map[group_name] = g
        self._group_name_map[g] = group_name
    # NOTE(review): for any other backend value nothing is registered above,
    # so this lookup only succeeds if the name was registered earlier.
    return self._name_group_map[group_name]
def clean_up():
    """Kill the named store actor of each hard-coded group name, if found.

    A ``ValueError`` raised while resolving or looking up a store is treated
    as "no actor for this name" and the name is skipped.
    """
    names = ["default", "test", "123?34!", "default2", "random"]
    names += [str(n) for n in range(10)]

    for group_name in names:
        try:
            handle = ray.get_actor(get_nccl_store_name(group_name))
        except ValueError:
            continue
        if handle:
            ray.kill(handle)
def _destroy_store(group_key):
    """Kill the Ray named actor that serves as the KV store for a group key.

    The store name is derived from ``group_key`` via
    ``get_nccl_store_name``; the actor registered under that name is looked
    up and killed. Any error from the lookup propagates to the caller.

    Args:
        group_key (str): the unique key used to retrieve the KV store.

    Returns:
        None
    """
    store_handle = ray.get_actor(get_nccl_store_name(group_key))
    ray.kill(store_handle)
def clean_up():
    """Kill the store actors for the hard-coded groups and their p2p groups.

    Besides the base group names, a ``"<name>_<i>_<j>"`` name is generated
    for every pair ``i <= j`` below ``max_world_size``. Each resulting name
    is mapped to a store name and the actor found under it, if any, is
    killed.
    """
    base_names = ["default", "test", "123?34!", "default2", "random"]
    base_names += [str(n) for n in range(10)]
    max_world_size = 4

    # Same ordering as nested loops over name, i, j with i <= j.
    p2p_names = [
        name + "_" + str(lo) + "_" + str(hi)
        for name in base_names
        for lo in range(max_world_size)
        for hi in range(lo, max_world_size)
    ]

    for group_name in base_names + p2p_names:
        store_name = get_nccl_store_name(group_name)
        try:
            handle = ray.get_actor(store_name)
        except ValueError:
            # No actor could be looked up under this store name.
            continue
        if handle:
            ray.kill(handle)
def _generate_nccl_uid(self, key):
    """Generate an NCCL unique ID and publish it through a named actor.

    A fresh unique ID is obtained from ``nccl_util``, a detached
    ``NCCLUniqueIDStore`` actor is created under the store name derived
    from ``key``, and the ID is written into it. The call blocks until the
    write has completed. Because the actor is created with
    ``lifetime="detached"``, it has to be removed explicitly when the
    collective group is destroyed.

    Args:
        key (str): the key from which the store name is derived.

    Returns:
        The NCCL unique ID that was generated and stored.
    """
    uid = nccl_util.get_nccl_unique_id()
    name_of_store = get_nccl_store_name(key)

    # Avoid a potential circular dependency in ray/actor.py
    from ray.util.collective.util import NCCLUniqueIDStore

    detached_store_cls = NCCLUniqueIDStore.options(
        name=name_of_store, lifetime="detached")
    store_actor = detached_store_cls.remote(name_of_store)
    pending_write = store_actor.set_id.remote(uid)
    ray.get([pending_write])
    return uid
def destroy_collective_group(self, group_name):
    """Group destructor.

    Removes the group from the manager's bookkeeping dicts, releases the
    named store actor (NCCL backend, rank 0 only), and destroys the group's
    communicator resources. If the group is not registered, a warning is
    logged and nothing else happens.

    Args:
        group_name: name of the group to destroy.

    Returns:
        None
    """
    if not self.is_group_exist(group_name):
        logger.warning("The group '{}' does not exist.".format(group_name))
        return

    # release the collective group resource
    g = self._name_group_map[group_name]
    rank = g.rank
    backend = g.backend()

    # clean up the dicts
    del self._group_name_map[g]
    del self._name_group_map[group_name]

    try:
        if backend == types.Backend.NCCL:
            # release the named actor
            if rank == 0:
                store_name = get_nccl_store_name(group_name)
                store = ray.get_actor(store_name)
                ray.wait([store.__ray_terminate__.remote()])
                ray.kill(store)
    finally:
        # Release the communicator resources. This must run even when
        # releasing the store fails: the group has already been removed
        # from the dicts above, so it could not be found and destroyed by
        # a later call.
        g.destroy_group()