def from_dict(pg_dict: dict) -> "PlacementGroup":
    """Instantiate and return a PlacementGroup from its json-serializable dict
    representation.

    Used by Ray Client on server-side to deserialize placement group
    option. See decode_options in util/client/server/server.py.

    Args:
        pg_dict: Dictionary representing a placement group. It must have
            exactly the keys ``"id"`` (a hex string) and ``"bundle_cache"``
            (a list, or None).

    Returns:
        A placement group made from the data in the input dict.

    Raises:
        AssertionError: If ``pg_dict`` is not a dict, has unexpected keys,
            or holds values of the wrong type.
        ValueError: If ``pg_dict["id"]`` is not a valid hex string.
    """
    # Validate the serialized dict. The checks are raised explicitly rather
    # than written as ``assert`` statements so they still run when Python is
    # started with ``-O``; AssertionError is kept as the exception type so
    # existing callers observe the same failure mode as before.
    if not isinstance(pg_dict, dict):
        raise AssertionError(
            f"pg_dict must be a dict, got {type(pg_dict).__name__}"
        )
    if pg_dict.keys() != {"id", "bundle_cache"}:
        raise AssertionError(
            "pg_dict must have exactly the keys 'id' and 'bundle_cache', "
            f"got {list(pg_dict)}"
        )

    # The value associated to key "id" is a hex string.
    pg_id_hex = pg_dict["id"]
    if not isinstance(pg_id_hex, str):
        raise AssertionError(
            f"pg_dict['id'] must be a hex string, got {type(pg_id_hex).__name__}"
        )

    bundle_cache = pg_dict["bundle_cache"]
    if bundle_cache is not None and not isinstance(bundle_cache, list):
        raise AssertionError(
            "pg_dict['bundle_cache'] must be a list or None, "
            f"got {type(bundle_cache).__name__}"
        )

    # Deserialize and return a Placement Group.
    pg_id = PlacementGroupID(bytes.fromhex(pg_id_hex))
    return PlacementGroup(pg_id, bundle_cache)
def test_remove_placement_group(ray_start_cluster):
    """Check that placement group removal is repeatable and frees resources."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    # Asking to remove a placement group that was never created must be a
    # harmless no-op, however many times it is requested.
    unknown_pg_id = PlacementGroupID.from_random()
    for _attempt in range(3):
        ray.experimental.remove_placement_group(unknown_pg_id)

    # A placement group can be removed right after it has been created.
    pg_id = ray.experimental.placement_group([{"CPU": 2}, {"CPU": 2}])
    ray.experimental.remove_placement_group(pg_id)

    def is_placement_group_removed():
        entry = ray.experimental.placement_group_table(pg_id)
        return "state" in entry and entry["state"] == "REMOVED"

    wait_for_condition(is_placement_group_removed)

    # Create a second placement group, this time to be removed after use.
    pg_id = ray.experimental.placement_group([{"CPU": 2}, {"CPU": 2}])

    # Running an actor inside the group is a hack to wait for the placement
    # group to be created.
    # TODO(sang): Remove it when wait is implemented.
    @ray.remote(num_cpus=0)
    class A:
        def f(self):
            return 3

    actor = A.options(placement_group_id=pg_id).remote()
    assert ray.get(actor.f.remote()) == 3

    ray.experimental.remove_placement_group(pg_id)
    # Repeating the removal request should not do anything.
    for _attempt in range(3):
        ray.experimental.remove_placement_group(pg_id)

    # The task below needs every CPU on the node, so it can only be
    # scheduled once the placement group resources have been released.
    @ray.remote(num_cpus=4)
    def f():
        return 3

    assert ray.get(f.remote()) == 3
def get_placement_group(placement_group_name: str) -> "PlacementGroup":
    """Get a placement group object with a global name.

    The lookup is done with the given name together with the namespace of
    the global worker.

    Args:
        placement_group_name: Name of the placement group to look up.
            Must be non-empty.

    Returns:
        The placement group object registered under the given name.

    Raises:
        ValueError: If ``placement_group_name`` is empty, or if no
            placement group with that name can be found.
    """
    if not placement_group_name:
        raise ValueError("Please supply a non-empty value to get_placement_group")

    worker = ray._private.worker.global_worker
    worker.check_connected()
    placement_group_info = ray._private.state.state.get_placement_group_by_name(
        placement_group_name, worker.namespace
    )
    if placement_group_info is None:
        # This function looks up placement groups, not actors, so the
        # message names the right kind of entity.
        raise ValueError(
            f"Failed to look up placement group with name: {placement_group_name}"
        )

    return PlacementGroup(
        PlacementGroupID(hex_to_binary(placement_group_info["placement_group_id"]))
    )
def test_placement_group_client_option_serialization():
    """Round-trip placement groups through their json-serializable dict form.

    Both directions are covered: placement_group -> dict -> placement_group
    and dict -> placement_group -> dict, each with a populated bundle cache
    and with a null one.
    """

    def dict_to_pg_to_dict(pg_dict_in):
        # dict -> placement group -> dict must give back an equal dict.
        assert pg_dict_in == PlacementGroup.from_dict(pg_dict_in).to_dict()

    def pg_to_dict_to_pg(pg_in):
        # placement group -> dict -> placement group must keep the id and
        # the bundle cache.
        restored = PlacementGroup.from_dict(pg_in.to_dict())
        assert restored.id == pg_in.id
        assert restored.bundle_cache == pg_in.bundle_cache

    raw_id = bytes(16)
    pg_id = PlacementGroupID(id=raw_id)
    id_string = raw_id.hex()
    bundle_cache = [{"CPU": 2}, {"custom_resource": 5}]

    # Start from placement group objects.
    pg_to_dict_to_pg(PlacementGroup(id=pg_id, bundle_cache=bundle_cache))
    pg_to_dict_to_pg(PlacementGroup(id=pg_id))

    # Start from dict representations.
    for cache in (bundle_cache, None):
        dict_to_pg_to_dict({"id": id_string, "bundle_cache": cache})
def test_remove_placement_group(ray_start_cluster, connect_to_client):
    """Check that removing a placement group is repeatable, releases its
    resources, and makes the actor and task that ran inside it fail."""
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    ray.init(address=cluster.address)

    @ray.remote
    def warmup():
        pass

    # Warm up the cluster with a handful of trivial tasks.
    warmup_refs = [warmup.remote() for _ in range(4)]
    ray.get(warmup_refs)

    with connect_to_client_or_not(connect_to_client):
        # Asking to remove a placement group that doesn't exist must not do
        # anything, however many times it is requested.
        unknown_pg = PlacementGroup(PlacementGroupID.from_random())
        for _attempt in range(3):
            ray.util.remove_placement_group(unknown_pg)

        # A placement group can be removed as soon as it is created.
        pg = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert pg.wait(10)
        ray.util.remove_placement_group(pg)
        wait_for_condition(lambda: is_placement_group_removed(pg))

        # Create a second placement group, to be removed while in use.
        pg = ray.util.placement_group([{"CPU": 2}, {"CPU": 2}])
        assert pg.wait(10)

        # An actor that occupies resources.
        @ray.remote(num_cpus=2)
        class A:
            def f(self):
                return 3

        # There is currently no way to stop tasks from being retried once
        # their placement group is removed, so retries are disabled with
        # max_retries=0 for this test.
        # TODO(sang): Handle this edge case.
        @ray.remote(num_cpus=2, max_retries=0)
        def long_running_task():
            print(os.getpid())
            import time

            time.sleep(50)

        # Put a long running task and an actor into the placement group.
        task_ref = long_running_task.options(placement_group=pg).remote()
        actor = A.options(placement_group=pg).remote()
        assert ray.get(actor.f.remote()) == 3

        ray.util.remove_placement_group(pg)
        # Repeating the removal request shouldn't do anything.
        for _attempt in range(3):
            ray.util.remove_placement_group(pg)

        # The task below needs every CPU on the node, so it can only be
        # scheduled once the placement group resources have been released.
        @ray.remote(num_cpus=4)
        def f():
            return 3

        assert ray.get(f.remote()) == 3

        # With the placement group removed, the actor should have been
        # killed, so calling it again has to fail.
        with pytest.raises(ray.exceptions.RayActorError, match="actor died"):
            ray.get(actor.f.remote(), timeout=3.0)
        # The long running task is expected to fail as well.
        with pytest.raises(ray.exceptions.WorkerCrashedError):
            ray.get(task_ref)
def empty() -> "PlacementGroup":
    """Return a PlacementGroup built from the nil PlacementGroupID."""
    nil_id = PlacementGroupID.nil()
    return PlacementGroup(nil_id)
def empty():
    """Create a PlacementGroup whose id is the nil PlacementGroupID."""
    placeholder_id = PlacementGroupID.nil()
    placeholder_group = PlacementGroup(placeholder_id)
    return placeholder_group