import ray
from ray.data.context import DatasetContext
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy


def objective(*args):
    # Tell Datasets to use the current placement group for all Datasets tasks.
    ctx = DatasetContext.get_current()
    ctx.scheduling_strategy = PlacementGroupSchedulingStrategy(
        ray.util.get_current_placement_group())
    # This Dataset workload will use that placement group for all read and
    # map tasks.
    ray.data.range(10).show()

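# A minimal driver sketch (not from the original source, assumptions noted
# inline): `objective` only sees a placement group if it runs inside one,
# e.g. as a remote task submitted with a PlacementGroupSchedulingStrategy.
from ray.util.placement_group import placement_group

ray.init(num_cpus=2)

pg = placement_group([{"CPU": 2}])
ray.get(pg.ready())

# Hypothetical wrapper: run `objective` inside the placement group so that
# ray.util.get_current_placement_group() returns it.
remote_objective = ray.remote(objective)
ray.get(
    remote_objective.options(
        num_cpus=1,
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True)).remote())
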
def test_default_scheduling_strategy(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(
        num_cpus=16,
        resources={"head": 1},
        _system_config={"scheduler_spread_threshold": 1})
    cluster.add_node(num_cpus=8, num_gpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)
    pg = ray.util.placement_group(bundles=[{
        "CPU": 1,
        "GPU": 1
    }, {
        "CPU": 1,
        "GPU": 1
    }])
    ray.get(pg.ready())

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY)
        def get_node_id_1():
            return ray.worker.global_worker.current_node_id

        head_node_id = ray.get(
            get_node_id_1.options(resources={
                "head": 1
            }).remote())
        worker_node_id = ray.get(
            get_node_id_1.options(resources={
                "worker": 1
            }).remote())

        assert ray.get(get_node_id_1.remote()) == head_node_id

        @ray.remote(
            num_cpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg))
        def get_node_id_2():
            return ray.worker.global_worker.current_node_id

        assert ray.get(
            get_node_id_2.options(
                scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY).remote()
        ) == head_node_id

        @ray.remote
        def get_node_id_3():
            return ray.worker.global_worker.current_node_id

        @ray.remote(
            num_cpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg,
                placement_group_capture_child_tasks=True))
        class Actor1:
            def get_node_ids(self):
                return [
                    ray.worker.global_worker.current_node_id,
                    # Use parent's placement group.
                    ray.get(get_node_id_3.remote()),
                    ray.get(
                        get_node_id_3.options(
                            scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY).
                        remote())
                ]

        actor1 = Actor1.remote()
        assert ray.get(actor1.get_node_ids.remote()) == \
            [worker_node_id, worker_node_id, head_node_id]

def test_placement_group_scheduling_strategy(ray_start_cluster,
                                             connect_to_client):
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=8, resources={"head": 1})
    cluster.add_node(num_cpus=8, num_gpus=8, resources={"worker": 1})
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)
    pg = ray.util.placement_group(bundles=[{
        "CPU": 1,
        "GPU": 1
    }, {
        "CPU": 1,
        "GPU": 1
    }])
    ray.get(pg.ready())

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(scheduling_strategy=DEFAULT_SCHEDULING_STRATEGY)
        def get_node_id_1():
            return ray.worker.global_worker.current_node_id

        worker_node_id = ray.get(
            get_node_id_1.options(resources={
                "worker": 1
            }).remote())

        assert ray.get(
            get_node_id_1.options(
                num_cpus=1,
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=pg)).remote()) == worker_node_id

        @ray.remote(
            num_cpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg))
        def get_node_id_2():
            return ray.worker.global_worker.current_node_id

        assert ray.get(get_node_id_2.remote()) == worker_node_id

        @ray.remote(
            num_cpus=1,
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg))
        class Actor1:
            def get_node_id(self):
                return ray.worker.global_worker.current_node_id

        actor1 = Actor1.remote()
        assert ray.get(actor1.get_node_id.remote()) == worker_node_id

        @ray.remote
        class Actor2:
            def get_node_id(self):
                return ray.worker.global_worker.current_node_id

        actor2 = Actor2.options(
            scheduling_strategy=PlacementGroupSchedulingStrategy(
                placement_group=pg)).remote()
        assert ray.get(actor2.get_node_id.remote()) == worker_node_id

        with pytest.raises(ValueError):

            @ray.remote(
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=pg))
            def func():
                return 0

            func.options(placement_group=pg).remote()

        with pytest.raises(ValueError):

            @ray.remote
            def func():
                return 0

            func.options(scheduling_strategy="XXX").remote()

        with pytest.raises(ValueError):

            @ray.remote
            def func():
                return 0

            func.options(
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=None)).remote()

def _remote(self, args=None, kwargs=None, **task_options):
    """Submit the remote function for execution."""
    # We pop the "max_calls" coming from "@ray.remote" here. We no longer
    # need it in "_remote()".
    task_options.pop("max_calls", None)
    if client_mode_should_convert(auto_init=True):
        return client_mode_convert_function(self, args, kwargs,
                                            **task_options)

    worker = ray.worker.global_worker
    worker.check_connected()

    # If this function was not exported in this session and job, we need to
    # export this function again, because the current GCS doesn't have it.
    if (not self._is_cross_language
            and self._last_export_session_and_job !=
            worker.current_session_and_job):
        self._function_descriptor = PythonFunctionDescriptor.from_function(
            self._function, self._uuid)
        # There is an interesting question here. If the remote function is
        # used by a subsequent driver (in the same script), should the
        # second driver pickle the function again? If yes, then the remote
        # function definition can differ in the second driver (e.g., if
        # variables in its closure have changed). We probably want the
        # behavior of the remote function in the second driver to be
        # independent of whether or not the function was invoked by the
        # first driver. This is an argument for repickling the function,
        # which we do here.
        try:
            self._pickled_function = pickle.dumps(self._function)
        except TypeError as e:
            msg = (
                "Could not serialize the function "
                f"{self._function_descriptor.repr}. Check "
                "https://docs.ray.io/en/master/ray-core/objects/serialization.html#troubleshooting "  # noqa
                "for more information.")
            raise TypeError(msg) from e

        self._last_export_session_and_job = worker.current_session_and_job
        worker.function_actor_manager.export(self)

    kwargs = {} if kwargs is None else kwargs
    args = [] if args is None else args

    # Fill in the task's required options.
    for k, v in ray_option_utils.task_options.items():
        task_options[k] = task_options.get(k, v.default_value)
    # "max_calls" already takes effect and should not apply again.
    # Remove the default value here.
    task_options.pop("max_calls", None)

    # TODO(suquark): cleanup these fields
    name = task_options["name"]
    runtime_env = parse_runtime_env(task_options["runtime_env"])
    placement_group = task_options["placement_group"]
    placement_group_bundle_index = task_options[
        "placement_group_bundle_index"]
    placement_group_capture_child_tasks = task_options[
        "placement_group_capture_child_tasks"]
    scheduling_strategy = task_options["scheduling_strategy"]
    num_returns = task_options["num_returns"]
    max_retries = task_options["max_retries"]
    retry_exceptions = task_options["retry_exceptions"]

    resources = ray._private.utils.resources_from_ray_options(task_options)

    if scheduling_strategy is None or isinstance(
            scheduling_strategy, PlacementGroupSchedulingStrategy):
        if isinstance(scheduling_strategy,
                      PlacementGroupSchedulingStrategy):
            placement_group = scheduling_strategy.placement_group
            placement_group_bundle_index = (
                scheduling_strategy.placement_group_bundle_index)
            placement_group_capture_child_tasks = (
                scheduling_strategy.placement_group_capture_child_tasks)

        if placement_group_capture_child_tasks is None:
            placement_group_capture_child_tasks = (
                worker.should_capture_child_tasks_in_placement_group)
        placement_group = configure_placement_group_based_on_context(
            placement_group_capture_child_tasks,
            placement_group_bundle_index,
            resources,
            {},  # no placement_resources for tasks
            self._function_descriptor.function_name,
            placement_group=placement_group,
        )
        if not placement_group.is_empty:
            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group,
                placement_group_bundle_index,
                placement_group_capture_child_tasks,
            )
        else:
            scheduling_strategy = "DEFAULT"

    serialized_runtime_env_info = None
    if runtime_env is not None:
        serialized_runtime_env_info = get_runtime_env_info(
            runtime_env,
            is_job_runtime_env=False,
            serialize=True,
        )

    def invocation(args, kwargs):
        if self._is_cross_language:
            list_args = cross_language.format_args(worker, args, kwargs)
        elif not args and not kwargs and not self._function_signature:
            list_args = []
        else:
            list_args = ray._private.signature.flatten_args(
                self._function_signature, args, kwargs)

        if worker.mode == ray.worker.LOCAL_MODE:
            assert (
                not self._is_cross_language
            ), "Cross language remote function cannot be executed locally."
        object_refs = worker.core_worker.submit_task(
            self._language,
            self._function_descriptor,
            list_args,
            name if name is not None else "",
            num_returns,
            resources,
            max_retries,
            retry_exceptions,
            scheduling_strategy,
            worker.debugger_breakpoint,
            serialized_runtime_env_info or "{}",
        )
        # Reset worker's debug context from the last "remote" command
        # (which applies only to this .remote call).
        worker.debugger_breakpoint = b""
        if len(object_refs) == 1:
            return object_refs[0]
        elif len(object_refs) > 1:
            return object_refs

    if self._decorator is not None:
        invocation = self._decorator(invocation)

    return invocation(args, kwargs)

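# A caller-side sketch (not from the original source): `_remote()` above is
# what ultimately runs when a task is submitted via `.remote()` or
# `.options(...).remote()`. Both the default path and the placement-group
# path resolve through the scheduling_strategy logic shown there.
import ray
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

ray.init(num_cpus=2)


@ray.remote
def add_one(x):
    return x + 1


pg = ray.util.placement_group([{"CPU": 1}])
ray.get(pg.ready())

# Default strategy: scheduled on any node with a free CPU.
assert ray.get(add_one.remote(1)) == 2
# Placement-group strategy: scheduled on the CPU reserved by the bundle.
assert ray.get(
    add_one.options(
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg)).remote(2)) == 3
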
def g(*a):
    ctx = DatasetContext.get_current()
    ctx.scheduling_strategy = PlacementGroupSchedulingStrategy(
        ray.util.get_current_placement_group())
    ray.data.range(10).show()

def _remote(
        self,
        args=None,
        kwargs=None,
        num_returns=None,
        num_cpus=None,
        num_gpus=None,
        memory=None,
        object_store_memory=None,
        accelerator_type=None,
        resources=None,
        max_retries=None,
        retry_exceptions=None,
        placement_group="default",
        placement_group_bundle_index=-1,
        placement_group_capture_child_tasks=None,
        runtime_env=None,
        name="",
        scheduling_strategy: SchedulingStrategyT = None,
):
    """Submit the remote function for execution."""
    if client_mode_should_convert(auto_init=True):
        return client_mode_convert_function(
            self,
            args,
            kwargs,
            num_returns=num_returns,
            num_cpus=num_cpus,
            num_gpus=num_gpus,
            memory=memory,
            object_store_memory=object_store_memory,
            accelerator_type=accelerator_type,
            resources=resources,
            max_retries=max_retries,
            retry_exceptions=retry_exceptions,
            placement_group=placement_group,
            placement_group_bundle_index=placement_group_bundle_index,
            placement_group_capture_child_tasks=(
                placement_group_capture_child_tasks),
            runtime_env=runtime_env,
            name=name,
            scheduling_strategy=scheduling_strategy,
        )

    worker = ray.worker.global_worker
    worker.check_connected()

    # If this function was not exported in this session and job, we need to
    # export this function again, because the current GCS doesn't have it.
    if (not self._is_cross_language
            and self._last_export_session_and_job !=
            worker.current_session_and_job):
        self._function_descriptor = PythonFunctionDescriptor.from_function(
            self._function, self._uuid)
        # There is an interesting question here. If the remote function is
        # used by a subsequent driver (in the same script), should the
        # second driver pickle the function again? If yes, then the remote
        # function definition can differ in the second driver (e.g., if
        # variables in its closure have changed). We probably want the
        # behavior of the remote function in the second driver to be
        # independent of whether or not the function was invoked by the
        # first driver. This is an argument for repickling the function,
        # which we do here.
        try:
            self._pickled_function = pickle.dumps(self._function)
        except TypeError as e:
            msg = (
                "Could not serialize the function "
                f"{self._function_descriptor.repr}. Check "
                "https://docs.ray.io/en/master/serialization.html#troubleshooting "  # noqa
                "for more information.")
            raise TypeError(msg) from e
        self._last_export_session_and_job = worker.current_session_and_job
        worker.function_actor_manager.export(self)

    kwargs = {} if kwargs is None else kwargs
    args = [] if args is None else args

    if num_returns is None:
        num_returns = self._num_returns
    if max_retries is None:
        max_retries = self._max_retries
    if retry_exceptions is None:
        retry_exceptions = self._retry_exceptions
    if scheduling_strategy is None:
        scheduling_strategy = self._scheduling_strategy

    resources = ray._private.utils.resources_from_resource_arguments(
        self._num_cpus,
        self._num_gpus,
        self._memory,
        self._object_store_memory,
        self._resources,
        self._accelerator_type,
        num_cpus,
        num_gpus,
        memory,
        object_store_memory,
        resources,
        accelerator_type,
    )

    if (placement_group != "default") and (scheduling_strategy is not None):
        raise ValueError("Placement groups should be specified via the "
                         "scheduling_strategy option. "
                         "The placement_group option is deprecated.")

    if scheduling_strategy is None or isinstance(
            scheduling_strategy, PlacementGroupSchedulingStrategy):
        if isinstance(scheduling_strategy,
                      PlacementGroupSchedulingStrategy):
            placement_group = scheduling_strategy.placement_group
            placement_group_bundle_index = (
                scheduling_strategy.placement_group_bundle_index)
            placement_group_capture_child_tasks = (
                scheduling_strategy.placement_group_capture_child_tasks)

        if placement_group_capture_child_tasks is None:
            placement_group_capture_child_tasks = (
                worker.should_capture_child_tasks_in_placement_group)
        if placement_group == "default":
            placement_group = self._placement_group
        placement_group = configure_placement_group_based_on_context(
            placement_group_capture_child_tasks,
            placement_group_bundle_index,
            resources,
            {},  # no placement_resources for tasks
            self._function_descriptor.function_name,
            placement_group=placement_group,
        )
        if not placement_group.is_empty:
            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group,
                placement_group_bundle_index,
                placement_group_capture_child_tasks,
            )
        else:
            scheduling_strategy = DEFAULT_SCHEDULING_STRATEGY

    if not runtime_env or runtime_env == "{}":
        runtime_env = self._runtime_env

    def invocation(args, kwargs):
        if self._is_cross_language:
            list_args = cross_language.format_args(worker, args, kwargs)
        elif not args and not kwargs and not self._function_signature:
            list_args = []
        else:
            list_args = ray._private.signature.flatten_args(
                self._function_signature, args, kwargs)

        if worker.mode == ray.worker.LOCAL_MODE:
            assert not self._is_cross_language, (
                "Cross language remote function "
                "cannot be executed locally.")
        object_refs = worker.core_worker.submit_task(
            self._language,
            self._function_descriptor,
            list_args,
            name,
            num_returns,
            resources,
            max_retries,
            retry_exceptions,
            scheduling_strategy,
            worker.debugger_breakpoint,
            runtime_env or "{}",
        )
        # Reset worker's debug context from the last "remote" command
        # (which applies only to this .remote call).
        worker.debugger_breakpoint = b""
        if len(object_refs) == 1:
            return object_refs[0]
        elif len(object_refs) > 1:
            return object_refs

    if self._decorator is not None:
        invocation = self._decorator(invocation)

    return invocation(args, kwargs)

import ray
from ray.data.context import DatasetContext
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

ray.init(num_cpus=1)

ctx = DatasetContext.get_current()

# Create a placement group that takes up the single core on the cluster.
placement_group = ray.util.placement_group(
    name="core_hog",
    strategy="SPREAD",
    bundles=[
        {"CPU": 1},
    ],
)
ray.get(placement_group.ready())

# Tell Datasets to use the placement group for all Datasets tasks.
ctx.scheduling_strategy = PlacementGroupSchedulingStrategy(placement_group)
# This Dataset workload will use that placement group for all read and map
# tasks.
ds = ray.data.range(100, parallelism=2) \
    .map(lambda x: x + 1)
assert ds.take_all() == list(range(1, 101))
# __resource_allocation_end__
# fmt: on

# fmt: off
# __block_move_begin__
import ray
from ray.data.context import DatasetContext

ctx = DatasetContext.get_current()
ctx.optimize_fuse_stages = False

import ray
from ray.util.placement_group import (
    placement_group,
)
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

# Two "CPU"s are available.
ray.init(num_cpus=2)

# Create a placement group.
pg = placement_group([{"CPU": 2}])
ray.get(pg.ready())


# Now the 2 CPUs are no longer available to regular tasks because
# they are pre-reserved by the placement group.
@ray.remote(num_cpus=2)
def f():
    return True


# Won't be scheduled because no 2 CPUs are left outside the placement group.
f.remote()

# Will be scheduled because it uses the 2 CPUs reserved by the
# placement group.
f.options(
    scheduling_strategy=PlacementGroupSchedulingStrategy(
        placement_group=pg)).remote()

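# Follow-up sketch (not in the original snippet), continuing from the code
# above: removing the placement group returns its reserved CPUs to the
# cluster, so the pending f.remote() call above can then be scheduled.
ray.util.remove_placement_group(pg)
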
def test_worker_exit_intended_system_exit_and_user_error(ray_start_cluster):
    """
    INTENDED_SYSTEM_EXIT
    - (not tested, hard to test) Unused resource removed
    - (tested) Pg removed
    - (tested) Idle
    USER_ERROR
    - (tested) Actor init failed
    """
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=1)
    ray.init(address=cluster.address)

    @ray.remote
    def f():
        return ray.get(g.remote())

    @ray.remote
    def g():
        return os.getpid()

    # Start a task that makes a blocking ray.get call on g.remote.
    # g.remote will borrow the CPU and start a new worker.
    # The worker started for g.remote will exit by IDLE timeout.
    pid = ray.get(f.remote())

    def verify_exit_by_idle_timeout():
        worker = get_worker_by_pid(pid)
        type = worker["exit_type"]
        detail = worker["exit_detail"]
        return type == "INTENDED_SYSTEM_EXIT" and "it was idle" in detail

    wait_for_condition(verify_exit_by_idle_timeout)

    @ray.remote
    class A:
        def getpid(self):
            return os.getpid()

    pg = ray.util.placement_group(bundles=[{"CPU": 1}])
    a = A.options(
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg)).remote()
    pid = ray.get(a.getpid.remote())
    ray.util.remove_placement_group(pg)

    def verify_exit_by_pg_removed():
        worker = get_worker_by_pid(pid)
        type = worker["exit_type"]
        detail = worker["exit_detail"]
        return (type == "INTENDED_SYSTEM_EXIT"
                and "placement group was removed" in detail)

    wait_for_condition(verify_exit_by_pg_removed)

    @ray.remote
    class PidDB:
        def __init__(self):
            self.pid = None

        def record_pid(self, pid):
            self.pid = pid

        def get_pid(self):
            return self.pid

    p = PidDB.remote()

    @ray.remote
    class FaultyActor:
        def __init__(self):
            p.record_pid.remote(os.getpid())
            raise Exception

        def ready(self):
            pass

    a = FaultyActor.remote()
    wait_for_condition(lambda: ray.get(p.get_pid.remote()) is not None)
    pid = ray.get(p.get_pid.remote())

    def verify_exit_by_actor_init_failure():
        worker = get_worker_by_pid(pid)
        type = worker["exit_type"]
        detail = worker["exit_detail"]
        print(type, detail)
        return (type == "USER_ERROR"
                and "exception in the initialization method" in detail)

    wait_for_condition(verify_exit_by_actor_init_failure)

import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

ray.init(num_cpus=4)

# Create a placement group with the SPREAD strategy.
pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="SPREAD")
ray.get(pg.ready())


@ray.remote(num_cpus=1)
def child():
    pass


@ray.remote(num_cpus=1)
def parent():
    # Because placement_group_capture_child_tasks is set to True, the child
    # task is scheduled in the same placement group as its parent, even
    # though child.options(
    #     scheduling_strategy=PlacementGroupSchedulingStrategy(
    #         placement_group=pg)).remote() was never called.
    ray.get(child.remote())


ray.get(
    parent.options(
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True)).remote())

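# Sketch of the opt-out path (continuing the snippet above; pattern inferred
# from the DEFAULT_SCHEDULING_STRATEGY tests earlier in this section): a
# captured child can escape its parent's placement group by explicitly
# requesting the default scheduling strategy ("DEFAULT").
@ray.remote(num_cpus=1)
def parent_with_opt_out():
    # Scheduled outside the parent's placement group.
    ray.get(child.options(scheduling_strategy="DEFAULT").remote())


ray.get(
    parent_with_opt_out.options(
        scheduling_strategy=PlacementGroupSchedulingStrategy(
            placement_group=pg,
            placement_group_capture_child_tasks=True)).remote())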