def __init__(self, function, num_cpus, num_gpus, memory, object_store_memory,
             resources, num_return_vals, max_calls, max_retries):
    """Wrap ``function`` and record the options used to run it remotely.

    Args:
        function: The Python function being wrapped.
        num_cpus: CPU requirement; ``None`` selects
            ``DEFAULT_REMOTE_FUNCTION_CPUS``.
        num_gpus: GPU requirement, stored as given.
        memory: Memory requirement, stored as given.
        object_store_memory: Must be ``None``; not supported for tasks.
        resources: Custom resource requirements, stored as given.
        num_return_vals: Number of return values; ``None`` selects
            ``DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS``.
        max_calls: ``None`` selects ``DEFAULT_REMOTE_FUNCTION_MAX_CALLS``.
        max_retries: ``None`` selects
            ``DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES``.

    Raises:
        NotImplementedError: If ``object_store_memory`` is not ``None``.
    """
    self._function = function
    self._function_name = function.__module__ + "." + function.__name__

    if num_cpus is None:
        num_cpus = DEFAULT_REMOTE_FUNCTION_CPUS
    self._num_cpus = num_cpus
    self._num_gpus = num_gpus
    self._memory = memory

    # Tasks cannot restrict object store memory; reject any explicit value.
    if object_store_memory is not None:
        raise NotImplementedError(
            "setting object_store_memory is not implemented for tasks")
    self._object_store_memory = None
    self._resources = resources

    if num_return_vals is None:
        num_return_vals = DEFAULT_REMOTE_FUNCTION_NUM_RETURN_VALS
    self._num_return_vals = num_return_vals

    if max_calls is None:
        max_calls = DEFAULT_REMOTE_FUNCTION_MAX_CALLS
    self._max_calls = max_calls

    if max_retries is None:
        max_retries = DEFAULT_REMOTE_FUNCTION_NUM_TASK_RETRIES
    self._max_retries = max_retries

    # Optional hook attached to the function object itself.
    self._decorator = getattr(function, "__ray_invocation_decorator__", None)
    self._function_signature = ray.signature.extract_signature(function)

    # No export has been recorded yet for any session/job.
    self._last_export_session_and_job = None

    # Expose `.remote` with the wrapped function's signature and docstring.
    @wraps(function)
    def _remote_proxy(*args, **kwargs):
        return self._remote(args=args, kwargs=kwargs)

    self.remote = _remote_proxy

    self.direct_call_enabled = ray_constants.direct_call_enabled()
def make_actor(cls, num_cpus, num_gpus, memory, object_store_memory, resources,
               max_reconstructions):
    """Build an ``ActorClass`` from a user class.

    The user class is subclassed to add ``__ray_terminate__`` and
    ``__ray_checkpoint__`` before being handed to
    ``ActorClass._ray_from_modified_class``.

    Args:
        cls: The user-defined class to turn into an actor class.
        num_cpus, num_gpus, memory, object_store_memory, resources:
            Resource options forwarded unchanged to the actor class.
        max_reconstructions: Reconstruction limit. ``None`` becomes 3 when
            direct calls are enabled and 0 otherwise.

    Raises:
        TypeError: If ``cls`` is an old-style class, or is an abstract
            subclass of ``Checkpointable``.
        Exception: If ``max_reconstructions`` is outside the allowed range.
    """
    # Old-style classes (Python 2) are not supported.
    if not issubclass(cls, object):
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
        raise TypeError(
            "A checkpointable actor class should implement all abstract "
            "methods in the `Checkpointable` interface.")

    if max_reconstructions is None:
        # With direct calls, let the actor creation task be resubmitted
        # automatically by default.
        max_reconstructions = 3 if ray_constants.direct_call_enabled() else 0

    lower_bound = ray_constants.NO_RECONSTRUCTION
    upper_bound = ray_constants.INFINITE_RECONSTRUCTION
    within_bounds = lower_bound <= max_reconstructions <= upper_bound
    if not within_bounds:
        raise Exception("max_reconstructions must be in range [%d, %d]." %
                        (lower_bound, upper_bound))

    # Subclass the user class to add the worker-termination and
    # checkpointing entry points.
    class Class(cls):
        def __ray_terminate__(self):
            # Outside local mode, terminating means exiting the actor.
            if ray.worker.get_global_worker().mode != ray.LOCAL_MODE:
                ray.actor.exit_actor()

        def __ray_checkpoint__(self):
            """Save a checkpoint.

            This task saves the current state of the actor, the current task
            frontier according to the raylet, and the checkpoint index
            (number of tasks executed so far).
            """
            worker = ray.worker.global_worker
            if isinstance(self, ray.actor.Checkpointable):
                return worker._save_actor_checkpoint()
            raise Exception(
                "__ray_checkpoint__.remote() may only be called on actors "
                "that implement ray.actor.Checkpointable")

    # Make the subclass report the user's module and class name.
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    actor_class_id = ActorClassID.from_random()
    return ActorClass._ray_from_modified_class(
        Class, actor_class_id, max_reconstructions, num_cpus, num_gpus,
        memory, object_store_memory, resources)
def testLineageEvictedReconstructionFails(self):
    """Getting an early task result must fail after many later tasks run.

    One task result is fetched, then 400 batches of 50 further tasks are
    run (presumably enough to evict the first result and its lineage), and
    a second ``ray.get`` on the first result is expected to raise
    ``UnreconstructableError``. Skipped when direct calls are enabled.
    """
    if ray_constants.direct_call_enabled():
        return  # not relevant

    @ray.remote
    def f(data):
        return 0

    x_id = f.remote(None)
    ray.get(x_id)

    # Keep every returned object ID referenced so they aren't LRU'd.
    oids = []
    for _ in range(400):
        batch = [f.remote(np.zeros(10000)) for _ in range(50)]
        oids += batch
        ray.get(batch)

    with self.assertRaises(ray.exceptions.UnreconstructableError):
        ray.get(x_id)
def _remote(self,
            args=None,
            kwargs=None,
            num_cpus=None,
            num_gpus=None,
            memory=None,
            object_store_memory=None,
            resources=None,
            is_direct_call=None,
            max_concurrency=None,
            name=None,
            detached=False,
            is_asyncio=False):
    """Create an actor, optionally overriding the decorator's options.

    Unlike ``remote``, this entry point accepts resource requirements and
    execution options for this particular actor instance.

    Args:
        args: Positional arguments forwarded to the actor constructor.
        kwargs: Keyword arguments forwarded to the actor constructor.
        num_cpus: The number of CPUs required by the actor creation task.
        num_gpus: The number of GPUs required by the actor creation task.
        memory: Restrict the heap memory usage of this actor.
        object_store_memory: Restrict the object store memory used by
            this actor when creating objects.
        resources: The custom resources required by the actor creation
            task.
        is_direct_call: Use direct actor calls. ``None`` falls back to
            ``ray_constants.direct_call_enabled()``.
        max_concurrency: The max number of concurrent calls to allow for
            this actor. Values above 1 require direct actor calls.
            Defaults to 1 for threaded execution and 100 for asyncio
            execution. Execution order is not guaranteed when
            max_concurrency > 1.
        name: The globally unique name for the actor.
        detached: Whether the actor should be kept alive after the driver
            exits. Detached actors must be named.
        is_asyncio: Turn on async actor calls. Requires direct actor
            calls.

    Returns:
        A handle to the newly created actor.

    Raises:
        ValueError: On an invalid max_concurrency / is_asyncio /
            is_direct_call combination, or if ``name`` is already taken.
        Exception: If called before ``ray.init()``, or if ``detached`` is
            set without a ``name``.
    """
    args = [] if args is None else args
    kwargs = {} if kwargs is None else kwargs
    if is_direct_call is None:
        is_direct_call = ray_constants.direct_call_enabled()
    if max_concurrency is None:
        max_concurrency = 100 if is_asyncio else 1

    # Validate the concurrency / direct-call combination.
    if max_concurrency > 1 and not is_direct_call:
        raise ValueError(
            "setting max_concurrency requires is_direct_call=True")
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be >= 1")
    if is_asyncio and not is_direct_call:
        raise ValueError(
            "Setting is_asyncio requires is_direct_call=True.")

    worker = ray.worker.get_global_worker()
    if worker.mode is None:
        raise Exception("Actors cannot be created before ray.init() "
                        "has been called.")

    meta = self.__ray_metadata__

    if detached and name is None:
        raise Exception("Detached actors must be named. "
                        "Please use Actor._remote(name='some_name') "
                        "to associate the name.")

    # A lookup that raises ValueError means the name is still free.
    if name is not None:
        try:
            ray.experimental.get_actor(name)
        except ValueError:
            name_taken = False
        else:
            name_taken = True
        if name_taken:
            raise ValueError(
                "The name {name} is already taken. Please use "
                "a different name or get existing actor using "
                "ray.experimental.get_actor('{name}')".format(name=name))

    # Pick CPU defaults depending on whether any of CPUs, GPUs or custom
    # resources were given, either in the decorator or in this call.
    unset_in_decorator = (meta.num_cpus is None and meta.num_gpus is None
                          and meta.resources is None)
    unset_in_call = (num_cpus is None and num_gpus is None
                     and resources is None)
    if unset_in_decorator and unset_in_call:
        # In the default case, actors acquire no resources for their
        # lifetime, and actor methods will require 1 CPU.
        cpus_to_use = ray_constants.DEFAULT_ACTOR_CREATION_CPU_SIMPLE
        actor_method_cpu = ray_constants.DEFAULT_ACTOR_METHOD_CPU_SIMPLE
    else:
        # If any resources are specified (here or in decorator), then all
        # resources are acquired for the actor's lifetime and no
        # resources are associated with methods.
        cpus_to_use = meta.num_cpus
        if cpus_to_use is None:
            cpus_to_use = ray_constants.DEFAULT_ACTOR_CREATION_CPU_SPECIFIED
        actor_method_cpu = ray_constants.DEFAULT_ACTOR_METHOD_CPU_SPECIFIED

    function_name = "__init__"
    actor_cls = meta.modified_class
    function_descriptor = FunctionDescriptor(
        actor_cls.__module__, function_name, actor_cls.__name__)

    if worker.mode == ray.LOCAL_MODE:
        # Local mode: nothing is exported. Instantiate the actor in this
        # process and store it in the worker's dictionary.
        actor_id = ActorID.from_random()
        worker.actors[actor_id] = meta.modified_class(
            *copy.deepcopy(args), **copy.deepcopy(kwargs))
    else:
        # Export the actor class if it has not been exported in the
        # current session and job, because the current GCS doesn't have it.
        current_session_and_job = worker.current_session_and_job
        if meta.last_export_session_and_job != current_session_and_job:
            meta.last_export_session_and_job = (
                worker.current_session_and_job)
            worker.function_actor_manager.export_actor_class(
                meta.modified_class, meta.actor_method_names)

        resources = ray.utils.resources_from_resource_arguments(
            cpus_to_use, meta.num_gpus, meta.memory,
            meta.object_store_memory, meta.resources, num_cpus, num_gpus,
            memory, object_store_memory, resources)

        # If the actor methods require CPU resources, then set the
        # required placement resources. If actor_placement_resources is
        # empty, then the required placement resources will be the same
        # as resources.
        assert actor_method_cpu in [0, 1]
        if actor_method_cpu == 1:
            actor_placement_resources = resources.copy()
            actor_placement_resources["CPU"] += 1
        else:
            actor_placement_resources = {}

        creation_args = signature.flatten_args(
            meta.method_signatures[function_name], args, kwargs)
        actor_id = worker.core_worker.create_actor(
            function_descriptor.get_function_descriptor_list(),
            creation_args, meta.max_reconstructions, resources,
            actor_placement_resources, is_direct_call, max_concurrency,
            detached, is_asyncio)

    actor_handle = ActorHandle(
        actor_id,
        meta.modified_class.__module__,
        meta.class_name,
        meta.actor_method_names,
        meta.method_decorators,
        meta.method_signatures,
        meta.actor_method_num_return_vals,
        actor_method_cpu,
        worker.current_session_and_job,
        original_handle=True)

    if name is not None:
        ray.experimental.register_actor(name, actor_handle)

    return actor_handle
# tasks. tasks = [f.remote() for _ in range(10)] start = time.time() ray.get(tasks) end = time.time() # Submit some more tasks that can only be executed on the remote nodes. tasks = [f.remote() for _ in range(10)] # Sleep for a bit to let the tasks finish. time.sleep((end - start) * 2) _, unready = ray.wait(tasks, num_returns=len(tasks), timeout=0) # All remote tasks should have finished. assert len(unready) == 0 @pytest.mark.skipif(ray_constants.direct_call_enabled(), reason="TODO(ekl)") def test_object_transfer_dump(ray_start_cluster): cluster = ray_start_cluster num_nodes = 3 for i in range(num_nodes): cluster.add_node(resources={str(i): 1}, object_store_memory=10**9) ray.init(address=cluster.address) @ray.remote def f(x): return # These objects will live on different nodes. object_ids = [ f._remote(args=[1], resources={str(i): 1}) for i in range(num_nodes)
"class_name": tune.grid_search(["a"]), "config": {{"lr": tune.grid_search([1, 2])}} }}, }}, "local_dir": os.path.expanduser("~/tmp") }} }}) print("success") """.format(address_info["redis_address"]) for i in range(2): out = run_string_as_driver(driver_script) assert "success" in out @pytest.mark.skipif(ray_constants.direct_call_enabled(), reason="fate sharing not implemented yet") def test_driver_exiting_when_worker_blocked(call_ray_start): # This test will create some drivers that submit some tasks and then # exit without waiting for the tasks to complete. address = call_ray_start ray.init(address=address) # Define a driver that creates two tasks, one that runs forever and the # other blocked on the first in a `ray.get`. driver_script = """ import time import ray ray.init(address="{}") @ray.remote
import json
import os
import signal
import sys
import time

import pytest

import ray
import ray.ray_constants as ray_constants
from ray.cluster_utils import Cluster
from ray.test_utils import RayTestTimeoutException

RAY_FORCE_DIRECT = ray_constants.direct_call_enabled()


@pytest.fixture(params=[(1, 4), (4, 4)])
def ray_start_workers_separate_multinode(request):
    """Start a multi-node cluster and connect Ray to it for one test.

    Each parameter is a ``(num_nodes, num_initial_workers)`` pair; every
    node is added with ``num_cpus=num_initial_workers``. The pair is
    yielded to the test.
    """
    num_nodes, num_initial_workers = request.param

    # Bring up the Ray processes, one node at a time.
    cluster = Cluster()
    for _node_index in range(num_nodes):
        cluster.add_node(num_cpus=num_initial_workers)
    ray.init(address=cluster.address)

    yield num_nodes, num_initial_workers

    # Everything after the yield runs as teardown.
    ray.shutdown()
    cluster.shutdown()
actor_info, = actor_table.values() assert actor_info["JobID"] == job_id.hex() assert "IPAddress" in actor_info["Address"] assert "IPAddress" in actor_info["OwnerAddress"] assert actor_info["Address"]["Port"] != actor_info["OwnerAddress"]["Port"] job_table = ray.jobs() assert len(job_table) == 1 assert job_table[0]["JobID"] == job_id.hex() assert job_table[0]["NodeManagerAddress"] == node_ip_address @pytest.mark.skipif( ray_constants.direct_call_enabled(), reason="object and task API not supported") def test_global_state_task_object_api(shutdown_only): ray.init() job_id = ray.utils.compute_job_id_from_driver( ray.WorkerID(ray.worker.global_worker.worker_id)) driver_task_id = ray.worker.global_worker.current_task_id.hex() nil_actor_id_hex = ray.ActorID.nil().hex() @ray.remote def f(*xs): return 1 x_id = ray.put(1)