def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
               max_reconstructions):
    """Turn a user-defined class into an ``ActorClass``.

    The user class is subclassed so that two extra methods,
    ``__ray_terminate__`` and ``__ray_checkpoint__``, are available on every
    actor. The subclass keeps the module and name of the original class.

    Args:
        cls: The user class being decorated.
        num_cpus: Forwarded unchanged to ``ActorClass``.
        num_gpus: Forwarded unchanged to ``ActorClass``.
        resources: Forwarded unchanged to ``ActorClass``.
        actor_method_cpus: Forwarded unchanged to ``ActorClass``.
        max_reconstructions: Reconstruction limit; ``None`` is treated as 0.

    Returns:
        An ``ActorClass`` built from the modified class and a fresh class ID.

    Raises:
        TypeError: If ``cls`` is an old-style class, or if it subclasses
            ``Checkpointable`` but is still abstract.
        Exception: If ``max_reconstructions`` lies outside
            ``[NO_RECONSTRUCTION, INFINITE_RECONSTRUCTION]``.
    """
    # Old-style (Python 2) classes do not derive from ``object``.
    is_new_style = issubclass(cls, object)
    if not is_new_style:
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
        raise TypeError(
            "A checkpointable actor class should implement all abstract "
            "methods in the `Checkpointable` interface.")

    if max_reconstructions is None:
        max_reconstructions = 0

    within_bounds = (ray_constants.NO_RECONSTRUCTION <= max_reconstructions
                     <= ray_constants.INFINITE_RECONSTRUCTION)
    if not within_bounds:
        bounds = (ray_constants.NO_RECONSTRUCTION,
                  ray_constants.INFINITE_RECONSTRUCTION)
        raise Exception(
            "max_reconstructions must be in range [%d, %d]." % bounds)

    # Subclass the user class to add the worker-termination and checkpoint
    # entry points.
    class Class(cls):
        def __ray_terminate__(self):
            """Disconnect from the raylet and exit the worker process.

            Nothing happens when the worker runs in local mode.
            """
            current_worker = ray.worker.get_global_worker()
            if current_worker.mode == ray.LOCAL_MODE:
                return
            # Disconnecting first means the local scheduler will not push
            # an error message to the driver when this process exits below.
            current_worker.raylet_client.disconnect()
            sys.exit(0)
            assert False, "This process should have terminated."

        def __ray_checkpoint__(self):
            """Save a checkpoint of this actor.

            Delegates to the global worker's ``_save_actor_checkpoint`` and
            returns its result. Only actors implementing
            ``ray.actor.Checkpointable`` may call this.
            """
            current_worker = ray.worker.global_worker
            if isinstance(self, ray.actor.Checkpointable):
                return current_worker._save_actor_checkpoint()
            raise Exception(
                "__ray_checkpoint__.remote() may only be called on actors "
                "that implement ray.actor.Checkpointable")

    # Make the subclass look like the class the user wrote.
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    new_class_id = ActorClassID(_random_string())
    return ActorClass(Class, new_class_id, max_reconstructions, num_cpus,
                      num_gpus, resources, actor_method_cpus)
def _serialization_helper(self, ray_forking):
    """Collect the state needed to rebuild this actor handle.

    Besides building the state dictionary, this records a new handle ID in
    ``self._ray_new_actor_handles`` and, when forking, increments
    ``self._ray_actor_forks``.

    Args:
        ray_forking: True when Ray itself is forking the handle, False when
            the handle is being pickled.

    Returns:
        A dictionary of the information needed to reconstruct the object.
    """
    # A fork gets an ID computed from this handle's ID and its fork count;
    # a plain pickle reuses this handle's own ID.
    actor_handle_id = (compute_actor_handle_id(self._ray_actor_handle_id,
                                               self._ray_actor_forks)
                       if ray_forking else self._ray_actor_handle_id)

    # The cursor and the creation dummy object ID may both be None (actors
    # in local mode don't have dummy objects).
    serialized = {
        "actor_id": self._ray_actor_id,
        "actor_handle_id": actor_handle_id,
        "module_name": self._ray_module_name,
        "class_name": self._ray_class_name,
        "actor_cursor": self._ray_actor_cursor,
        "actor_method_names": self._ray_actor_method_names,
        "method_decorators": self._ray_method_decorators,
        "method_signatures": self._ray_method_signatures,
        "method_num_return_vals": self._ray_method_num_return_vals,
        "actor_creation_dummy_object_id": (
            self._ray_actor_creation_dummy_object_id),
        "actor_method_cpus": self._ray_actor_method_cpus,
        "actor_driver_id": self._ray_actor_driver_id,
        "ray_forking": ray_forking
    }

    if ray_forking:
        self._ray_actor_forks += 1
        handle_to_announce = actor_handle_id
    else:
        # A pickled handle may be unpickled and used to submit a dependent
        # task at any time, so its execution dependency can never be
        # released. Announce a random handle ID that will never be used.
        handle_to_announce = ActorHandleID(_random_string())

    # The backend is told to expect this handle and keeps the cursor alive
    # until the first task of each new handle is submitted.
    # NOTE(swang): There is currently no garbage collection for actor
    # handles until the actor itself is removed.
    self._ray_new_actor_handles.append(handle_to_announce)

    return serialized
def _serialization_helper(self, ray_forking):
    """Produce the dictionary from which this actor handle can be rebuilt.

    Side effects: a handle ID is appended to
    ``self._ray_new_actor_handles`` and, when forking,
    ``self._ray_actor_forks`` is incremented.

    Args:
        ray_forking: True if Ray is forking the actor handle, False if the
            handle is being pickled.

    Returns:
        A dictionary of the information needed to reconstruct the object.
    """
    if not ray_forking:
        # Pickling keeps the ID of this handle.
        actor_handle_id = self._ray_actor_handle_id
    else:
        # Forking derives a new ID from this handle's ID and fork count.
        actor_handle_id = compute_actor_handle_id(
            self._ray_actor_handle_id, self._ray_actor_forks)

    # Note: the actor cursor and the creation dummy object ID could be None
    # (actors in local mode don't have dummy objects).
    snapshot = {
        "actor_id": self._ray_actor_id,
        "actor_handle_id": actor_handle_id,
        "module_name": self._ray_module_name,
        "class_name": self._ray_class_name,
        "actor_cursor": self._ray_actor_cursor,
        "actor_method_names": self._ray_actor_method_names,
        "method_signatures": self._ray_method_signatures,
        "method_num_return_vals": self._ray_method_num_return_vals,
        "actor_creation_dummy_object_id": (
            self._ray_actor_creation_dummy_object_id),
        "actor_method_cpus": self._ray_actor_method_cpus,
        "actor_driver_id": self._ray_actor_driver_id,
        "ray_forking": ray_forking
    }

    if not ray_forking:
        # The execution dependency of a pickled handle is never safe to
        # release because the handle could be unpickled and submit another
        # dependent task at any time. Register a random handle ID that will
        # never actually be used.
        registered_handle_id = ActorHandleID(_random_string())
    else:
        self._ray_actor_forks += 1
        registered_handle_id = actor_handle_id

    # Notify the backend to expect the new handle; the cursor is not
    # released for new handles until each has submitted its first task.
    # NOTE(swang): There is currently no garbage collection for actor
    # handles until the actor itself is removed.
    self._ray_new_actor_handles.append(registered_handle_id)

    return snapshot
def test_raylet_crash_when_get(ray_start_regular):
    """``ray.get`` on a missing object must raise once the raylet is killed."""

    def kill_raylet_after_delay():
        # Wait so the raylet is not killed before the default workers get
        # connected.
        time.sleep(2)
        ray.worker._global_node.kill_raylet()

    missing_object_id = ray.ObjectID(_random_string())

    killer = threading.Thread(target=kill_raylet_after_delay)
    killer.start()

    expected_message = r".*Connection closed unexpectedly.*"
    with pytest.raises(Exception, match=expected_message):
        # Blocks until the background thread kills the raylet.
        ray.get(missing_object_id)

    killer.join()
def test_raylet_crash_when_get(ray_start_regular):
    """``ray.get`` on a missing object must raise once the raylet is killed."""

    def kill_raylet_after_delay():
        # Wait so the raylet is not killed before the default workers get
        # connected.
        time.sleep(2)
        raylet_processes = ray.services.all_processes[
            ray.services.PROCESS_TYPE_RAYLET]
        raylet_processes[0].kill()

    missing_object_id = ray.ObjectID(_random_string())

    killer = threading.Thread(target=kill_raylet_after_delay)
    killer.start()

    expected_message = r".*raylet client may be closed.*"
    with pytest.raises(Exception, match=expected_message):
        # Blocks until the background thread kills the raylet.
        ray.get(missing_object_id)

    killer.join()
def register_custom_serializer(self,
                               cls,
                               serializer,
                               deserializer,
                               local=False,
                               job_id=None,
                               class_id=None):
    """Enable serialization and deserialization for a particular class.

    This method runs the ``register_class_for_serialization`` function
    defined below, either on every worker or only on the current one, so
    that objects of ``cls`` are handled by the given serializer pair.

    Args:
        cls (type): The class that ray should use this custom serializer
            for.
        serializer: The custom serializer to use.
        deserializer: The custom deserializer to use.
        local: True if the serializers should only be registered on the
            current worker. This should usually be False.
        job_id: ID of the job that we want to register the class for.
            Defaults to the current job of this worker.
        class_id (str): Unique ID of the class. Autogenerated if None.

    Raises:
        AssertionError: If ``serializer`` or ``deserializer`` is None, or
            if ``job_id`` is not a ``JobID``.
        ValueError: Raised if ray could not autogenerate a class_id.
    """
    assert serializer is not None and deserializer is not None, (
        "Must provide serializer and deserializer.")

    if class_id is None:
        if not local:
            # In this case, the class ID will be used to deduplicate the
            # class across workers. Note that cloudpickle unfortunately
            # does not produce deterministic strings, so these IDs could
            # be different on different workers. We could use something
            # weaker like cls.__name__, however that would run the risk
            # of having collisions.
            # TODO(rkn): We should improve this.
            try:
                # Attempt to produce a class ID that will be the same on
                # each worker. However, determinism is not guaranteed,
                # and the result may be different on different workers.
                class_id = _try_to_compute_deterministic_class_id(cls)
            except Exception as err:
                # Keep the underlying failure attached as the cause so it
                # is not lost when the ValueError is reported.
                raise ValueError(
                    "Failed to use pickle in generating a unique id "
                    f"for '{cls}'. Provide a unique class_id.") from err
        else:
            # In this case, the class ID only needs to be meaningful on
            # this worker and not across workers.
            class_id = _random_string()

        # Make sure class_id is a string.
        # NOTE(review): class_id is not referenced again in this function;
        # it is still computed so the ValueError above keeps being raised.
        class_id = ray.utils.binary_to_hex(class_id)

    if job_id is None:
        job_id = self.worker.current_job_id
    assert isinstance(job_id, JobID)

    def register_class_for_serialization(worker_info):
        context = worker_info["worker"].get_serialization_context(job_id)
        context._register_cloudpickle_serializer(cls, serializer,
                                                 deserializer)

    if not local:
        self.worker.run_function_on_all_workers(
            register_class_for_serialization)
    else:
        # Since we are pickling objects of this class, we don't actually
        # need to ship the class definition.
        register_class_for_serialization({"worker": self.worker})
def test_driver_exiting_when_worker_blocked(call_ray_start):
    """Drivers that exit while their tasks are blocked must not break Ray.

    Four driver scripts are each run three times: a task blocked in
    ``ray.get``, a task blocked in ``ray.wait``, a task waiting on a
    nonexistent argument, and a task calling ``ray.wait`` on a nonexistent
    object. Afterwards a trivial task is run to confirm the raylet still
    responds.
    """
    # This test will create some drivers that submit some tasks and then
    # exit without waiting for the tasks to complete.
    redis_address = call_ray_start

    ray.init(redis_address=redis_address)

    # Define a driver that creates two tasks, one that runs forever and the
    # other blocked on the first in a `ray.get`.
    driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def f():
    time.sleep(10**6)
@ray.remote
def g():
    ray.get(f.remote())
g.remote()
time.sleep(1)
print("success")
""".format(redis_address)

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        out = run_string_as_driver(driver_script)
        # Make sure the driver ran to completion.
        assert "success" in out

    # Define a driver that creates two tasks, one that runs forever and the
    # other blocked on the first in a `ray.wait`.
    driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def f():
    time.sleep(10**6)
@ray.remote
def g():
    ray.wait([f.remote()])
g.remote()
time.sleep(1)
print("success")
""".format(redis_address)

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        out = run_string_as_driver(driver_script)
        # Make sure the driver ran to completion.
        assert "success" in out

    nonexistent_id_bytes = _random_string()
    nonexistent_id_hex = ray.utils.binary_to_hex(nonexistent_id_bytes)
    # Define a driver that creates one task that depends on a nonexistent
    # object. This task will be queued as waiting to execute.
    driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def g(x):
    return
g.remote(ray.ObjectID(ray.utils.hex_to_binary("{}")))
time.sleep(1)
print("success")
""".format(redis_address, nonexistent_id_hex)

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        out = run_string_as_driver(driver_script)
        # Simulate the nonexistent dependency becoming available.
        ray.worker.global_worker.put_object(
            ray.ObjectID(nonexistent_id_bytes), None)
        # Make sure the driver ran to completion.
        assert "success" in out

    nonexistent_id_bytes = _random_string()
    nonexistent_id_hex = ray.utils.binary_to_hex(nonexistent_id_bytes)
    # Define a driver that calls `ray.wait` on a nonexistent object.
    # `ray.wait` takes a list of object IDs, so the ID is wrapped in a list;
    # a bare ObjectID would make the task fail instead of block.
    driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def g():
    ray.wait([ray.ObjectID(ray.utils.hex_to_binary("{}"))])
g.remote()
time.sleep(1)
print("success")
""".format(redis_address, nonexistent_id_hex)

    # Create some drivers and let them exit and make sure everything is
    # still alive.
    for _ in range(3):
        out = run_string_as_driver(driver_script)
        # Simulate the nonexistent dependency becoming available.
        ray.worker.global_worker.put_object(
            ray.ObjectID(nonexistent_id_bytes), None)
        # Make sure the driver ran to completion.
        assert "success" in out

    @ray.remote
    def f():
        return 1

    # Make sure we can still talk with the raylet.
    ray.get(f.remote())
def register_custom_serializer(self,
                               cls,
                               use_pickle=False,
                               use_dict=False,
                               serializer=None,
                               deserializer=None,
                               local=False,
                               job_id=None,
                               class_id=None):
    """Enable serialization and deserialization for a particular class.

    This method runs the ``register_class_for_serialization`` function
    defined below, either on every worker or only on the current one, which
    will enable ray to properly serialize and deserialize objects of this
    class. It returns early without registering anything when the worker
    uses pickle and no custom serializer was given.

    Args:
        cls (type): The class that ray should use this custom serializer
            for.
        use_pickle (bool): If true, then objects of this class will be
            serialized using pickle.
        use_dict: If true, then objects of this class be serialized turning
            their __dict__ fields into a dictionary. Must be False if
            use_pickle is true.
        serializer: The custom serializer to use. This should be provided
            if and only if use_pickle and use_dict are False.
        deserializer: The custom deserializer to use. This should be
            provided if and only if use_pickle and use_dict are False.
        local: True if the serializers should only be registered on the
            current worker. This should usually be False.
        job_id: ID of the job that we want to register the class for.
            Defaults to the current job of this worker.
        class_id (str): Unique ID of the class. Autogenerated if None.

    Raises:
        RayNotDictionarySerializable: Raised if use_dict is true and cls
            cannot be efficiently serialized by Ray.
        ValueError: Raised if ray could not autogenerate a class_id.
    """
    assert (serializer is None) == (deserializer is None), (
        "The serializer/deserializer arguments must both be provided or "
        "both not be provided.")
    use_custom_serializer = (serializer is not None)

    assert use_custom_serializer + use_pickle + use_dict == 1, (
        "Exactly one of use_pickle, use_dict, or serializer/deserializer "
        "must be specified.")

    if self.worker.use_pickle and serializer is None:
        # In this case it should do nothing.
        return

    if use_dict:
        # Raise an exception if cls cannot be serialized
        # efficiently by Ray.
        check_serializable(cls)

    if class_id is None:
        if not local:
            # In this case, the class ID will be used to deduplicate the
            # class across workers. Note that cloudpickle unfortunately
            # does not produce deterministic strings, so these IDs could
            # be different on different workers. We could use something
            # weaker like cls.__name__, however that would run the risk
            # of having collisions.
            # TODO(rkn): We should improve this.
            try:
                # Attempt to produce a class ID that will be the same on
                # each worker. However, determinism is not guaranteed,
                # and the result may be different on different workers.
                class_id = _try_to_compute_deterministic_class_id(cls)
            except Exception:
                # The trailing space after "id" keeps the two literals from
                # running together as "idfor".
                raise ValueError(
                    "Failed to use pickle in generating a unique id "
                    "for '{}'. Provide a unique class_id.".format(cls))
        else:
            # In this case, the class ID only needs to be meaningful on
            # this worker and not across workers.
            class_id = _random_string()

        # Make sure class_id is a string.
        class_id = ray.utils.binary_to_hex(class_id)

    if job_id is None:
        job_id = self.worker.current_job_id
    assert isinstance(job_id, JobID)

    def register_class_for_serialization(worker_info):
        context = worker_info["worker"].get_serialization_context(job_id)
        if worker_info["worker"].use_pickle:
            context._register_cloudpickle_serializer(
                cls, serializer, deserializer)
        else:
            # TODO(rkn): We need to be more thoughtful about what to do if
            # custom serializers have already been registered for
            # class_id. In some cases, we may want to use the last
            # user-defined serializers and ignore subsequent calls to
            # register_custom_serializer that were made by the system.
            context.pyarrow_context.register_type(
                cls,
                class_id,
                pickle=use_pickle,
                custom_serializer=serializer,
                custom_deserializer=deserializer)

    if not local:
        self.worker.run_function_on_all_workers(
            register_class_for_serialization)
    else:
        # Since we are pickling objects of this class, we don't actually
        # need to ship the class definition.
        register_class_for_serialization({"worker": self.worker})
def _submit(self, args, kwargs, num_cpus=None, num_gpus=None,
            resources=None):
    """Create an actor and return a handle to it.

    Args:
        args: The arguments to forward to the actor constructor.
        kwargs: The keyword arguments to forward to the actor constructor.
        num_cpus: The number of CPUs required by the actor creation task.
        num_gpus: The number of GPUs required by the actor creation task.
        resources: The custom resources required by the actor creation
            task.

    NOTE(review): ``num_cpus``, ``num_gpus`` and ``resources`` are accepted
    but not referenced anywhere in this body; the export call uses
    ``self._actor_creation_resources`` instead.

    Returns:
        A handle to the newly created actor.

    Raises:
        Exception: If ``ray.init()`` has not been called, or if constructor
            arguments are given but the class has no ``__init__`` method.
    """
    worker = ray.worker.global_worker
    if worker.mode is None:
        raise Exception("Actors cannot be created before ray.init() "
                        "has been called.")

    actor_id = ray.local_scheduler.ObjectID(_random_string())
    # The cursor is a dummy object standing for the most recent method
    # invocation on the actor. Every new invocation depends on the current
    # cursor and then replaces it.
    actor_cursor = None

    def is_actor_method(member):
        return (inspect.isfunction(member) or inspect.ismethod(member)
                or is_cython(member))

    actor_methods = inspect.getmembers(
        self._modified_class, predicate=is_actor_method)

    # Record each method's signature so that calls with inappropriate
    # arguments can be caught later.
    method_signatures = {}
    for method_name, method in actor_methods:
        # Unsupported signatures only produce a warning: the method may be
        # inherited from a class the user cannot change.
        signature.check_signature_supported(method, warn=True)
        method_signatures[method_name] = signature.extract_signature(
            method, ignore_first=True)

    actor_method_names = [name for name, _ in actor_methods]
    actor_method_num_return_vals = [
        method.__ray_num_return_vals__
        if hasattr(method, "__ray_num_return_vals__") else 1
        for _, method in actor_methods
    ]

    if worker.mode == ray.PYTHON_MODE:
        # In PYTHON_MODE nothing is exported: the actor object is created
        # locally and stored in the worker's actor dictionary.
        worker.actors[actor_id] = self._modified_class.__new__(
            self._modified_class)
    else:
        # Export the class once, then export this actor.
        if not self._exported:
            export_actor_class(self._class_id, self._modified_class,
                               actor_method_names,
                               actor_method_num_return_vals,
                               self._checkpoint_interval, worker)
            self._exported = True
        actor_cursor = export_actor(
            actor_id, self._class_id, self._class_name, actor_method_names,
            actor_method_num_return_vals, self._actor_creation_resources,
            self._actor_method_cpus, worker)

    # The counter starts at 1 because the actor creation task counts as the
    # first task.
    actor_counter = 1
    actor_handle = ActorHandle(
        actor_id, self._class_name, actor_cursor, actor_counter,
        actor_method_names, actor_method_num_return_vals, method_signatures,
        actor_cursor, self._actor_method_cpus, worker.task_driver_id)

    # Invoke the constructor as a remote method when there is one.
    if "__init__" in actor_handle._ray_actor_method_names:
        actor_handle.__init__.remote(*args, **kwargs)
    elif len(args) != 0 or len(kwargs) != 0:
        raise Exception("Arguments cannot be passed to the actor "
                        "constructor because this actor class has no "
                        "__init__ method.")

    return actor_handle
def remote(cls, *args, **kwargs):
    """Create an actor and return a handle object of type ``cls``.

    Relies on names from the enclosing scope (``Class``, ``class_id``,
    ``class_name``, ``exported``, ``checkpoint_interval``,
    ``actor_creation_resources``, ``actor_method_cpus``).

    Args:
        *args: Positional arguments for the actor's ``__init__``.
        **kwargs: Keyword arguments for the actor's ``__init__``.

    Returns:
        The newly built actor handle.

    Raises:
        Exception: If ``ray.init()`` has not been called yet.
    """
    worker = ray.worker.global_worker
    if worker.mode is None:
        raise Exception("Actors cannot be created before ray.init() "
                        "has been called.")

    actor_id = ray.local_scheduler.ObjectID(_random_string())
    # ID of this particular handle instance. Handles that share an actor ID
    # should have distinct handle IDs.
    actor_handle_id = ray.local_scheduler.ObjectID(
        ray.worker.NIL_ACTOR_ID)
    # The cursor is a dummy object standing for the most recent method
    # invocation; each new invocation depends on it and then replaces it.
    actor_cursor = None
    # Number of actor method invocations made so far.
    actor_counter = 0

    def is_actor_method(member):
        return (inspect.isfunction(member) or inspect.ismethod(member)
                or is_cython(member))

    actor_methods = inspect.getmembers(Class, predicate=is_actor_method)

    # Record each method's signature so that calls with inappropriate
    # arguments can be caught later.
    method_signatures = {}
    for method_name, method in actor_methods:
        # Unsupported signatures only produce a warning: the method may be
        # inherited from a class the user cannot change.
        signature.check_signature_supported(method, warn=True)
        method_signatures[method_name] = signature.extract_signature(
            method, ignore_first=True)

    actor_method_names = [name for name, _ in actor_methods]
    actor_method_num_return_vals = [
        method.__ray_num_return_vals__
        if hasattr(method, "__ray_num_return_vals__") else 1
        for _, method in actor_methods
    ]

    if worker.mode == ray.PYTHON_MODE:
        # In PYTHON_MODE nothing is exported: the actor object is created
        # locally and stored in the worker's actor dictionary.
        worker.actors[actor_id] = Class.__new__(Class)
    else:
        # ``exported`` is a list used as a flag: empty means the class has
        # not been exported yet.
        if not exported:
            export_actor_class(class_id, Class, actor_method_names,
                               actor_method_num_return_vals,
                               checkpoint_interval, worker)
            exported.append(0)
        actor_cursor = export_actor(
            actor_id, class_id, class_name, actor_method_names,
            actor_method_num_return_vals, actor_creation_resources,
            actor_method_cpus, worker)

    # Build the handle without running ``cls.__init__``.
    actor_object = cls.__new__(cls)
    actor_object._manual_init(
        actor_id, class_id, actor_handle_id, actor_cursor, actor_counter,
        actor_method_names, actor_method_num_return_vals, method_signatures,
        checkpoint_interval, actor_cursor, actor_creation_resources,
        actor_method_cpus)

    # Run the actor's constructor as a remote method when it has one.
    if "__init__" not in actor_object._ray_actor_method_names:
        print("WARNING: this object has no __init__ method.")
    else:
        actor_object._actor_method_call(
            "__init__", args=args, kwargs=kwargs, dependency=actor_cursor)

    return actor_object
def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
               checkpoint_interval, max_reconstructions):
    """Turn a user-defined class into an ``ActorClass``.

    The user class is subclassed to add termination and checkpointing
    methods. The subclass keeps the module and name of the original class.

    Args:
        cls: The user class being decorated.
        num_cpus: Forwarded unchanged to ``ActorClass``.
        num_gpus: Forwarded unchanged to ``ActorClass``.
        resources: Forwarded unchanged to ``ActorClass``.
        actor_method_cpus: Forwarded unchanged to ``ActorClass``.
        checkpoint_interval: ``None`` is treated as -1; 0 is rejected.
        max_reconstructions: Reconstruction limit; ``None`` is treated as 0.

    Returns:
        An ``ActorClass`` built from the modified class and a fresh class ID.

    Raises:
        TypeError: If ``cls`` is an old-style class.
        Exception: If ``checkpoint_interval`` is 0 or ``max_reconstructions``
            lies outside ``[NO_RECONSTRUCTION, INFINITE_RECONSTRUCTION]``.
    """
    # Old-style (Python 2) classes do not derive from ``object``.
    is_new_style = issubclass(cls, object)
    if not is_new_style:
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    # Fill in defaults, then validate in the same order as before.
    if max_reconstructions is None:
        max_reconstructions = 0
    if checkpoint_interval is None:
        checkpoint_interval = -1

    if checkpoint_interval == 0:
        raise Exception("checkpoint_interval must be greater than 0.")

    within_bounds = (ray_constants.NO_RECONSTRUCTION <= max_reconstructions
                     <= ray_constants.INFINITE_RECONSTRUCTION)
    if not within_bounds:
        bounds = (ray_constants.NO_RECONSTRUCTION,
                  ray_constants.INFINITE_RECONSTRUCTION)
        raise Exception(
            "max_reconstructions must be in range [%d, %d]." % bounds)

    # Subclass the user class to add termination and checkpoint methods.
    class Class(cls):
        def __ray_terminate__(self):
            """Disconnect from the raylet and exit the worker process.

            Nothing happens when the worker runs in local mode.
            """
            current_worker = ray.worker.get_global_worker()
            if current_worker.mode == ray.LOCAL_MODE:
                return
            # Disconnecting first means the local scheduler will not push
            # an error message to the driver when this process exits below.
            current_worker.raylet_client.disconnect()
            sys.exit(0)
            assert False, "This process should have terminated."

        def __ray_save_checkpoint__(self):
            """Pickle the actor, or the result of ``__ray_save__`` if defined."""
            if not hasattr(self, "__ray_save__"):
                return pickle.dumps(self)
            return pickle.dumps(self.__ray_save__())

        @classmethod
        def __ray_restore_from_checkpoint__(cls, pickled_checkpoint):
            """Rebuild an actor object from a pickled checkpoint."""
            # NOTE(review): this unpickles the checkpoint bytes as-is;
            # pickle is unsafe on data from an untrusted source.
            checkpoint = pickle.loads(pickled_checkpoint)
            if not hasattr(cls, "__ray_restore__"):
                # TODO(rkn): It's possible that this will cause problems.
                # When you unpickle the same object twice, the two objects
                # will not have the same class.
                return checkpoint
            restored = cls.__new__(cls)
            restored.__ray_restore__(checkpoint)
            return restored

        def __ray_checkpoint__(self):
            """Save a checkpoint.

            Stores the actor state, the task frontier reported by the
            raylet client, and the checkpoint index (the worker's actor
            task counter).
            """
            current_worker = ray.worker.global_worker
            checkpoint_index = current_worker.actor_task_counter
            # Serialize the actor state.
            serialized_state = self.__ray_save_checkpoint__()
            # Fetch the current task frontier, per actor handle.
            # NOTE(swang): This only includes actor handles that the local
            # scheduler has seen. Handle IDs for which no task has yet
            # reached the local scheduler will not be included, and may
            # not be runnable on checkpoint resumption.
            frontier = current_worker.raylet_client.get_actor_frontier(
                current_worker.actor_id)
            # Save the checkpoint in Redis. TODO(rkn): Checkpoints
            # should not be stored in Redis. Fix this.
            set_actor_checkpoint(current_worker, current_worker.actor_id,
                                 checkpoint_index, serialized_state,
                                 frontier)

        def __ray_checkpoint_restore__(self):
            """Restore the most recent checkpoint, if one exists.

            Returns:
                A bool indicating whether a checkpoint was resumed.
            """
            current_worker = ray.worker.global_worker
            # Look up the most recent stored checkpoint, if any.
            checkpoint_index, checkpoint, frontier = get_actor_checkpoint(
                current_worker, current_worker.actor_id)
            if checkpoint_index is None:
                return False

            # Load the actor state from the checkpoint.
            restored_actor = (
                current_worker.actor_class.__ray_restore_from_checkpoint__(
                    checkpoint))
            current_worker.actors[current_worker.actor_id] = restored_actor
            # Restore the number of tasks executed so far.
            current_worker.actor_task_counter = checkpoint_index
            # Restore the actor frontier in the local scheduler.
            current_worker.raylet_client.set_actor_frontier(frontier)
            return True

    # Make the subclass look like the class the user wrote.
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    new_class_id = ActorClassID(_random_string())
    return ActorClass(Class, new_class_id, checkpoint_interval,
                      max_reconstructions, num_cpus, num_gpus, resources,
                      actor_method_cpus)
def _remote(self,
            args=None,
            kwargs=None,
            num_cpus=None,
            num_gpus=None,
            resources=None):
    """Create an actor and return a handle to it.

    Unlike the plain remote method, this accepts resource requirements that
    are combined with the defaults from the decorator.

    Args:
        args: The arguments to forward to the actor constructor. ``None``
            is treated as an empty list.
        kwargs: The keyword arguments to forward to the actor constructor.
            ``None`` is treated as an empty dict.
        num_cpus: The number of CPUs required by the actor creation task.
        num_gpus: The number of GPUs required by the actor creation task.
        resources: The custom resources required by the actor creation
            task.

    Returns:
        A handle to the newly created actor.

    Raises:
        Exception: If ``ray.init()`` has not been called yet.
    """
    args = [] if args is None else args
    kwargs = {} if kwargs is None else kwargs

    worker = ray.worker.get_global_worker()
    if worker.mode is None:
        raise Exception("Actors cannot be created before ray.init() "
                        "has been called.")

    actor_id = ActorID(_random_string())
    # The cursor is a dummy object standing for the most recent method
    # invocation; each new invocation depends on it and then replaces it.
    actor_cursor = None

    if worker.mode == ray.LOCAL_MODE:
        # In LOCAL_MODE nothing is exported: the actor is constructed
        # locally from deep copies of the arguments and stored in the
        # worker's actor dictionary.
        worker.actors[actor_id] = self._modified_class(
            *copy.deepcopy(args), **copy.deepcopy(kwargs))
    else:
        # Export the actor class the first time it is used.
        if not self._exported:
            worker.function_actor_manager.export_actor_class(
                self._modified_class, self._actor_method_names)
            self._exported = True

        creation_resources = ray.utils.resources_from_resource_arguments(
            self._num_cpus, self._num_gpus, self._resources, num_cpus,
            num_gpus, resources)

        # When actor methods need a CPU, placement requires one more CPU
        # than creation. An empty dict means placement resources are the
        # same as the creation resources.
        assert self._actor_method_cpus in [0, 1]
        placement_resources = {}
        if self._actor_method_cpus == 1:
            placement_resources = creation_resources.copy()
            placement_resources["CPU"] += 1

        # The creation task is the class's ``__init__`` method.
        init_name = "__init__"
        creation_args = signature.extend_args(
            self._method_signatures[init_name], args, kwargs)
        init_descriptor = FunctionDescriptor(
            self._modified_class.__module__, init_name,
            self._modified_class.__name__)
        [actor_cursor] = worker.submit_task(
            init_descriptor,
            creation_args,
            actor_creation_id=actor_id,
            max_actor_reconstructions=self._max_reconstructions,
            num_return_vals=1,
            resources=creation_resources,
            placement_resources=placement_resources)
        assert isinstance(actor_cursor, ObjectID)

    actor_handle = ActorHandle(
        actor_id, self._modified_class.__module__, self._class_name,
        actor_cursor, self._actor_method_names, self._method_signatures,
        self._actor_method_num_return_vals, actor_cursor,
        self._actor_method_cpus, worker.task_driver_id)
    # Count the actor creation task as the first task on this handle.
    actor_handle._ray_actor_counter += 1

    return actor_handle
def _remote(self, args, kwargs, num_cpus=None, num_gpus=None,
            resources=None):
    """Create an actor and return a handle to it.

    Unlike the plain remote method, this accepts resource requirements that
    are combined with the defaults from the decorator.

    Args:
        args: The arguments to forward to the actor constructor.
        kwargs: The keyword arguments to forward to the actor constructor.
        num_cpus: The number of CPUs required by the actor creation task.
        num_gpus: The number of GPUs required by the actor creation task.
        resources: The custom resources required by the actor creation
            task.

    Returns:
        A handle to the newly created actor.

    Raises:
        Exception: If ``ray.init()`` has not been called, or if constructor
            arguments are given but the class has no ``__init__`` method.
    """
    worker = ray.worker.get_global_worker()
    if worker.mode is None:
        raise Exception("Actors cannot be created before ray.init() "
                        "has been called.")

    actor_id = ray.ObjectID(_random_string())
    # The cursor is a dummy object standing for the most recent method
    # invocation; each new invocation depends on it and then replaces it.
    actor_cursor = None

    if worker.mode == ray.LOCAL_MODE:
        # In LOCAL_MODE nothing is exported: the actor object is created
        # locally and stored in the worker's actor dictionary.
        worker.actors[actor_id] = self._modified_class.__new__(
            self._modified_class)
    else:
        # Export the actor class the first time it is used.
        if not self._exported:
            worker.function_actor_manager.export_actor_class(
                self._class_id, self._modified_class,
                self._actor_method_names, self._checkpoint_interval)
            self._exported = True

        creation_resources = ray.utils.resources_from_resource_arguments(
            self._num_cpus, self._num_gpus, self._resources, num_cpus,
            num_gpus, resources)

        # When actor methods need a CPU, placement requires one more CPU
        # than creation. An empty dict means placement resources are the
        # same as the creation resources.
        assert self._actor_method_cpus in [0, 1]
        placement_resources = {}
        if self._actor_method_cpus == 1:
            placement_resources = creation_resources.copy()
            placement_resources["CPU"] += 1

        creation_function_id = compute_actor_creation_function_id(
            self._class_id)
        [actor_cursor] = worker.submit_task(
            creation_function_id, [self._class_id],
            actor_creation_id=actor_id,
            num_return_vals=1,
            resources=creation_resources,
            placement_resources=placement_resources)

    # The counter starts at 1 because the actor creation task counts as the
    # first task.
    actor_counter = 1
    actor_handle = ActorHandle(
        actor_id, self._class_name, actor_cursor, actor_counter,
        self._actor_method_names, self._method_signatures,
        self._actor_method_num_return_vals, actor_cursor,
        self._actor_method_cpus, worker.task_driver_id)

    # Invoke the constructor as a remote method when there is one.
    if "__init__" in actor_handle._ray_actor_method_names:
        actor_handle.__init__.remote(*args, **kwargs)
    elif len(args) != 0 or len(kwargs) != 0:
        raise Exception("Arguments cannot be passed to the actor "
                        "constructor because this actor class has no "
                        "__init__ method.")

    return actor_handle
def generate_id(cls):
    """Return an eight-character identifier.

    The identifier is the first eight characters of ``binary_to_hex``
    applied to a fresh value from ``_random_string()``.
    """
    raw_value = _random_string()
    hex_value = binary_to_hex(raw_value)
    return hex_value[:8]
def make_actor(cls, resources, checkpoint_interval, actor_method_cpus):
    """Wrap ``cls`` in a subclass carrying the extra actor methods.

    The generated subclass adds methods for terminating the worker and for
    saving and restoring checkpoints, then takes over the module and name
    of ``cls``.

    Args:
        cls: The user class to derive the actor class from.
        resources: Passed through to ``ActorClass``.
        checkpoint_interval: Passed through to ``ActorClass``; must not
            be 0.
        actor_method_cpus: Passed through to ``ActorClass``.

    Returns:
        The ``ActorClass`` built from the generated subclass and a fresh
        class ID.

    Raises:
        Exception: If ``checkpoint_interval`` equals 0.
    """
    if checkpoint_interval == 0:
        raise Exception("checkpoint_interval must be greater than 0.")

    # Derive a class that carries the additional termination and
    # checkpointing methods.
    class Class(cls):
        def __ray_terminate__(self):
            """Disconnect from the local scheduler and exit the process."""
            # Disconnecting first means that when the worker kills itself
            # below, the local scheduler won't push an error message to
            # the driver.
            scheduler_client = ray.worker.global_worker.local_scheduler_client
            scheduler_client.disconnect()
            import os
            os._exit(0)

        def __ray_save_checkpoint__(self):
            """Pickle the actor state, via ``__ray_save__`` if defined."""
            state = (self.__ray_save__()
                     if hasattr(self, "__ray_save__") else self)
            return pickle.dumps(state)

        @classmethod
        def __ray_restore_from_checkpoint__(cls, pickled_checkpoint):
            """Rebuild an actor object from a pickled checkpoint."""
            state = pickle.loads(pickled_checkpoint)
            if not hasattr(cls, "__ray_restore__"):
                # TODO(rkn): It's possible that this will cause problems.
                # When you unpickle the same object twice, the two objects
                # will not have the same class.
                return state
            restored = cls.__new__(cls)
            restored.__ray_restore__(state)
            return restored

        def __ray_checkpoint__(self):
            """Save a checkpoint.

            Stores the current actor state, the current task frontier
            according to the local scheduler, and the checkpoint index
            (number of tasks executed so far).
            """
            worker = ray.worker.global_worker
            tasks_executed = worker.actor_task_counter
            # Serialize the state to save.
            pickled_state = self.__ray_save_checkpoint__()
            # Fetch the current task frontier, per actor handle.
            # NOTE(swang): This only includes actor handles that the local
            # scheduler has seen. Handle IDs for which no task has yet
            # reached the local scheduler will not be included, and may
            # not be runnable on checkpoint resumption.
            scheduler_actor_id = ray.local_scheduler.ObjectID(worker.actor_id)
            task_frontier = worker.local_scheduler_client.get_actor_frontier(
                scheduler_actor_id)
            # Save the checkpoint in Redis. TODO(rkn): Checkpoints should
            # not be stored in Redis. Fix this.
            set_actor_checkpoint(worker, worker.actor_id, tasks_executed,
                                 pickled_state, task_frontier)

        def __ray_checkpoint_restore__(self):
            """Restore a checkpoint.

            Looks for a saved checkpoint and, if one is found, restores
            the actor state, the task frontier in the local scheduler, and
            the checkpoint index (number of tasks executed so far).

            Returns:
                A bool indicating whether a checkpoint was resumed.
            """
            worker = ray.worker.global_worker
            # Fetch the most recent stored checkpoint, if any.
            saved_index, pickled_state, task_frontier = get_actor_checkpoint(
                worker, worker.actor_id)
            if saved_index is None:
                # Nothing stored, so there is nothing to resume from.
                return False

            # Load the actor state from the checkpoint.
            restore = worker.actor_class.__ray_restore_from_checkpoint__
            worker.actors[worker.actor_id] = restore(pickled_state)
            # Set the number of tasks executed so far.
            worker.actor_task_counter = saved_index
            # Set the actor frontier in the local scheduler.
            worker.local_scheduler_client.set_actor_frontier(task_frontier)
            return True

    # Present the generated class under the identity of the original one.
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    new_class_id = _random_string()
    return ActorClass(Class, new_class_id, checkpoint_interval, resources,
                      actor_method_cpus)
def _remote(self, args, kwargs, num_cpus=None, num_gpus=None, resources=None):
    """Create an actor.

    This method allows more flexibility than the remote method because
    resource requirements can be specified and override the defaults in the
    decorator.

    Args:
        args: The arguments to forward to the actor constructor. ``None``
            is treated as no positional arguments.
        kwargs: The keyword arguments to forward to the actor constructor.
            ``None`` is treated as no keyword arguments.
        num_cpus: The number of CPUs required by the actor creation task.
        num_gpus: The number of GPUs required by the actor creation task.
        resources: The custom resources required by the actor creation
            task.

    Returns:
        A handle to the newly created actor.

    Raises:
        Exception: If the global worker has no mode set, i.e. actors are
            being created before ray.init() has been called.
    """
    # Normalize missing constructor arguments up front so that every path
    # below can unpack them. The LOCAL_MODE branch unpacks them with
    # ``*``/``**`` and would otherwise fail on ``None``.
    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    worker = ray.worker.get_global_worker()
    if worker.mode is None:
        raise Exception("Actors cannot be created before ray.init() "
                        "has been called.")

    actor_id = ActorID(_random_string())
    # The actor cursor is a dummy object representing the most recent
    # actor method invocation. For each subsequent method invocation,
    # the current cursor should be added as a dependency, and then
    # updated to reflect the new invocation.
    actor_cursor = None

    # Do not export the actor class or the actor if run in LOCAL_MODE
    # Instead, instantiate the actor locally and add it to the worker's
    # dictionary
    if worker.mode == ray.LOCAL_MODE:
        # Deep copies keep the locally constructed actor from sharing
        # mutable argument objects with the caller.
        worker.actors[actor_id] = self._modified_class(
            *copy.deepcopy(args), **copy.deepcopy(kwargs))
    else:
        # Export the actor.
        if not self._exported:
            worker.function_actor_manager.export_actor_class(
                self._modified_class, self._actor_method_names)
            self._exported = True

    resources = ray.utils.resources_from_resource_arguments(
        self._num_cpus, self._num_gpus, self._resources, num_cpus,
        num_gpus, resources)

    # If the actor methods require CPU resources, then set the required
    # placement resources. If actor_placement_resources is empty, then
    # the required placement resources will be the same as resources.
    actor_placement_resources = {}
    assert self._actor_method_cpus in [0, 1]
    if self._actor_method_cpus == 1:
        actor_placement_resources = resources.copy()
        actor_placement_resources["CPU"] += 1

    # The creation task is the class's "__init__", with the constructor
    # arguments extended according to its recorded signature.
    function_name = "__init__"
    function_signature = self._method_signatures[function_name]
    creation_args = signature.extend_args(function_signature, args, kwargs)
    function_descriptor = FunctionDescriptor(
        self._modified_class.__module__, function_name,
        self._modified_class.__name__)
    [actor_cursor] = worker.submit_task(
        function_descriptor,
        creation_args,
        actor_creation_id=actor_id,
        max_actor_reconstructions=self._max_reconstructions,
        num_return_vals=1,
        resources=resources,
        placement_resources=actor_placement_resources)
    assert isinstance(actor_cursor, ObjectID)

    actor_handle = ActorHandle(
        actor_id, self._modified_class.__module__, self._class_name,
        actor_cursor, self._actor_method_names, self._method_signatures,
        self._actor_method_num_return_vals, actor_cursor,
        self._actor_method_cpus, worker.task_driver_id)
    # We increment the actor counter by 1 to account for the actor creation
    # task.
    actor_handle._ray_actor_counter += 1

    return actor_handle