Exemplo n.º 1
0
def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
               max_reconstructions):
    """Turn a user-defined class into an ``ActorClass``.

    The class is validated, subclassed so that it gains the
    ``__ray_terminate__`` and ``__ray_checkpoint__`` methods, and then passed
    to ``ActorClass`` together with a freshly generated class ID.

    Args:
        cls: The user-defined class to wrap.
        num_cpus: Forwarded unchanged to ``ActorClass``.
        num_gpus: Forwarded unchanged to ``ActorClass``.
        resources: Forwarded unchanged to ``ActorClass``.
        actor_method_cpus: Forwarded unchanged to ``ActorClass``.
        max_reconstructions: Reconstruction limit; ``None`` is treated as 0.

    Returns:
        The ``ActorClass`` built around the augmented subclass of ``cls``.

    Raises:
        TypeError: If ``cls`` is an old-style class, or if it derives from
            ``Checkpointable`` while still being abstract.
        Exception: If ``max_reconstructions`` lies outside
            ``[NO_RECONSTRUCTION, INFINITE_RECONSTRUCTION]``.
    """
    # Old-style (Python 2) classes do not derive from object.
    if not issubclass(cls, object):
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    # A checkpointable actor must provide every abstract method.
    if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
        raise TypeError(
            "A checkpointable actor class should implement all abstract "
            "methods in the `Checkpointable` interface.")

    if max_reconstructions is None:
        max_reconstructions = 0

    lowest = ray_constants.NO_RECONSTRUCTION
    highest = ray_constants.INFINITE_RECONSTRUCTION
    within_bounds = lowest <= max_reconstructions <= highest
    if not within_bounds:
        raise Exception("max_reconstructions must be in range [%d, %d]." %
                        (lowest, highest))

    # Subclass the user's class so that the extra worker-management methods
    # are available on every actor instance.
    class Class(cls):
        def __ray_terminate__(self):
            worker = ray.worker.get_global_worker()
            if worker.mode == ray.LOCAL_MODE:
                return
            # Disconnect from the local scheduler first, so that it does not
            # push an error message to the driver when the worker process
            # exits below.
            worker.raylet_client.disconnect()
            sys.exit(0)
            assert False, "This process should have terminated."

        def __ray_checkpoint__(self):
            """Save a checkpoint of this actor.

            Only actors implementing ``ray.actor.Checkpointable`` may be
            checkpointed; the actual work is delegated to the global
            worker's ``_save_actor_checkpoint``.
            """
            worker = ray.worker.global_worker
            if isinstance(self, ray.actor.Checkpointable):
                return worker._save_actor_checkpoint()
            raise Exception(
                "__ray_checkpoint__.remote() may only be called on actors "
                "that implement ray.actor.Checkpointable")

    # Make the subclass report the identity of the class it wraps.
    Class.__name__ = cls.__name__
    Class.__module__ = cls.__module__

    return ActorClass(Class, ActorClassID(_random_string()),
                      max_reconstructions, num_cpus, num_gpus, resources,
                      actor_method_cpus)
Exemplo n.º 2
0
def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
               max_reconstructions):
    """Turn a user-defined class into an ``ActorClass``.

    The class is validated, subclassed so that it gains the
    ``__ray_terminate__`` and ``__ray_checkpoint__`` methods, and then passed
    to ``ActorClass`` together with a freshly generated class ID.

    Args:
        cls: The user-defined class to wrap.
        num_cpus: Forwarded unchanged to ``ActorClass``.
        num_gpus: Forwarded unchanged to ``ActorClass``.
        resources: Forwarded unchanged to ``ActorClass``.
        actor_method_cpus: Forwarded unchanged to ``ActorClass``.
        max_reconstructions: Reconstruction limit; ``None`` is treated as 0.

    Returns:
        The ``ActorClass`` built around the augmented subclass of ``cls``.

    Raises:
        TypeError: If ``cls`` is an old-style class, or if it derives from
            ``Checkpointable`` while still being abstract.
        Exception: If ``max_reconstructions`` lies outside
            ``[NO_RECONSTRUCTION, INFINITE_RECONSTRUCTION]``.
    """
    # Old-style (Python 2) classes do not derive from object.
    if not issubclass(cls, object):
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    # A checkpointable actor must provide every abstract method.
    if issubclass(cls, Checkpointable) and inspect.isabstract(cls):
        raise TypeError(
            "A checkpointable actor class should implement all abstract "
            "methods in the `Checkpointable` interface.")

    if max_reconstructions is None:
        max_reconstructions = 0

    lowest = ray_constants.NO_RECONSTRUCTION
    highest = ray_constants.INFINITE_RECONSTRUCTION
    within_bounds = lowest <= max_reconstructions <= highest
    if not within_bounds:
        raise Exception("max_reconstructions must be in range [%d, %d]." %
                        (lowest, highest))

    # Subclass the user's class so that the extra worker-management methods
    # are available on every actor instance.
    class Class(cls):
        def __ray_terminate__(self):
            worker = ray.worker.get_global_worker()
            if worker.mode == ray.LOCAL_MODE:
                return
            # Disconnect from the local scheduler first, so that it does not
            # push an error message to the driver when the worker process
            # exits below.
            worker.raylet_client.disconnect()
            sys.exit(0)
            assert False, "This process should have terminated."

        def __ray_checkpoint__(self):
            """Save a checkpoint of this actor.

            Only actors implementing ``ray.actor.Checkpointable`` may be
            checkpointed; the actual work is delegated to the global
            worker's ``_save_actor_checkpoint``.
            """
            worker = ray.worker.global_worker
            if isinstance(self, ray.actor.Checkpointable):
                return worker._save_actor_checkpoint()
            raise Exception(
                "__ray_checkpoint__.remote() may only be called on actors "
                "that implement ray.actor.Checkpointable")

    # Make the subclass report the identity of the class it wraps.
    Class.__name__ = cls.__name__
    Class.__module__ = cls.__module__

    return ActorClass(Class, ActorClassID(_random_string()),
                      max_reconstructions, num_cpus, num_gpus, resources,
                      actor_method_cpus)
Exemplo n.º 3
0
    def _serialization_helper(self, ray_forking):
        """Collect the state needed to rebuild this actor handle.

        Besides returning the state, this call records a new handle ID in
        ``self._ray_new_actor_handles`` and, when forking, increments
        ``self._ray_actor_forks``.

        Args:
            ray_forking: True when Ray itself is forking the actor handle,
                False when the handle is being pickled.

        Returns:
            A dictionary of the information needed to reconstruct the object.
        """
        handle_id = self._ray_actor_handle_id
        if ray_forking:
            # A forked handle gets an ID computed from this handle's ID and
            # the number of forks made so far.
            handle_id = compute_actor_handle_id(handle_id,
                                                self._ray_actor_forks)

        # Note: the actor cursor and the actor creation dummy object ID may
        # both be None (actors in local mode don't have dummy objects).
        state = {
            "actor_id": self._ray_actor_id,
            "actor_handle_id": handle_id,
            "module_name": self._ray_module_name,
            "class_name": self._ray_class_name,
            "actor_cursor": self._ray_actor_cursor,
            "actor_method_names": self._ray_actor_method_names,
            "method_decorators": self._ray_method_decorators,
            "method_signatures": self._ray_method_signatures,
            "method_num_return_vals": self._ray_method_num_return_vals,
            "actor_creation_dummy_object_id": (
                self._ray_actor_creation_dummy_object_id),
            "actor_method_cpus": self._ray_actor_method_cpus,
            "actor_driver_id": self._ray_actor_driver_id,
            "ray_forking": ray_forking,
        }

        if not ray_forking:
            # A pickled handle may be unpickled and submit a dependent task
            # at any time, so its execution dependency is never safe to
            # release. Register a random handle ID that will never actually
            # be used.
            registered_handle_id = ActorHandleID(_random_string())
        else:
            self._ray_actor_forks += 1
            registered_handle_id = handle_id

        # Tell the backend to expect this new actor handle. The backend will
        # not release the cursor for any new handles until the first task for
        # each of the new handles is submitted.
        # NOTE(swang): There is currently no garbage collection for actor
        # handles until the actor itself is removed.
        self._ray_new_actor_handles.append(registered_handle_id)

        return state
Exemplo n.º 4
0
    def _serialization_helper(self, ray_forking):
        """Collect the state needed to rebuild this actor handle.

        Besides returning the state, this call records a new handle ID in
        ``self._ray_new_actor_handles`` and, when forking, increments
        ``self._ray_actor_forks``.

        Args:
            ray_forking: True when Ray itself is forking the actor handle,
                False when the handle is being pickled.

        Returns:
            A dictionary of the information needed to reconstruct the object.
        """
        handle_id = self._ray_actor_handle_id
        if ray_forking:
            # A forked handle gets an ID computed from this handle's ID and
            # the number of forks made so far.
            handle_id = compute_actor_handle_id(handle_id,
                                                self._ray_actor_forks)

        # Note: the actor cursor and the actor creation dummy object ID may
        # both be None (actors in local mode don't have dummy objects).
        state = {
            "actor_id": self._ray_actor_id,
            "actor_handle_id": handle_id,
            "module_name": self._ray_module_name,
            "class_name": self._ray_class_name,
            "actor_cursor": self._ray_actor_cursor,
            "actor_method_names": self._ray_actor_method_names,
            "method_signatures": self._ray_method_signatures,
            "method_num_return_vals": self._ray_method_num_return_vals,
            "actor_creation_dummy_object_id": (
                self._ray_actor_creation_dummy_object_id),
            "actor_method_cpus": self._ray_actor_method_cpus,
            "actor_driver_id": self._ray_actor_driver_id,
            "ray_forking": ray_forking,
        }

        if not ray_forking:
            # A pickled handle may be unpickled and submit a dependent task
            # at any time, so its execution dependency is never safe to
            # release. Register a random handle ID that will never actually
            # be used.
            registered_handle_id = ActorHandleID(_random_string())
        else:
            self._ray_actor_forks += 1
            registered_handle_id = handle_id

        # Tell the backend to expect this new actor handle. The backend will
        # not release the cursor for any new handles until the first task for
        # each of the new handles is submitted.
        # NOTE(swang): There is currently no garbage collection for actor
        # handles until the actor itself is removed.
        self._ray_new_actor_handles.append(registered_handle_id)

        return state
Exemplo n.º 5
0
def test_raylet_crash_when_get(ray_start_regular):
    """Check that ``ray.get`` raises after the raylet has been killed.

    A background thread kills the raylet after a short delay while the main
    thread is inside ``ray.get`` on a freshly generated random object ID.
    """

    def kill_raylet_after_delay():
        # Wait first, so the raylet is not killed before the default workers
        # get connected.
        time.sleep(2)
        ray.worker._global_node.kill_raylet()

    missing_object_id = ray.ObjectID(_random_string())

    killer = threading.Thread(target=kill_raylet_after_delay)
    killer.start()
    with pytest.raises(Exception, match=r".*Connection closed unexpectedly.*"):
        ray.get(missing_object_id)
    killer.join()
Exemplo n.º 6
0
def test_raylet_crash_when_get(ray_start_regular):
    """Check that ``ray.get`` raises after the raylet has been killed.

    A background thread kills the first registered raylet process after a
    short delay while the main thread is inside ``ray.get`` on a freshly
    generated random object ID.
    """

    def kill_raylet_after_delay():
        # Wait first, so the raylet is not killed before the default workers
        # get connected.
        time.sleep(2)
        raylets = ray.services.all_processes[ray.services.PROCESS_TYPE_RAYLET]
        raylets[0].kill()

    missing_object_id = ray.ObjectID(_random_string())

    killer = threading.Thread(target=kill_raylet_after_delay)
    killer.start()
    with pytest.raises(Exception, match=r".*raylet client may be closed.*"):
        ray.get(missing_object_id)
    killer.join()
Exemplo n.º 7
0
def test_raylet_crash_when_get(ray_start_regular):
    """Check that ``ray.get`` raises after the raylet has been killed.

    A background thread kills the first registered raylet process after a
    short delay while the main thread is inside ``ray.get`` on a freshly
    generated random object ID.
    """

    def kill_raylet_after_delay():
        # Wait first, so the raylet is not killed before the default workers
        # get connected.
        time.sleep(2)
        raylets = ray.services.all_processes[ray.services.PROCESS_TYPE_RAYLET]
        raylets[0].kill()

    missing_object_id = ray.ObjectID(_random_string())

    killer = threading.Thread(target=kill_raylet_after_delay)
    killer.start()
    with pytest.raises(Exception, match=r".*raylet client may be closed.*"):
        ray.get(missing_object_id)
    killer.join()
Exemplo n.º 8
0
def test_raylet_crash_when_get(ray_start_regular):
    """Check that ``ray.get`` raises after the raylet has been killed.

    A background thread kills the raylet after a short delay while the main
    thread is inside ``ray.get`` on a freshly generated random object ID.
    """

    def kill_raylet_after_delay():
        # Wait first, so the raylet is not killed before the default workers
        # get connected.
        time.sleep(2)
        ray.worker._global_node.kill_raylet()

    missing_object_id = ray.ObjectID(_random_string())

    killer = threading.Thread(target=kill_raylet_after_delay)
    killer.start()
    with pytest.raises(Exception, match=r".*Connection closed unexpectedly.*"):
        ray.get(missing_object_id)
    killer.join()
Exemplo n.º 9
0
    def register_custom_serializer(self,
                                   cls,
                                   serializer,
                                   deserializer,
                                   local=False,
                                   job_id=None,
                                   class_id=None):
        """Enable serialization and deserialization for a particular class.

        This method runs the register_class_for_serialization function
        defined below either on every worker or, when ``local`` is true, only
        on the current worker. That function registers the given
        serializer/deserializer pair for ``cls`` in the worker's
        serialization context for ``job_id``.

        Args:
            cls (type): The class that ray should use this custom serializer
                for.
            serializer: The custom serializer to use. Must not be None.
            deserializer: The custom deserializer to use. Must not be None.
            local: True if the serializers should only be registered on the
                current worker. This should usually be False.
            job_id: ID of the job that we want to register the class for.
                Defaults to the current job of this object's worker.
            class_id (str): Unique ID of the class. Autogenerated if None.

        Raises:
            AssertionError: Raised if serializer or deserializer is None, or
                if job_id is not a JobID.
            ValueError: Raised if ray could not autogenerate a class_id.
        """
        assert serializer is not None and deserializer is not None, (
            "Must provide serializer and deserializer.")

        # NOTE: in this version class_id is still generated (and a failure to
        # generate it is still reported), but it is not passed on to the
        # cloudpickle registration below.
        if class_id is None:
            if not local:
                # In this case, the class ID will be used to deduplicate the
                # class across workers. Note that cloudpickle unfortunately
                # does not produce deterministic strings, so these IDs could
                # be different on different workers. We could use something
                # weaker like cls.__name__, however that would run the risk
                # of having collisions.
                # TODO(rkn): We should improve this.
                try:
                    # Attempt to produce a class ID that will be the same on
                    # each worker. However, determinism is not guaranteed,
                    # and the result may be different on different workers.
                    class_id = _try_to_compute_deterministic_class_id(cls)
                except Exception as err:
                    # Keep the original failure attached as the cause so the
                    # root problem stays visible in the traceback.
                    raise ValueError(
                        "Failed to use pickle in generating a unique id "
                        f"for '{cls}'. Provide a unique class_id.") from err
            else:
                # In this case, the class ID only needs to be meaningful on
                # this worker and not across workers.
                class_id = _random_string()

            # Make sure class_id is a string.
            class_id = ray.utils.binary_to_hex(class_id)

        if job_id is None:
            job_id = self.worker.current_job_id
        assert isinstance(job_id, JobID)

        def register_class_for_serialization(worker_info):
            context = worker_info["worker"].get_serialization_context(job_id)
            context._register_cloudpickle_serializer(cls, serializer,
                                                     deserializer)

        if not local:
            self.worker.run_function_on_all_workers(
                register_class_for_serialization)
        else:
            # Since we are pickling objects of this class, we don't actually
            # need to ship the class definition.
            register_class_for_serialization({"worker": self.worker})
Exemplo n.º 10
0
def test_driver_exiting_when_worker_blocked(call_ray_start):
    """Check the cluster survives drivers that exit with blocked tasks.

    Four kinds of driver scripts are run, three times each. Every script
    submits work that cannot finish while the driver is alive and then exits
    without waiting for it:

    1. a task blocked in ``ray.get`` on a task that sleeps forever,
    2. a task blocked in ``ray.wait`` on a task that sleeps forever,
    3. a task whose argument is a nonexistent object,
    4. a task blocked in ``ray.wait`` on a nonexistent object.

    Finally a trivial task is run from this process to confirm the raylet is
    still reachable.

    Args:
        call_ray_start: Fixture value used as the Redis address of the
            running cluster.
    """
    redis_address = call_ray_start

    ray.init(redis_address=redis_address)

    def run_drivers(script, unblock_id_bytes=None):
        # Run the driver script a few times, let each driver exit, and check
        # that it ran to completion.
        for _ in range(3):
            out = run_string_as_driver(script)
            if unblock_id_bytes is not None:
                # Simulate the nonexistent dependency becoming available.
                ray.worker.global_worker.put_object(
                    ray.ObjectID(unblock_id_bytes), None)
            # Make sure the driver ran to completion.
            assert "success" in out

    # Define a driver that creates two tasks, one that runs forever and the
    # other blocked on the first in a `ray.get`.
    driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def f():
    time.sleep(10**6)
@ray.remote
def g():
    ray.get(f.remote())
g.remote()
time.sleep(1)
print("success")
""".format(redis_address)

    run_drivers(driver_script)

    # Define a driver that creates two tasks, one that runs forever and the
    # other blocked on the first in a `ray.wait`.
    driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def f():
    time.sleep(10**6)
@ray.remote
def g():
    ray.wait([f.remote()])
g.remote()
time.sleep(1)
print("success")
""".format(redis_address)

    run_drivers(driver_script)

    nonexistent_id_bytes = _random_string()
    nonexistent_id_hex = ray.utils.binary_to_hex(nonexistent_id_bytes)
    # Define a driver that creates one task that depends on a nonexistent
    # object. This task will be queued as waiting to execute.
    driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def g(x):
    return
g.remote(ray.ObjectID(ray.utils.hex_to_binary("{}")))
time.sleep(1)
print("success")
""".format(redis_address, nonexistent_id_hex)

    run_drivers(driver_script, unblock_id_bytes=nonexistent_id_bytes)

    nonexistent_id_bytes = _random_string()
    nonexistent_id_hex = ray.utils.binary_to_hex(nonexistent_id_bytes)
    # Define a driver that calls `ray.wait` on a nonexistent object. The
    # object ID is wrapped in a list, like in the `ray.wait` driver above, so
    # that the task really blocks in `ray.wait` instead of being handed a
    # bare ObjectID.
    driver_script = """
import time
import ray
ray.init(redis_address="{}")
@ray.remote
def g():
    ray.wait([ray.ObjectID(ray.utils.hex_to_binary("{}"))])
g.remote()
time.sleep(1)
print("success")
""".format(redis_address, nonexistent_id_hex)

    run_drivers(driver_script, unblock_id_bytes=nonexistent_id_bytes)

    @ray.remote
    def f():
        return 1

    # Make sure we can still talk with the raylet.
    ray.get(f.remote())
Exemplo n.º 11
0
    def register_custom_serializer(self,
                                   cls,
                                   use_pickle=False,
                                   use_dict=False,
                                   serializer=None,
                                   deserializer=None,
                                   local=False,
                                   job_id=None,
                                   class_id=None):
        """Enable serialization and deserialization for a particular class.

        This method runs the register_class_for_serialization function
        defined below either on every worker or, when ``local`` is true, only
        on the current worker, which will enable ray to properly serialize
        and deserialize objects of this class.

        Args:
            cls (type): The class that ray should use this custom serializer
                for.
            use_pickle (bool): If true, then objects of this class will be
                serialized using pickle.
            use_dict: If true, then objects of this class will be serialized
                by turning their __dict__ fields into a dictionary. Must be
                False if use_pickle is true.
            serializer: The custom serializer to use. This should be provided
                if and only if use_pickle and use_dict are False.
            deserializer: The custom deserializer to use. This should be
                provided if and only if use_pickle and use_dict are False.
            local: True if the serializers should only be registered on the
                current worker. This should usually be False.
            job_id: ID of the job that we want to register the class for.
                Defaults to the current job of this object's worker.
            class_id (str): Unique ID of the class. Autogenerated if None.

        Raises:
            AssertionError: Raised if the combination of use_pickle, use_dict
                and serializer/deserializer is invalid, or if job_id is not a
                JobID.
            RayNotDictionarySerializable: Raised if use_dict is true and cls
                cannot be efficiently serialized by Ray.
            ValueError: Raised if ray could not autogenerate a class_id.
        """
        assert (serializer is None) == (deserializer is None), (
            "The serializer/deserializer arguments must both be provided or "
            "both not be provided.")
        use_custom_serializer = (serializer is not None)

        # Booleans add up as 0/1, so the sum counts the selected modes.
        assert use_custom_serializer + use_pickle + use_dict == 1, (
            "Exactly one of use_pickle, use_dict, or serializer/deserializer "
            "must be specified.")

        if self.worker.use_pickle and serializer is None:
            # In this case it should do nothing.
            return

        if use_dict:
            # Raise an exception if cls cannot be serialized
            # efficiently by Ray.
            check_serializable(cls)

        if class_id is None:
            if not local:
                # In this case, the class ID will be used to deduplicate the
                # class across workers. Note that cloudpickle unfortunately
                # does not produce deterministic strings, so these IDs could
                # be different on different workers. We could use something
                # weaker like cls.__name__, however that would run the risk
                # of having collisions.
                # TODO(rkn): We should improve this.
                try:
                    # Attempt to produce a class ID that will be the same on
                    # each worker. However, determinism is not guaranteed,
                    # and the result may be different on different workers.
                    class_id = _try_to_compute_deterministic_class_id(cls)
                except Exception:
                    raise ValueError(
                        "Failed to use pickle in generating a unique id "
                        "for '{}'. Provide a unique class_id.".format(cls))
            else:
                # In this case, the class ID only needs to be meaningful on
                # this worker and not across workers.
                class_id = _random_string()

            # Make sure class_id is a string.
            class_id = ray.utils.binary_to_hex(class_id)

        if job_id is None:
            job_id = self.worker.current_job_id
        assert isinstance(job_id, JobID)

        def register_class_for_serialization(worker_info):
            context = worker_info["worker"].get_serialization_context(job_id)
            if worker_info["worker"].use_pickle:
                context._register_cloudpickle_serializer(
                    cls, serializer, deserializer)
            else:
                # TODO(rkn): We need to be more thoughtful about what to do if
                # custom serializers have already been registered for
                # class_id. In some cases, we may want to use the last
                # user-defined serializers and ignore subsequent calls to
                # register_custom_serializer that were made by the system.
                context.pyarrow_context.register_type(
                    cls,
                    class_id,
                    pickle=use_pickle,
                    custom_serializer=serializer,
                    custom_deserializer=deserializer)

        if not local:
            self.worker.run_function_on_all_workers(
                register_class_for_serialization)
        else:
            # Since we are pickling objects of this class, we don't actually
            # need to ship the class definition.
            register_class_for_serialization({"worker": self.worker})
Exemplo n.º 12
0
    def _submit(self,
                args,
                kwargs,
                num_cpus=None,
                num_gpus=None,
                resources=None):
        """Create an actor and return a handle to it.

        Args:
            args: The arguments to forward to the actor constructor.
            kwargs: The keyword arguments to forward to the actor constructor.
            num_cpus: The number of CPUs required by the actor creation task.
            num_gpus: The number of GPUs required by the actor creation task.
            resources: The custom resources required by the actor creation
                task.

        NOTE(review): num_cpus, num_gpus and resources are accepted but not
        read anywhere in this body; the actor is exported with
        ``self._actor_creation_resources``. Confirm whether that is intended.

        Returns:
            A handle to the newly created actor.

        Raises:
            Exception: If the global worker's mode is None (``ray.init()``
                has not been called), or if arguments are supplied for an
                actor class without an ``__init__`` method.
        """
        worker = ray.worker.global_worker
        if worker.mode is None:
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

        actor_id = ray.local_scheduler.ObjectID(_random_string())

        def is_actor_method(member):
            return (inspect.isfunction(member) or inspect.ismethod(member)
                    or is_cython(member))

        actor_methods = inspect.getmembers(self._modified_class,
                                           predicate=is_actor_method)

        # Gather, per method, its name, its signature (used to catch calls
        # made with inappropriate arguments) and its number of return values.
        method_signatures = {}
        actor_method_names = []
        actor_method_num_return_vals = []
        for method_name, method in actor_methods:
            # Only warn about unsupported signatures instead of raising: if
            # the actor inherits such a method from another class, there may
            # not be much the user can do about it.
            signature.check_signature_supported(method, warn=True)
            method_signatures[method_name] = signature.extract_signature(
                method, ignore_first=True)
            actor_method_names.append(method_name)
            actor_method_num_return_vals.append(
                method.__ray_num_return_vals__
                if hasattr(method, "__ray_num_return_vals__") else 1)

        # The actor cursor is a dummy object representing the most recent
        # actor method invocation. For each subsequent method invocation,
        # the current cursor should be added as a dependency, and then
        # updated to reflect the new invocation.
        actor_cursor = None
        if worker.mode != ray.PYTHON_MODE:
            # Export the actor class once, then the actor itself.
            if not self._exported:
                export_actor_class(self._class_id, self._modified_class,
                                   actor_method_names,
                                   actor_method_num_return_vals,
                                   self._checkpoint_interval, worker)
                self._exported = True
            actor_cursor = export_actor(actor_id, self._class_id,
                                        self._class_name, actor_method_names,
                                        actor_method_num_return_vals,
                                        self._actor_creation_resources,
                                        self._actor_method_cpus, worker)
        else:
            # In PYTHON_MODE nothing is exported; the actor is instantiated
            # locally and stored in the worker's dictionary of actors.
            worker.actors[actor_id] = self._modified_class.__new__(
                self._modified_class)

        # The actor counter starts at 1 to account for the actor creation
        # task.
        actor_counter = 1
        actor_handle = ActorHandle(actor_id, self._class_name, actor_cursor,
                                   actor_counter, actor_method_names,
                                   actor_method_num_return_vals,
                                   method_signatures, actor_cursor,
                                   self._actor_method_cpus,
                                   worker.task_driver_id)

        # Call __init__ as a remote function when the class defines one.
        if "__init__" in actor_handle._ray_actor_method_names:
            actor_handle.__init__.remote(*args, **kwargs)
        elif not (len(args) == 0 and len(kwargs) == 0):
            raise Exception("Arguments cannot be passed to the actor "
                            "constructor because this actor class has no "
                            "__init__ method.")

        return actor_handle
Exemplo n.º 13
0
        def remote(cls, *args, **kwargs):
            """Create an actor and return the handle object for it.

            Args:
                *args: Positional arguments forwarded to the actor's
                    ``__init__`` method call.
                **kwargs: Keyword arguments forwarded to the actor's
                    ``__init__`` method call.

            Returns:
                A new instance of ``cls`` set up through ``_manual_init``.

            Raises:
                Exception: If the global worker's mode is None, i.e.
                    ``ray.init()`` has not been called.
            """
            worker = ray.worker.global_worker
            if worker.mode is None:
                raise Exception("Actors cannot be created before ray.init() "
                                "has been called.")

            actor_id = ray.local_scheduler.ObjectID(_random_string())
            # The ID for this instance of ActorHandle. These should be unique
            # across instances with the same _ray_actor_id.
            actor_handle_id = ray.local_scheduler.ObjectID(
                ray.worker.NIL_ACTOR_ID)
            # The number of actor method invocations that we've called so far.
            actor_counter = 0

            def is_actor_method(member):
                return (inspect.isfunction(member)
                        or inspect.ismethod(member) or is_cython(member))

            actor_methods = inspect.getmembers(Class,
                                               predicate=is_actor_method)

            # Gather, per method, its name, its signature (used to catch
            # calls made with inappropriate arguments) and its number of
            # return values.
            method_signatures = {}
            actor_method_names = []
            actor_method_num_return_vals = []
            for method_name, method in actor_methods:
                # Only warn about unsupported signatures instead of raising:
                # if the actor inherits such a method from another class,
                # there may not be much the user can do about it.
                signature.check_signature_supported(method, warn=True)
                method_signatures[method_name] = signature.extract_signature(
                    method, ignore_first=True)
                actor_method_names.append(method_name)
                actor_method_num_return_vals.append(
                    method.__ray_num_return_vals__
                    if hasattr(method, "__ray_num_return_vals__") else 1)

            # The actor cursor is a dummy object representing the most recent
            # actor method invocation. For each subsequent method invocation,
            # the current cursor should be added as a dependency, and then
            # updated to reflect the new invocation.
            actor_cursor = None
            if worker.mode != ray.PYTHON_MODE:
                # Export the actor class once, then the actor itself.
                if not exported:
                    export_actor_class(class_id, Class, actor_method_names,
                                       actor_method_num_return_vals,
                                       checkpoint_interval, worker)
                    exported.append(0)
                actor_cursor = export_actor(actor_id, class_id, class_name,
                                            actor_method_names,
                                            actor_method_num_return_vals,
                                            actor_creation_resources,
                                            actor_method_cpus, worker)
            else:
                # In PYTHON_MODE nothing is exported; the actor is
                # instantiated locally and stored in the worker's dictionary
                # of actors.
                worker.actors[actor_id] = Class.__new__(Class)

            # Instantiate the actor handle.
            actor_object = cls.__new__(cls)
            actor_object._manual_init(actor_id, class_id, actor_handle_id,
                                      actor_cursor, actor_counter,
                                      actor_method_names,
                                      actor_method_num_return_vals,
                                      method_signatures, checkpoint_interval,
                                      actor_cursor, actor_creation_resources,
                                      actor_method_cpus)

            # Call __init__ as a remote function when the class defines one.
            if "__init__" not in actor_object._ray_actor_method_names:
                print("WARNING: this object has no __init__ method.")
            else:
                actor_object._actor_method_call("__init__",
                                                args=args,
                                                kwargs=kwargs,
                                                dependency=actor_cursor)

            return actor_object
Exemplo n.º 14
0
def make_actor(cls, num_cpus, num_gpus, resources, actor_method_cpus,
               checkpoint_interval, max_reconstructions):
    """Wrap a user class into an ``ActorClass``.

    A subclass of ``cls`` is generated that adds the worker-termination and
    checkpoint save/restore hooks. That subclass is handed to ``ActorClass``
    together with a freshly generated class ID.

    Args:
        cls: The user-defined class to turn into an actor class.
        num_cpus: Forwarded unchanged to ``ActorClass``.
        num_gpus: Forwarded unchanged to ``ActorClass``.
        resources: Forwarded unchanged to ``ActorClass``.
        actor_method_cpus: Forwarded unchanged to ``ActorClass``.
        checkpoint_interval: ``None`` is replaced by -1; 0 is rejected.
        max_reconstructions: ``None`` is replaced by 0. The value must lie
            between ``ray_constants.NO_RECONSTRUCTION`` and
            ``ray_constants.INFINITE_RECONSTRUCTION`` (inclusive).

    Returns:
        The ``ActorClass`` built around the generated subclass.

    Raises:
        TypeError: If ``cls`` is an old-style class.
        Exception: If ``checkpoint_interval`` is 0 or
            ``max_reconstructions`` is out of range.
    """
    # Old-style (Python 2) classes are not supported.
    if not issubclass(cls, object):
        raise TypeError(
            "The @ray.remote decorator cannot be applied to old-style "
            "classes. In Python 2, you must declare the class with "
            "'class ClassName(object):' instead of 'class ClassName:'.")

    # Substitute defaults for options that were left unspecified.
    checkpoint_interval = (-1 if checkpoint_interval is None else
                           checkpoint_interval)
    max_reconstructions = (0 if max_reconstructions is None else
                           max_reconstructions)

    if checkpoint_interval == 0:
        raise Exception("checkpoint_interval must be greater than 0.")

    lower_bound = ray_constants.NO_RECONSTRUCTION
    upper_bound = ray_constants.INFINITE_RECONSTRUCTION
    within_bounds = lower_bound <= max_reconstructions <= upper_bound
    if not within_bounds:
        raise Exception("max_reconstructions must be in range [%d, %d]." %
                        (lower_bound, upper_bound))

    # Subclass the user class so the extra Ray-internal methods can be
    # attached without modifying the original class.
    class Class(cls):
        # Terminate the worker process that hosts this actor.
        def __ray_terminate__(self):
            current_worker = ray.worker.get_global_worker()
            if current_worker.mode == ray.LOCAL_MODE:
                return
            # Disconnect from the local scheduler before exiting so that it
            # does not push an error message to the driver when this worker
            # kills itself.
            current_worker.raylet_client.disconnect()
            sys.exit(0)
            assert False, "This process should have terminated."

        # Pickle the actor state, using the user's __ray_save__ hook when the
        # actor defines one and the actor object itself otherwise.
        def __ray_save_checkpoint__(self):
            if not hasattr(self, "__ray_save__"):
                return pickle.dumps(self)
            return pickle.dumps(self.__ray_save__())

        # Rebuild an actor object from the bytes produced by
        # __ray_save_checkpoint__.
        @classmethod
        def __ray_restore_from_checkpoint__(cls, pickled_checkpoint):
            restored_state = pickle.loads(pickled_checkpoint)
            if not hasattr(cls, "__ray_restore__"):
                # TODO(rkn): It's possible that this will cause problems. When
                # you unpickle the same object twice, the two objects will not
                # have the same class.
                return restored_state
            # Bypass __init__ and let the user hook populate the instance.
            instance = cls.__new__(cls)
            instance.__ray_restore__(restored_state)
            return instance

        def __ray_checkpoint__(self):
            """Take a checkpoint of this actor.

            Stores the serialized actor state, the task frontier reported by
            the local scheduler, and the checkpoint index (the number of
            tasks executed so far).
            """
            current_worker = ray.worker.global_worker
            tasks_executed = current_worker.actor_task_counter
            # Serialize the actor state.
            serialized_state = self.__ray_save_checkpoint__()
            # Fetch the current task frontier, per actor handle.
            # NOTE(swang): This only includes actor handles that the local
            # scheduler has seen. Handle IDs for which no task has yet reached
            # the local scheduler will not be included, and may not be runnable
            # on checkpoint resumption.
            task_frontier = current_worker.raylet_client.get_actor_frontier(
                current_worker.actor_id)
            # Save the checkpoint in Redis. TODO(rkn): Checkpoints
            # should not be stored in Redis. Fix this.
            set_actor_checkpoint(current_worker, current_worker.actor_id,
                                 tasks_executed, serialized_state,
                                 task_frontier)

        def __ray_checkpoint_restore__(self):
            """Resume from the most recent checkpoint, if there is one.

            When a checkpoint is found, the actor state, the checkpoint
            index (number of tasks executed so far) and the task frontier in
            the local scheduler are all restored from it.

            Returns:
                A bool indicating whether a checkpoint was resumed.
            """
            current_worker = ray.worker.global_worker
            # Look up the latest stored checkpoint, if any.
            saved_index, saved_state, saved_frontier = get_actor_checkpoint(
                current_worker, current_worker.actor_id)
            if saved_index is None:
                return False

            # Replace the live actor object with the restored one.
            current_worker.actors[current_worker.actor_id] = (
                current_worker.actor_class.__ray_restore_from_checkpoint__(
                    saved_state))
            # Restore the number of tasks executed so far.
            current_worker.actor_task_counter = saved_index
            # Restore the actor frontier in the local scheduler.
            current_worker.raylet_client.set_actor_frontier(saved_frontier)
            return True

    # Make the generated subclass carry the user class's module and name.
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    new_class_id = ActorClassID(_random_string())

    return ActorClass(Class, new_class_id, checkpoint_interval,
                      max_reconstructions, num_cpus, num_gpus, resources,
                      actor_method_cpus)
Exemplo n.º 15
0
    def _remote(self,
                args=None,
                kwargs=None,
                num_cpus=None,
                num_gpus=None,
                resources=None):
        """Instantiate the actor and return a handle to it.

        Compared with the plain remote call, this entry point lets the
        caller override the resource requirements given in the decorator.

        Args:
            args: Positional arguments for the actor constructor; ``None``
                means no positional arguments.
            kwargs: Keyword arguments for the actor constructor; ``None``
                means no keyword arguments.
            num_cpus: The number of CPUs required by the actor creation task.
            num_gpus: The number of GPUs required by the actor creation task.
            resources: The custom resources required by the actor creation
                task.

        Returns:
            A handle to the newly created actor.

        Raises:
            Exception: If the global worker's mode is ``None`` (ray.init()
                has not been called).
        """
        ctor_args = [] if args is None else args
        ctor_kwargs = {} if kwargs is None else kwargs

        worker = ray.worker.get_global_worker()
        if worker.mode is None:
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

        actor_id = ActorID(_random_string())
        # The cursor stands for the most recent method invocation on the
        # actor. Every later invocation depends on the current cursor and then
        # replaces it. It stays None when nothing is submitted (LOCAL_MODE).
        actor_cursor = None
        modified_class = self._modified_class

        if worker.mode == ray.LOCAL_MODE:
            # Nothing is exported in LOCAL_MODE: build the actor in-process
            # from copies of the arguments and register it with the worker.
            worker.actors[actor_id] = modified_class(
                *copy.deepcopy(ctor_args), **copy.deepcopy(ctor_kwargs))
        else:
            # Export the actor class the first time it is instantiated.
            if not self._exported:
                worker.function_actor_manager.export_actor_class(
                    modified_class, self._actor_method_names)
                self._exported = True

            resources = ray.utils.resources_from_resource_arguments(
                self._num_cpus, self._num_gpus, self._resources, num_cpus,
                num_gpus, resources)

            # When actor methods need a CPU, placement requires one CPU more
            # than creation does. An empty dict means "same as resources".
            method_cpus = self._actor_method_cpus
            assert method_cpus in [0, 1]
            if method_cpus == 1:
                placement_resources = resources.copy()
                placement_resources["CPU"] += 1
            else:
                placement_resources = {}

            # The creation task is the class's __init__.
            init_name = "__init__"
            creation_args = signature.extend_args(
                self._method_signatures[init_name], ctor_args, ctor_kwargs)
            descriptor = FunctionDescriptor(modified_class.__module__,
                                            init_name,
                                            modified_class.__name__)
            (actor_cursor,) = worker.submit_task(
                descriptor,
                creation_args,
                actor_creation_id=actor_id,
                max_actor_reconstructions=self._max_reconstructions,
                num_return_vals=1,
                resources=resources,
                placement_resources=placement_resources)
            assert isinstance(actor_cursor, ObjectID)

        handle = ActorHandle(
            actor_id, modified_class.__module__, self._class_name,
            actor_cursor, self._actor_method_names, self._method_signatures,
            self._actor_method_num_return_vals, actor_cursor,
            self._actor_method_cpus, worker.task_driver_id)
        # Count the actor creation task itself.
        handle._ray_actor_counter += 1

        return handle
Exemplo n.º 16
0
    def _remote(self,
                args,
                kwargs,
                num_cpus=None,
                num_gpus=None,
                resources=None):
        """Instantiate the actor and return a handle to it.

        Compared with the plain remote call, this entry point lets the
        caller override the resource requirements given in the decorator.

        Args:
            args: The arguments to forward to the actor constructor.
            kwargs: The keyword arguments to forward to the actor constructor.
            num_cpus: The number of CPUs required by the actor creation task.
            num_gpus: The number of GPUs required by the actor creation task.
            resources: The custom resources required by the actor creation
                task.

        Returns:
            A handle to the newly created actor.

        Raises:
            Exception: If the global worker's mode is ``None`` (ray.init()
                has not been called), or if constructor arguments are given
                for a class without an ``__init__`` method.
        """
        worker = ray.worker.get_global_worker()
        if worker.mode is None:
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

        actor_id = ray.ObjectID(_random_string())
        # The cursor stands for the most recent method invocation on the
        # actor. Every later invocation depends on the current cursor and then
        # replaces it. It stays None when nothing is submitted (LOCAL_MODE).
        actor_cursor = None
        modified_class = self._modified_class

        if worker.mode == ray.LOCAL_MODE:
            # Nothing is exported in LOCAL_MODE: allocate the actor object
            # in-process and register it with the worker.
            worker.actors[actor_id] = modified_class.__new__(modified_class)
        else:
            # Export the actor class the first time it is instantiated.
            if not self._exported:
                worker.function_actor_manager.export_actor_class(
                    self._class_id, modified_class,
                    self._actor_method_names, self._checkpoint_interval)
                self._exported = True

            resources = ray.utils.resources_from_resource_arguments(
                self._num_cpus, self._num_gpus, self._resources, num_cpus,
                num_gpus, resources)

            # When actor methods need a CPU, placement requires one CPU more
            # than creation does. An empty dict means "same as resources".
            method_cpus = self._actor_method_cpus
            assert method_cpus in [0, 1]
            if method_cpus == 1:
                placement_resources = resources.copy()
                placement_resources["CPU"] += 1
            else:
                placement_resources = {}

            # The creation task only receives the class ID as its argument.
            creation_args = [self._class_id]
            creation_function_id = compute_actor_creation_function_id(
                self._class_id)
            (actor_cursor,) = worker.submit_task(
                creation_function_id,
                creation_args,
                actor_creation_id=actor_id,
                num_return_vals=1,
                resources=resources,
                placement_resources=placement_resources)

        # The counter starts at 1 to account for the actor creation task.
        initial_counter = 1
        handle = ActorHandle(
            actor_id, self._class_name, actor_cursor, initial_counter,
            self._actor_method_names, self._method_signatures,
            self._actor_method_num_return_vals, actor_cursor,
            self._actor_method_cpus, worker.task_driver_id)

        # Run the constructor as a remote method call when the class has one;
        # otherwise constructor arguments have nowhere to go.
        has_constructor = "__init__" in handle._ray_actor_method_names
        if has_constructor:
            handle.__init__.remote(*args, **kwargs)
        elif len(args) != 0 or len(kwargs) != 0:
            raise Exception("Arguments cannot be passed to the actor "
                            "constructor because this actor class has no "
                            "__init__ method.")

        return handle
Exemplo n.º 17
0
 def generate_id(cls):
     """Return the first 8 characters of a random string's hex form."""
     hex_form = binary_to_hex(_random_string())
     return hex_form[:8]
Exemplo n.º 18
0
 def generate_id(cls):
     """Build a short ID from ``binary_to_hex`` of a fresh random string."""
     # Only the leading 8 characters are kept.
     full_hex = binary_to_hex(_random_string())
     short_id = full_hex[:8]
     return short_id
Exemplo n.º 19
0
def make_actor(cls, resources, checkpoint_interval, actor_method_cpus):
    """Wrap a user class into an ``ActorClass``.

    A subclass of ``cls`` is generated that adds the worker-termination and
    checkpoint save/restore hooks. That subclass is handed to ``ActorClass``
    together with a freshly generated random class ID.

    Args:
        cls: The user-defined class to turn into an actor class.
        resources: Forwarded unchanged to ``ActorClass``.
        checkpoint_interval: Forwarded to ``ActorClass``; 0 is rejected.
        actor_method_cpus: Forwarded unchanged to ``ActorClass``.

    Returns:
        The ``ActorClass`` built around the generated subclass.

    Raises:
        Exception: If ``checkpoint_interval`` is 0.
    """
    if checkpoint_interval == 0:
        raise Exception("checkpoint_interval must be greater than 0.")

    # Subclass the user class so the extra Ray-internal methods can be
    # attached without modifying the original class.
    class Class(cls):
        # Terminate the worker process that hosts this actor.
        def __ray_terminate__(self):
            import os
            # Disconnect from the local scheduler before exiting so that it
            # does not push an error message to the driver when this worker
            # kills itself.
            scheduler_client = ray.worker.global_worker.local_scheduler_client
            scheduler_client.disconnect()
            os._exit(0)

        # Pickle the actor state, using the user's __ray_save__ hook when the
        # actor defines one and the actor object itself otherwise.
        def __ray_save_checkpoint__(self):
            if not hasattr(self, "__ray_save__"):
                return pickle.dumps(self)
            return pickle.dumps(self.__ray_save__())

        # Rebuild an actor object from the bytes produced by
        # __ray_save_checkpoint__.
        @classmethod
        def __ray_restore_from_checkpoint__(cls, pickled_checkpoint):
            restored_state = pickle.loads(pickled_checkpoint)
            if not hasattr(cls, "__ray_restore__"):
                # TODO(rkn): It's possible that this will cause problems. When
                # you unpickle the same object twice, the two objects will not
                # have the same class.
                return restored_state
            # Bypass __init__ and let the user hook populate the instance.
            instance = cls.__new__(cls)
            instance.__ray_restore__(restored_state)
            return instance

        def __ray_checkpoint__(self):
            """Take a checkpoint of this actor.

            Stores the serialized actor state, the task frontier reported by
            the local scheduler, and the checkpoint index (the number of
            tasks executed so far).
            """
            current_worker = ray.worker.global_worker
            tasks_executed = current_worker.actor_task_counter
            # Serialize the actor state.
            serialized_state = self.__ray_save_checkpoint__()
            # Fetch the current task frontier, per actor handle.
            # NOTE(swang): This only includes actor handles that the local
            # scheduler has seen. Handle IDs for which no task has yet reached
            # the local scheduler will not be included, and may not be runnable
            # on checkpoint resumption.
            wrapped_actor_id = ray.local_scheduler.ObjectID(
                current_worker.actor_id)
            scheduler_client = current_worker.local_scheduler_client
            task_frontier = scheduler_client.get_actor_frontier(
                wrapped_actor_id)
            # Save the checkpoint in Redis. TODO(rkn): Checkpoints
            # should not be stored in Redis. Fix this.
            set_actor_checkpoint(current_worker, current_worker.actor_id,
                                 tasks_executed, serialized_state,
                                 task_frontier)

        def __ray_checkpoint_restore__(self):
            """Resume from the most recent checkpoint, if there is one.

            When a checkpoint is found, the actor state, the checkpoint
            index (number of tasks executed so far) and the task frontier in
            the local scheduler are all restored from it.

            Returns:
                A bool indicating whether a checkpoint was resumed.
            """
            current_worker = ray.worker.global_worker
            # Look up the latest stored checkpoint, if any.
            saved_index, saved_state, saved_frontier = get_actor_checkpoint(
                current_worker, current_worker.actor_id)
            if saved_index is None:
                return False

            # Replace the live actor object with the restored one.
            current_worker.actors[current_worker.actor_id] = (
                current_worker.actor_class.__ray_restore_from_checkpoint__(
                    saved_state))
            # Restore the number of tasks executed so far.
            current_worker.actor_task_counter = saved_index
            # Restore the actor frontier in the local scheduler.
            current_worker.local_scheduler_client.set_actor_frontier(
                saved_frontier)
            return True

    # Make the generated subclass carry the user class's module and name.
    Class.__module__ = cls.__module__
    Class.__name__ = cls.__name__

    new_class_id = _random_string()

    return ActorClass(Class, new_class_id, checkpoint_interval, resources,
                      actor_method_cpus)
Exemplo n.º 20
0
    def _remote(self,
                args,
                kwargs,
                num_cpus=None,
                num_gpus=None,
                resources=None):
        """Create an actor.

        This method allows more flexibility than the remote method because
        resource requirements can be specified and override the defaults in the
        decorator.

        Args:
            args: The arguments to forward to the actor constructor. ``None``
                is treated as "no positional arguments".
            kwargs: The keyword arguments to forward to the actor constructor.
                ``None`` is treated as "no keyword arguments".
            num_cpus: The number of CPUs required by the actor creation task.
            num_gpus: The number of GPUs required by the actor creation task.
            resources: The custom resources required by the actor creation
                task.

        Returns:
            A handle to the newly created actor.

        Raises:
            Exception: If the global worker's mode is ``None`` (ray.init()
                has not been called).
        """
        # Normalize missing arguments before either branch uses them. The
        # LOCAL_MODE branch unpacks deep copies of args/kwargs, which fails
        # with a TypeError when they are still None.
        if args is None:
            args = []
        if kwargs is None:
            kwargs = {}

        worker = ray.worker.get_global_worker()
        if worker.mode is None:
            raise Exception("Actors cannot be created before ray.init() "
                            "has been called.")

        actor_id = ActorID(_random_string())
        # The actor cursor is a dummy object representing the most recent
        # actor method invocation. For each subsequent method invocation,
        # the current cursor should be added as a dependency, and then
        # updated to reflect the new invocation.
        actor_cursor = None

        # Do not export the actor class or the actor if run in LOCAL_MODE
        # Instead, instantiate the actor locally and add it to the worker's
        # dictionary
        if worker.mode == ray.LOCAL_MODE:
            # The constructor receives deep copies, not the caller's objects.
            worker.actors[actor_id] = self._modified_class(
                *copy.deepcopy(args), **copy.deepcopy(kwargs))
        else:
            # Export the actor class the first time it is instantiated.
            if not self._exported:
                worker.function_actor_manager.export_actor_class(
                    self._modified_class, self._actor_method_names)
                self._exported = True

            # Combine the decorator defaults with the per-call overrides.
            resources = ray.utils.resources_from_resource_arguments(
                self._num_cpus, self._num_gpus, self._resources, num_cpus,
                num_gpus, resources)

            # If the actor methods require CPU resources, then set the required
            # placement resources. If actor_placement_resources is empty, then
            # the required placement resources will be the same as resources.
            actor_placement_resources = {}
            assert self._actor_method_cpus in [0, 1]
            if self._actor_method_cpus == 1:
                actor_placement_resources = resources.copy()
                actor_placement_resources["CPU"] += 1

            # The actor creation task is the class's __init__ method.
            function_name = "__init__"
            function_signature = self._method_signatures[function_name]
            creation_args = signature.extend_args(function_signature, args,
                                                  kwargs)
            function_descriptor = FunctionDescriptor(
                self._modified_class.__module__, function_name,
                self._modified_class.__name__)
            [actor_cursor] = worker.submit_task(
                function_descriptor,
                creation_args,
                actor_creation_id=actor_id,
                max_actor_reconstructions=self._max_reconstructions,
                num_return_vals=1,
                resources=resources,
                placement_resources=actor_placement_resources)
            assert isinstance(actor_cursor, ObjectID)

        actor_handle = ActorHandle(
            actor_id, self._modified_class.__module__, self._class_name,
            actor_cursor, self._actor_method_names, self._method_signatures,
            self._actor_method_num_return_vals, actor_cursor,
            self._actor_method_cpus, worker.task_driver_id)
        # We increment the actor counter by 1 to account for the actor creation
        # task.
        actor_handle._ray_actor_counter += 1

        return actor_handle