def register_actor_signatures(worker, driver_id, class_name,
                              actor_method_names,
                              actor_method_num_return_vals):
    """Register an actor's method signatures in the worker.

    Args:
        worker: The worker to register the signatures on.
        driver_id: The ID of the driver that this actor is associated with.
        class_name: The name of the actor class.
        actor_method_names: The names of the methods to register.
        actor_method_num_return_vals: A list of the number of return values
            for each of the actor's methods.
    """
    assert len(actor_method_names) == len(actor_method_num_return_vals)
    for actor_method_name, num_return_vals in zip(
            actor_method_names, actor_method_num_return_vals):
        # TODO(rkn): When we create a second actor, we are probably
        # overwriting the values from the first actor here. This may or may
        # not be a problem.
        function_id = compute_actor_method_function_id(
            class_name, actor_method_name).id()
        worker.function_properties[driver_id][function_id] = (
            # The extra return value is an actor dummy object.
            FunctionProperties(num_return_vals=num_return_vals + 1,
                               resources={"CPU": 1},
                               max_calls=0))
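# The deterministic function ID used above comes from a helper defined
# elsewhere in the module. Below is a minimal illustrative sketch of how
# such an ID could be derived, assuming SHA-1 hashing (whose digest is
# exactly 20 bytes, matching Ray's ID size); the real
# compute_actor_method_function_id returns an ID object exposing .id(),
# which this sketch omits.
import hashlib


def _sketch_actor_method_function_id(class_name, attr):
    """Derive a stable 20-byte ID from a class name and a method name.

    Hashing the pair means every process computes the same key for the
    same (class, method), so entries in worker.function_properties line
    up across the driver and its workers.
    """
    function_id_hash = hashlib.sha1()
    function_id_hash.update(class_name.encode("ascii"))
    function_id_hash.update(attr.encode("ascii"))
    # A SHA-1 digest is already 20 bytes, so no truncation is needed.
    return function_id_hash.digest()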
def register_actor_signatures(worker, driver_id, class_id, class_name,
                              actor_method_names,
                              actor_method_num_return_vals,
                              actor_creation_resources=None,
                              actor_method_cpus=None):
    """Register an actor's method signatures in the worker.

    Args:
        worker: The worker to register the signatures on.
        driver_id: The ID of the driver that this actor is associated with.
        class_id: The ID of the actor class.
        class_name: The name of the actor class.
        actor_method_names: The names of the methods to register.
        actor_method_num_return_vals: A list of the number of return values
            for each of the actor's methods.
        actor_creation_resources: The resources required by the actor
            creation task.
        actor_method_cpus: The number of CPUs required by each actor method.
    """
    assert len(actor_method_names) == len(actor_method_num_return_vals)
    for actor_method_name, num_return_vals in zip(
            actor_method_names, actor_method_num_return_vals):
        # TODO(rkn): When we create a second actor, we are probably
        # overwriting the values from the first actor here. This may or may
        # not be a problem.
        function_id = compute_actor_method_function_id(
            class_name, actor_method_name).id()
        worker.function_properties[driver_id][function_id] = (
            # The extra return value is an actor dummy object.
            # In the cases where actor_method_cpus is None, that value
            # should never be used.
            FunctionProperties(num_return_vals=num_return_vals + 1,
                               resources={"CPU": actor_method_cpus},
                               max_calls=0))

    if actor_creation_resources is not None:
        # Also register the actor creation task.
        function_id = compute_actor_creation_function_id(class_id)
        worker.function_properties[driver_id][function_id.id()] = (
            # The extra return value is an actor dummy object.
            FunctionProperties(num_return_vals=0 + 1,
                               resources=actor_creation_resources,
                               max_calls=0))
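# A hedged usage sketch for register_actor_signatures above. The worker
# stand-in, the IDs, and the resource numbers are made up for illustration;
# the module's own FunctionProperties, compute_actor_method_function_id,
# and compute_actor_creation_function_id are assumed to be in scope.
import collections


def _example_register_actor_signatures():
    FakeWorker = collections.namedtuple("FakeWorker",
                                        ["function_properties"])
    worker = FakeWorker(function_properties=collections.defaultdict(dict))
    register_actor_signatures(
        worker,
        driver_id=20 * b"\x00",          # made-up driver ID
        class_id=20 * b"\x01",           # made-up class ID
        class_name="Counter",
        actor_method_names=["increment", "get_value"],
        actor_method_num_return_vals=[1, 1],
        actor_creation_resources={"CPU": 1},
        actor_method_cpus=1)
    # worker.function_properties now has one entry per method (with
    # num_return_vals bumped by 1 for the dummy object), plus one entry
    # for the actor creation task.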
def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
                 worker):
    """Export an actor to Redis.

    Args:
        actor_id: The ID of the actor.
        class_id: The ID of the actor class.
        actor_method_names (list): A list of the names of this actor's
            methods.
        num_cpus (int): The number of CPUs that this actor requires.
        num_gpus (int): The number of GPUs that this actor requires.
        worker: The worker to export the actor on.
    """
    ray.worker.check_main_thread()
    if worker.mode is None:
        raise Exception("Actors cannot be created before Ray has been "
                        "started. You can start Ray with 'ray.init()'.")
    key = b"Actor:" + actor_id.id()

    # For now, all actor methods have 1 return value.
    driver_id = worker.task_driver_id.id()
    for actor_method_name in actor_method_names:
        # TODO(rkn): When we create a second actor, we are probably
        # overwriting the values from the first actor here. This may or may
        # not be a problem.
        function_id = get_actor_method_function_id(actor_method_name).id()
        worker.function_properties[driver_id][function_id] = (
            FunctionProperties(num_return_vals=1, num_cpus=1, num_gpus=0,
                               max_calls=0))

    # Get a list of the local schedulers from the client table.
    client_table = ray.global_state.client_table()
    local_schedulers = []
    for ip_address, clients in client_table.items():
        for client in clients:
            if (client["ClientType"] == "local_scheduler" and
                    not client["Deleted"]):
                local_schedulers.append(client)
    # Select a local scheduler for the actor.
    local_scheduler_id = select_local_scheduler(local_schedulers, num_gpus,
                                                worker)
    assert local_scheduler_id is not None

    # We must put the actor information in Redis before publishing the actor
    # notification so that when the newly created actor attempts to fetch
    # the information from Redis, it is already there.
    worker.redis_client.hmset(key, {"class_id": class_id,
                                    "num_gpus": num_gpus})

    # Really we should encode this message as a flatbuffer object. However,
    # we're having trouble getting that to work. It almost works, but in
    # Python 2.7, builder.CreateString fails on byte strings that contain
    # characters outside range(128).
    # TODO(rkn): There is actually no guarantee that the local scheduler
    # that we are publishing to has already subscribed to the
    # actor_notifications channel. Therefore, this message may be missed and
    # the workload will hang. This is a bug.
    worker.redis_client.publish("actor_notifications",
                                actor_id.id() + driver_id +
                                local_scheduler_id)
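# export_actor delegates placement to select_local_scheduler, defined
# elsewhere in the module. The following is an illustrative sketch of one
# plausible policy, not Ray's actual one: filter the client-table entries
# by GPU capacity and pick one at random. The "NumGPUs" and "DBClientID"
# field names are assumptions made for this sketch.
import random


def _sketch_select_local_scheduler(local_schedulers, num_gpus):
    """Pick a local scheduler that can host an actor needing num_gpus GPUs.

    Returns the chosen scheduler's ID, or None if no scheduler in the
    client table advertises enough GPUs.
    """
    feasible = [client for client in local_schedulers
                if client.get("NumGPUs", 0) >= num_gpus]
    if len(feasible) == 0:
        return None
    return random.choice(feasible)["DBClientID"]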
def export_actor(actor_id, class_id, actor_method_names, num_cpus, num_gpus,
                 worker):
    """Export an actor to Redis.

    Args:
        actor_id: The ID of the actor.
        class_id: The ID of the actor class.
        actor_method_names (list): A list of the names of this actor's
            methods.
        num_cpus (int): The number of CPUs that this actor requires.
        num_gpus (int): The number of GPUs that this actor requires.
        worker: The worker to export the actor on.
    """
    ray.worker.check_main_thread()
    if worker.mode is None:
        raise Exception("Actors cannot be created before Ray has been "
                        "started. You can start Ray with 'ray.init()'.")
    key = b"Actor:" + actor_id.id()

    # For now, all actor methods have 1 return value.
    driver_id = worker.task_driver_id.id()
    for actor_method_name in actor_method_names:
        # TODO(rkn): When we create a second actor, we are probably
        # overwriting the values from the first actor here. This may or may
        # not be a problem.
        function_id = get_actor_method_function_id(actor_method_name).id()
        worker.function_properties[driver_id][function_id] = (
            FunctionProperties(num_return_vals=1, num_cpus=1, num_gpus=0,
                               num_custom_resource=0, max_calls=0))

    # Select a local scheduler for the actor.
    local_scheduler_id = select_local_scheduler(
        worker.task_driver_id.id(), ray.global_state.local_schedulers(),
        num_gpus, worker.redis_client)
    assert local_scheduler_id is not None

    # We must put the actor information in Redis before publishing the actor
    # notification so that when the newly created actor attempts to fetch
    # the information from Redis, it is already there.
    worker.redis_client.hmset(key, {"class_id": class_id,
                                    "driver_id": driver_id,
                                    "local_scheduler_id": local_scheduler_id,
                                    "num_gpus": num_gpus,
                                    "removed": False})

    # TODO(rkn): There is actually no guarantee that the local scheduler
    # that we are publishing to has already subscribed to the
    # actor_notifications channel. Therefore, this message may be missed and
    # the workload will hang. This is a bug.
    ray.utils.publish_actor_creation(actor_id.id(), driver_id,
                                     local_scheduler_id, False,
                                     worker.redis_client)
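# ray.utils.publish_actor_creation replaces the inline Redis publish used
# in the earlier version of export_actor above. A minimal sketch consistent
# with that inline version: concatenate the raw ID bytes and publish them
# on the actor_notifications channel. Encoding the reconstruct flag as a
# single trailing byte is an assumption of this sketch, not the library's
# confirmed wire format.
def _sketch_publish_actor_creation(actor_id, driver_id, local_scheduler_id,
                                   reconstruct, redis_client):
    """Publish an actor creation notification on the Redis pub/sub channel.

    Subscribed local schedulers parse the fixed-width ID fields back out
    of the concatenated payload.
    """
    reconstruct_byte = b"1" if reconstruct else b"0"
    redis_client.publish("actor_notifications",
                         actor_id + driver_id + local_scheduler_id +
                         reconstruct_byte)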
def fetch_and_register_actor(actor_class_key, worker):
    """Import an actor.

    This will be called by the worker's import thread when the worker
    receives the actor_class export, assuming that the worker is an actor
    for that class.
    """
    actor_id_str = worker.actor_id
    (driver_id, class_id, class_name, module, pickled_class,
     actor_method_names) = worker.redis_client.hmget(
         actor_class_key, ["driver_id", "class_id", "class_name", "module",
                           "class", "actor_method_names"])

    actor_name = class_name.decode("ascii")
    module = module.decode("ascii")
    actor_method_names = json.loads(actor_method_names.decode("ascii"))

    # Create a temporary actor with some temporary methods so that if the
    # actor fails to be unpickled, the temporary actor can be used (just to
    # produce error messages and to prevent the driver from hanging).
    class TemporaryActor(object):
        pass
    worker.actors[actor_id_str] = TemporaryActor()

    def temporary_actor_method(*xs):
        raise Exception("The actor with name {} failed to be imported, and "
                        "so cannot execute this method".format(actor_name))

    for actor_method_name in actor_method_names:
        function_id = get_actor_method_function_id(actor_method_name).id()
        worker.functions[driver_id][function_id] = (actor_method_name,
                                                    temporary_actor_method)
        worker.function_properties[driver_id][function_id] = (
            FunctionProperties(num_return_vals=1, num_cpus=1, num_gpus=0,
                               max_calls=0))
        worker.num_task_executions[driver_id][function_id] = 0

    try:
        unpickled_class = pickle.loads(pickled_class)
    except Exception:
        # If an exception was thrown when the actor was imported, we record
        # the traceback and notify the scheduler of the failure.
        traceback_str = ray.worker.format_error_message(
            traceback.format_exc())
        # Log the error message.
        worker.push_error_to_driver(driver_id, "register_actor",
                                    traceback_str,
                                    data={"actor_id": actor_id_str})
    else:
        # TODO(pcm): Why is the below line necessary?
        unpickled_class.__module__ = module
        worker.actors[actor_id_str] = unpickled_class.__new__(
            unpickled_class)
        for (k, v) in inspect.getmembers(
                unpickled_class,
                predicate=(lambda x: (inspect.isfunction(x) or
                                      inspect.ismethod(x)))):
            function_id = get_actor_method_function_id(k).id()
            worker.functions[driver_id][function_id] = (k, v)
def register_actor_signatures(worker, driver_id, class_name,
                              actor_method_names):
    """Register an actor's method signatures in the worker.

    Args:
        worker: The worker to register the signatures on.
        driver_id: The ID of the driver that this actor is associated with.
        class_name: The name of the actor class.
        actor_method_names: The names of the methods to register.
    """
    for actor_method_name in actor_method_names:
        # TODO(rkn): When we create a second actor, we are probably
        # overwriting the values from the first actor here. This may or may
        # not be a problem.
        function_id = compute_actor_method_function_id(
            class_name, actor_method_name).id()
        # For now, all actor methods have 1 return value; the second return
        # value is an actor dummy object.
        worker.function_properties[driver_id][function_id] = (
            FunctionProperties(num_return_vals=2, num_cpus=1, num_gpus=0,
                               num_custom_resource=0, max_calls=0))
def fetch_and_register_actor(actor_class_key, worker):
    """Import an actor.

    This will be called by the worker's import thread when the worker
    receives the actor_class export, assuming that the worker is an actor
    for that class.
    """
    actor_id_str = worker.actor_id
    (driver_id, class_id, class_name, module, pickled_class,
     checkpoint_interval, actor_method_names) = worker.redis_client.hmget(
         actor_class_key, ["driver_id", "class_id", "class_name", "module",
                           "class", "checkpoint_interval",
                           "actor_method_names"])

    actor_name = class_name.decode("ascii")
    module = module.decode("ascii")
    checkpoint_interval = int(checkpoint_interval)
    actor_method_names = json.loads(actor_method_names.decode("ascii"))

    # Create a temporary actor with some temporary methods so that if the
    # actor fails to be unpickled, the temporary actor can be used (just to
    # produce error messages and to prevent the driver from hanging).
    class TemporaryActor(object):
        pass
    worker.actors[actor_id_str] = TemporaryActor()
    worker.actor_checkpoint_interval = checkpoint_interval

    def temporary_actor_method(*xs):
        raise Exception("The actor with name {} failed to be imported, and "
                        "so cannot execute this method".format(actor_name))

    for actor_method_name in actor_method_names:
        function_id = get_actor_method_function_id(actor_method_name).id()
        temporary_executor = make_actor_method_executor(
            worker, actor_method_name, temporary_actor_method)
        worker.functions[driver_id][function_id] = (actor_method_name,
                                                    temporary_executor)
        worker.function_properties[driver_id][function_id] = (
            FunctionProperties(num_return_vals=2, num_cpus=1, num_gpus=0,
                               num_custom_resource=0, max_calls=0))
        worker.num_task_executions[driver_id][function_id] = 0

    try:
        unpickled_class = pickle.loads(pickled_class)
        worker.actor_class = unpickled_class
    except Exception:
        # If an exception was thrown when the actor was imported, we record
        # the traceback and notify the scheduler of the failure.
        traceback_str = ray.worker.format_error_message(
            traceback.format_exc())
        # Log the error message.
        worker.push_error_to_driver(driver_id, "register_actor",
                                    traceback_str,
                                    data={"actor_id": actor_id_str})
        # TODO(rkn): In the future, it might make sense to have the worker
        # exit here. However, currently that would lead to hanging if
        # someone calls ray.get on a method invoked on the actor.
    else:
        # TODO(pcm): Why is the below line necessary?
        unpickled_class.__module__ = module
        worker.actors[actor_id_str] = unpickled_class.__new__(
            unpickled_class)
        actor_methods = inspect.getmembers(
            unpickled_class,
            predicate=(lambda x: (inspect.isfunction(x) or
                                  inspect.ismethod(x))))
        for actor_method_name, actor_method in actor_methods:
            function_id = get_actor_method_function_id(
                actor_method_name).id()
            executor = make_actor_method_executor(worker, actor_method_name,
                                                  actor_method)
            worker.functions[driver_id][function_id] = (actor_method_name,
                                                        executor)
            # We do not set
            # worker.function_properties[driver_id][function_id] because we
            # currently do not need the actor worker to submit new tasks for
            # the actor.

    # Store some extra information that will be used when the actor exits
    # to release GPU resources.
    worker.driver_id = binary_to_hex(driver_id)
    local_scheduler_id = worker.redis_client.hget(
        b"Actor:" + actor_id_str, "local_scheduler_id")
    worker.local_scheduler_id = binary_to_hex(local_scheduler_id)
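# fetch_and_register_actor wraps every method in make_actor_method_executor
# before registering it. Below is a minimal illustrative sketch of such a
# wrapper, assuming the executor only needs to look up the live actor
# instance at call time; the checkpointing and dummy-object bookkeeping the
# real helper presumably performs are omitted.
def _sketch_make_actor_method_executor(worker, method_name, method):
    """Return an executor that binds method to the worker's actor instance.

    Resolving worker.actors at call time means the executor keeps working
    after the TemporaryActor placeholder is swapped for the real unpickled
    instance. method_name is kept only to mirror the call sites above.
    """
    def actor_method_executor(*args):
        # Look up the current actor instance rather than capturing it, so
        # the placeholder-to-real-actor swap is picked up automatically.
        actor = worker.actors[worker.actor_id]
        return method(actor, *args)
    return actor_method_executor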