def fetch_and_execute_function_to_run(self, key):
    """Run an arbitrary function on the worker."""
    (job_id, serialized_function) = self._internal_kv_multiget(
        key, ["job_id", "function"])
    if self.worker.mode == ray.SCRIPT_MODE:
        return
    if ray_constants.ISOLATE_EXPORTS and \
            job_id != self.worker.current_job_id.binary():
        return
    try:
        # FunctionActorManager may call pickle.loads at the same time.
        # Importing the same module in different threads causes deadlock.
        with self.worker.function_actor_manager.lock:
            # Deserialize the function.
            function = pickle.loads(serialized_function)
        # Run the function.
        function({"worker": self.worker})
    except Exception:
        # If an exception was thrown when the function was run, record
        # the traceback and notify the scheduler of the failure.
        traceback_str = traceback.format_exc()
        # Log the error message.
        ray._private.utils.push_error_to_driver(
            self.worker,
            ray_constants.FUNCTION_TO_RUN_PUSH_ERROR,
            traceback_str,
            job_id=ray.JobID(job_id))
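# A minimal, self-contained sketch of the export/consume round trip that
# fetch_and_execute_function_to_run expects. The in-memory `kv` dict, the
# key prefix, and the 4-byte job ID below are illustrative stand-ins for
# the real internal KV store, not Ray's actual layout.
import cloudpickle

kv = {}  # hypothetical stand-in for the internal KV store


def export_function_to_run(job_id, func):
    # Producer side: pickle the function and store it alongside its job ID.
    key = b"FunctionsToRun:" + job_id
    kv[key] = {"job_id": job_id, "function": cloudpickle.dumps(func)}
    return key


def consume(key, worker):
    # Consumer side, mirroring the deserialize-then-call step above.
    entry = kv[key]
    function = cloudpickle.loads(entry["function"])
    function({"worker": worker})


key = export_function_to_run(
    b"\x01\x00\x00\x00", lambda info: print("ran on", info["worker"]))
consume(key, worker="worker-1")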
def error_messages(self, job_id=None):
    """Get the error messages for all drivers or a specific driver.

    Args:
        job_id: The specific job to get the errors for. If this is
            None, then this method retrieves the errors for all jobs.

    Returns:
        A list of the error messages for the specified driver if one
        was given, or a dictionary mapping from job ID to a list of
        error messages for that driver otherwise.
    """
    self._check_connected()

    if job_id is not None:
        assert isinstance(job_id, ray.JobID)
        return self._error_messages(job_id)

    error_table_keys = self.redis_client.keys(
        gcs_utils.TablePrefix_ERROR_INFO_string + "*")
    job_ids = [
        key[len(gcs_utils.TablePrefix_ERROR_INFO_string):]
        for key in error_table_keys
    ]

    return {
        binary_to_hex(job_id): self._error_messages(ray.JobID(job_id))
        for job_id in job_ids
    }
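# Standalone illustration of the key scan above: strip the table prefix
# from each matching key to recover the binary job IDs. The prefix and
# key values here are made up, and binascii.hexlify plays the role of
# binary_to_hex.
import binascii

prefix = b"ERROR_INFO:"
error_table_keys = [
    b"ERROR_INFO:\x02\x00\x00\x00", b"ERROR_INFO:\x03\x00\x00\x00"
]
job_ids = [key[len(prefix):] for key in error_table_keys]
print({binascii.hexlify(job_id).decode(): [] for job_id in job_ids})
# {'02000000': [], '03000000': []}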
def fetch_and_register_remote_function(self, key):
    """Import a remote function."""
    (job_id_str, function_id_str, function_name, serialized_function,
     module, max_calls) = self._worker.redis_client.hmget(key, [
         "job_id", "function_id", "function_name", "function", "module",
         "max_calls"
     ])
    function_id = ray.FunctionID(function_id_str)
    job_id = ray.JobID(job_id_str)
    function_name = decode(function_name)
    max_calls = int(max_calls)
    module = decode(module)

    # This function is called by ImportThread. This operation needs to be
    # atomic. Otherwise, there is a race condition: another thread may use
    # the placeholder function registered below before the real function
    # is ready.
    with self.lock:
        self._num_task_executions[job_id][function_id] = 0

        try:
            function = pickle.loads(serialized_function)
        except Exception:

            def f(*args, **kwargs):
                raise RuntimeError(
                    "This function was not imported properly.")

            # Use a placeholder function when unpickling failed.
            self._function_execution_info[job_id][function_id] = (
                FunctionExecutionInfo(function=f,
                                      function_name=function_name,
                                      max_calls=max_calls))
            # If an exception was thrown when the remote function was
            # imported, we record the traceback and notify the scheduler
            # of the failure.
            traceback_str = format_error_message(traceback.format_exc())
            # Log the error message.
            push_error_to_driver(
                self._worker,
                ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR,
                "Failed to unpickle the remote function '{}' with "
                "function ID {}. Traceback:\n{}".format(
                    function_name, function_id.hex(), traceback_str),
                job_id=job_id)
        else:
            # The line below is necessary. In the driver process, if the
            # function is defined in the file where the Python script was
            # started from, its module is `__main__`. However, in the
            # worker process the `__main__` module is a different module,
            # namely `default_worker.py`.
            function.__module__ = module
            self._function_execution_info[job_id][function_id] = (
                FunctionExecutionInfo(function=function,
                                      function_name=function_name,
                                      max_calls=max_calls))
            # Add the function to the function table.
            self._worker.redis_client.rpush(
                b"FunctionTable:" + function_id.binary(),
                self._worker.worker_id)
def _load_actor_class_from_gcs(self, job_id,
                               actor_creation_function_descriptor):
    """Load actor class from GCS."""
    key = (b"ActorClass:" + job_id.binary() + b":" +
           actor_creation_function_descriptor.function_id.binary())
    # Wait for the actor class key to have been imported by the
    # import thread. TODO(rkn): It shouldn't be possible to end
    # up in an infinite loop here, but we should push an error to
    # the driver if too much time is spent here.
    while key not in self.imported_actor_classes:
        try:
            # If we're in the process of deserializing an ActorHandle
            # and we hold the function_manager lock, we may be blocking
            # the import_thread from loading the actor class. Use cv.wait
            # to temporarily yield control to the import thread.
            self.cv.wait()
        except RuntimeError:
            # We don't hold the function_manager lock; just sleep
            # regularly instead.
            time.sleep(0.001)

    # Fetch raw data from GCS.
    vals = self._worker.gcs_client.internal_kv_get(
        key, KV_NAMESPACE_FUNCTION_TABLE)
    fields = [
        "job_id", "class_name", "module", "class", "actor_method_names"
    ]
    if vals is None:
        vals = {}
    else:
        vals = pickle.loads(vals)
    (job_id_str, class_name, module, pickled_class,
     actor_method_names) = (vals.get(field) for field in fields)

    class_name = ensure_str(class_name)
    module_name = ensure_str(module)
    job_id = ray.JobID(job_id_str)
    actor_method_names = json.loads(ensure_str(actor_method_names))

    actor_class = None
    try:
        with self.lock:
            actor_class = pickle.loads(pickled_class)
    except Exception:
        logger.debug("Failed to load actor class %s.", class_name)
        # If an exception was thrown when the actor was imported, we
        # record the traceback and notify the scheduler of the failure.
        traceback_str = format_error_message(traceback.format_exc())
        # The actor class failed to be unpickled, so create a fake actor
        # class instead (just to produce error messages and to prevent
        # the driver from hanging).
        actor_class = self._create_fake_actor_class(
            class_name, actor_method_names, traceback_str)

    # The line below is necessary. In the driver process, if the class is
    # defined in the file where the Python script was started from, its
    # module is `__main__`. However, in the worker process the `__main__`
    # module is a different module, namely `default_worker.py`.
    actor_class.__module__ = module_name
    return actor_class
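# Self-contained sketch of the wait pattern above: threading.Condition.wait()
# raises RuntimeError when its lock is not held, which the loop exploits to
# fall back to plain polling. All names here are illustrative, not Ray's.
import threading
import time

cv = threading.Condition()
imported = set()


def wait_for_key(key):
    while key not in imported:
        try:
            cv.wait(timeout=0.1)  # yields the lock to the importer if held
        except RuntimeError:
            time.sleep(0.001)  # lock not held; poll instead


def importer(key):
    with cv:
        imported.add(key)
        cv.notify_all()


t = threading.Timer(0.05, importer, args=("ActorClass:demo",))
t.start()
wait_for_key("ActorClass:demo")  # returns once the importer has run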
def _load_actor_class_from_gcs(self, job_id, function_descriptor):
    """Load actor class from GCS."""
    key = (b"ActorClass:" + job_id.binary() + b":" +
           function_descriptor.function_id.binary())
    # Wait for the actor class key to have been imported by the
    # import thread. TODO(rkn): It shouldn't be possible to end
    # up in an infinite loop here, but we should push an error to
    # the driver if too much time is spent here.
    while key not in self.imported_actor_classes:
        time.sleep(0.001)

    # Fetch raw data from GCS.
    (job_id_str, class_name, module, pickled_class,
     actor_method_names) = self._worker.redis_client.hmget(
         key,
         ["job_id", "class_name", "module", "class", "actor_method_names"])

    class_name = ensure_str(class_name)
    module_name = ensure_str(module)
    job_id = ray.JobID(job_id_str)
    actor_method_names = json.loads(ensure_str(actor_method_names))

    actor_class = None
    try:
        with self.lock:
            actor_class = pickle.loads(pickled_class)
    except Exception:
        logger.exception("Failed to load actor class %s.", class_name)
        # The actor class failed to be unpickled, so create a fake actor
        # class instead (just to produce error messages and to prevent
        # the driver from hanging).
        actor_class = self._create_fake_actor_class(
            class_name, actor_method_names)
        # If an exception was thrown when the actor was imported, we
        # record the traceback and notify the scheduler of the failure.
        traceback_str = ray.utils.format_error_message(
            traceback.format_exc())
        # Log the error message.
        push_error_to_driver(
            self._worker,
            ray_constants.REGISTER_ACTOR_PUSH_ERROR,
            "Failed to unpickle actor class '{}' for actor ID {}. "
            "Traceback:\n{}".format(class_name,
                                    self._worker.actor_id.hex(),
                                    traceback_str),
            job_id=job_id)
        # TODO(rkn): In the future, it might make sense to have the
        # worker exit here. However, currently that would lead to
        # hanging if someone calls ray.get on a method invoked on
        # the actor.

    # The line below is necessary. In the driver process, if the class is
    # defined in the file where the Python script was started from, its
    # module is `__main__`. However, in the worker process the `__main__`
    # module is a different module, namely `default_worker.py`.
    actor_class.__module__ = module_name
    return actor_class
def _load_actor_class_from_gcs(self, job_id,
                               actor_creation_function_descriptor):
    """Load actor class from GCS."""
    key = make_function_table_key(
        b"ActorClass",
        job_id,
        actor_creation_function_descriptor.function_id.binary(),
    )
    # Fetch raw data from GCS.
    vals = self._worker.gcs_client.internal_kv_get(
        key, KV_NAMESPACE_FUNCTION_TABLE)
    fields = [
        "job_id", "class_name", "module", "class", "actor_method_names"
    ]
    if vals is None:
        vals = {}
    else:
        vals = pickle.loads(vals)
    (job_id_str, class_name, module, pickled_class,
     actor_method_names) = (vals.get(field) for field in fields)

    class_name = ensure_str(class_name)
    module_name = ensure_str(module)
    job_id = ray.JobID(job_id_str)
    actor_method_names = json.loads(ensure_str(actor_method_names))

    actor_class = None
    try:
        with self.lock:
            actor_class = pickle.loads(pickled_class)
    except Exception:
        logger.debug("Failed to load actor class %s.", class_name)
        # If an exception was thrown when the actor was imported, we
        # record the traceback and notify the scheduler of the failure.
        traceback_str = format_error_message(traceback.format_exc())
        # The actor class failed to be unpickled, so create a fake actor
        # class instead (just to produce error messages and to prevent
        # the driver from hanging).
        actor_class = self._create_fake_actor_class(
            class_name, actor_method_names, traceback_str)

    # The line below is necessary. In the driver process, if the class is
    # defined in the file where the Python script was started from, its
    # module is `__main__`. However, in the worker process the `__main__`
    # module is a different module, namely `default_worker.py`.
    actor_class.__module__ = module_name
    return actor_class
def _job_table(self, job_id):
    """Fetch and parse the job table information for a single job ID.

    Args:
        job_id: A job ID or hex string to get information about.

    Returns:
        A dictionary with information about the job ID in question.
    """
    # Allow the argument to be either a JobID or a hex string.
    if not isinstance(job_id, ray.JobID):
        assert isinstance(job_id, str)
        job_id = ray.JobID(hex_to_binary(job_id))

    # Return information about a single job ID.
    message = self.redis_client.execute_command(
        "RAY.TABLE_LOOKUP", gcs_utils.TablePrefix.Value("JOB"), "",
        job_id.binary())

    if message is None:
        return {}

    gcs_entry = gcs_utils.GcsEntry.FromString(message)

    assert len(gcs_entry.entries) > 0
    job_info = {}
    for i in range(len(gcs_entry.entries)):
        entry = gcs_utils.JobTableData.FromString(gcs_entry.entries[i])
        assert entry.job_id == job_id.binary()
        job_info["JobID"] = job_id.hex()
        job_info["NodeManagerAddress"] = entry.node_manager_address
        job_info["DriverPid"] = entry.driver_pid
        if entry.is_dead:
            job_info["StopTime"] = entry.timestamp
        else:
            job_info["StartTime"] = entry.timestamp

    return job_info
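# Hypothetical call and result shape, based on the fields populated above.
# A hex string is accepted as well as a ray.JobID; the timestamps come from
# the GCS entry and every value below is invented for illustration.
#
#     info = state._job_table("02000000")
#     # {"JobID": "02000000",
#     #  "NodeManagerAddress": "192.168.1.10",
#     #  "DriverPid": 12345,
#     #  "StartTime": 1622000000000}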
def compute_job_id_from_driver(driver_id):
    """Derive a JobID from a driver's WorkerID by taking its leading bytes."""
    assert isinstance(driver_id, ray.WorkerID)
    return ray.JobID(driver_id.binary()[0:ray.JobID.size()])
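# Worked example of the truncation with made-up byte values. The invariant
# that the driver's job ID occupies the leading bytes of its WorkerID comes
# from how Ray constructs driver worker IDs; the 4- and 28-byte sizes below
# are illustrative.
job_part = b"\x02\x00\x00\x00"  # hypothetical binary job ID
driver_bytes = job_part + b"\xab" * 24  # hypothetical binary worker ID
assert driver_bytes[:len(job_part)] == job_part  # slice recovers the job ID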
def fetch_and_register_remote_function(self, key):
    """Import a remote function."""
    vals = self._worker.gcs_client.internal_kv_get(
        key, KV_NAMESPACE_FUNCTION_TABLE)
    if vals is None:
        vals = {}
    else:
        vals = pickle.loads(vals)
    fields = [
        "job_id",
        "function_id",
        "function_name",
        "function",
        "module",
        "max_calls",
    ]
    (
        job_id_str,
        function_id_str,
        function_name,
        serialized_function,
        module,
        max_calls,
    ) = (vals.get(field) for field in fields)

    function_id = ray.FunctionID(function_id_str)
    job_id = ray.JobID(job_id_str)
    max_calls = int(max_calls)

    # This function is called by ImportThread. This operation needs to be
    # atomic. Otherwise, there is a race condition: another thread may use
    # the placeholder function registered below before the real function
    # is ready.
    with self.lock:
        self._num_task_executions[function_id] = 0

        try:
            function = pickle.loads(serialized_function)
        except Exception:
            # If an exception was thrown when the remote function was
            # imported, we record the traceback and notify the scheduler
            # of the failure.
            traceback_str = format_error_message(traceback.format_exc())

            def f(*args, **kwargs):
                raise RuntimeError(
                    "The remote function failed to import on the "
                    "worker. This may be because needed library "
                    "dependencies are not installed in the worker "
                    "environment:\n\n{}".format(traceback_str))

            # Use a placeholder function when unpickling failed.
            self._function_execution_info[
                function_id] = FunctionExecutionInfo(
                    function=f,
                    function_name=function_name,
                    max_calls=max_calls)
            # Log the error message. Log at DEBUG level to avoid overly
            # spamming the log on import failure. The user gets the error
            # via the RuntimeError message above.
            logger.debug("Failed to unpickle the remote function "
                         f"'{function_name}' with "
                         f"function ID {function_id.hex()}. "
                         f"Job ID: {job_id}. "
                         f"Traceback:\n{traceback_str}")
        else:
            # The line below is necessary. In the driver process, if the
            # function is defined in the file where the Python script was
            # started from, its module is `__main__`. However, in the
            # worker process the `__main__` module is a different module,
            # namely `default_worker.py`.
            function.__module__ = module
            self._function_execution_info[
                function_id] = FunctionExecutionInfo(
                    function=function,
                    function_name=function_name,
                    max_calls=max_calls)
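# Standalone sketch of the placeholder pattern used above: when
# deserialization fails, a stand-in function is registered that re-raises
# the captured traceback at call time rather than at import time.
import traceback


def make_placeholder(traceback_str):
    def f(*args, **kwargs):
        raise RuntimeError(
            "The remote function failed to import on the "
            "worker:\n\n{}".format(traceback_str))

    return f


try:
    raise ImportError("No module named 'missing_dep'")  # simulated failure
except Exception:
    placeholder = make_placeholder(traceback.format_exc())

try:
    placeholder()
except RuntimeError as e:
    print(e)  # carries the original import traceback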
def fetch_and_register_remote_function(self, key):
    """Import a remote function."""
    (job_id_str, function_id_str, function_name, serialized_function,
     module, max_calls) = self._worker.redis_client.hmget(
         key, [
             "job_id", "function_id", "function_name", "function",
             "module", "max_calls"
         ])

    if ray_constants.ISOLATE_EXPORTS and \
            job_id_str != self._worker.current_job_id.binary():
        # A worker only executes tasks from the assigned job.
        # TODO(jjyao): If fetching unrelated remote functions
        # becomes a perf issue, we can also consider having an
        # export queue per job.
        return

    function_id = ray.FunctionID(function_id_str)
    job_id = ray.JobID(job_id_str)
    function_name = decode(function_name)
    max_calls = int(max_calls)
    module = decode(module)

    # This function is called by ImportThread. This operation needs to be
    # atomic. Otherwise, there is a race condition: another thread may use
    # the placeholder function registered below before the real function
    # is ready.
    with self.lock:
        self._num_task_executions[function_id] = 0

        try:
            function = pickle.loads(serialized_function)
        except Exception:
            # If an exception was thrown when the remote function was
            # imported, we record the traceback and notify the scheduler
            # of the failure.
            traceback_str = format_error_message(traceback.format_exc())

            def f(*args, **kwargs):
                raise RuntimeError(
                    "The remote function failed to import on the "
                    "worker. This may be because needed library "
                    "dependencies are not installed in the worker "
                    "environment:\n\n{}".format(traceback_str))

            # Use a placeholder function when unpickling failed.
            self._function_execution_info[function_id] = (
                FunctionExecutionInfo(
                    function=f,
                    function_name=function_name,
                    max_calls=max_calls))
            # Log the error message. Log at DEBUG level to avoid overly
            # spamming the log on import failure. The user gets the error
            # via the RuntimeError message above.
            logger.debug("Failed to unpickle the remote function "
                         f"'{function_name}' with "
                         f"function ID {function_id.hex()}. "
                         f"Job ID: {job_id}. "
                         f"Traceback:\n{traceback_str}")
        else:
            # The line below is necessary. In the driver process, if the
            # function is defined in the file where the Python script was
            # started from, its module is `__main__`. However, in the
            # worker process the `__main__` module is a different module,
            # namely `default_worker.py`.
            function.__module__ = module
            self._function_execution_info[function_id] = (
                FunctionExecutionInfo(
                    function=function,
                    function_name=function_name,
                    max_calls=max_calls))
            # Add the function to the function table.
            self._worker.redis_client.rpush(
                b"FunctionTable:" + function_id.binary(),
                self._worker.worker_id)