示例#1
0
文件: state.py 项目: yukoba/ray
    def error_messages(self, job_id=None):
        """Get the error messages for all jobs or a specific job.

        Args:
            job_id: The specific job to get the errors for. If this is None,
                then this method retrieves the errors for all jobs.

        Returns:
            A dictionary mapping job ID to a list of the error messages for
                that job.
        """
        if job_id is not None:
            assert isinstance(job_id, ray.DriverID)
            return self._error_messages(job_id)

        error_table_keys = self.redis_client.keys(
            ray.gcs_utils.TablePrefix_ERROR_INFO_string + "*")
        job_ids = [
            key[len(ray.gcs_utils.TablePrefix_ERROR_INFO_string):]
            for key in error_table_keys
        ]

        return {
            binary_to_hex(job_id): self._error_messages(ray.DriverID(job_id))
            for job_id in job_ids
        }
示例#2
0
文件: state.py 项目: songhappy/ray
    def error_messages(self, driver_id=None):
        """Get the error messages for all drivers or a specific driver.

        Args:
            driver_id: The specific driver to get the errors for. If this is
                None, then this method retrieves the errors for all drivers.

        Returns:
            A dictionary mapping driver ID to a list of the error messages for
                that driver.
        """
        self._check_connected()

        if driver_id is not None:
            assert isinstance(driver_id, ray.DriverID)
            return self._error_messages(driver_id)

        error_table_keys = self.redis_client.keys(
            ray.gcs_utils.TablePrefix_ERROR_INFO_string + "*")
        driver_ids = [
            key[len(ray.gcs_utils.TablePrefix_ERROR_INFO_string):]
            for key in error_table_keys
        ]

        return {
            binary_to_hex(driver_id): self._error_messages(
                ray.DriverID(driver_id))
            for driver_id in driver_ids
        }
示例#3
0
    def fetch_and_execute_function_to_run(self, key):
        """Run on arbitrary function on the worker."""
        (driver_id, serialized_function,
         run_on_other_drivers) = self.redis_client.hmget(
             key, ["driver_id", "function", "run_on_other_drivers"])

        if (utils.decode(run_on_other_drivers) == "False"
                and self.worker.mode == ray.SCRIPT_MODE
                and driver_id != self.worker.task_driver_id.binary()):
            return

        try:
            # Deserialize the function.
            function = pickle.loads(serialized_function)
            # Run the function.
            function({"worker": self.worker})
        except Exception:
            # If an exception was thrown when the function was run, we record
            # the traceback and notify the scheduler of the failure.
            traceback_str = traceback.format_exc()
            # Log the error message.
            utils.push_error_to_driver(
                self.worker,
                ray_constants.FUNCTION_TO_RUN_PUSH_ERROR,
                traceback_str,
                driver_id=ray.DriverID(driver_id))
示例#4
0
    def _load_actor_class_from_gcs(self, driver_id, function_descriptor):
        """Load actor class from GCS."""
        key = (b"ActorClass:" + driver_id.binary() + b":" +
               function_descriptor.function_id.binary())
        # Wait for the actor class key to have been imported by the
        # import thread. TODO(rkn): It shouldn't be possible to end
        # up in an infinite loop here, but we should push an error to
        # the driver if too much time is spent here.
        while key not in self.imported_actor_classes:
            time.sleep(0.001)

        # Fetch raw data from GCS.
        (driver_id_str, class_name, module, pickled_class,
         actor_method_names) = self._worker.redis_client.hmget(
             key, [
                 "driver_id", "class_name", "module", "class",
                 "actor_method_names"
             ])

        class_name = six.ensure_str(class_name)
        module_name = six.ensure_str(module)
        driver_id = ray.DriverID(driver_id_str)
        actor_method_names = json.loads(six.ensure_str(actor_method_names))

        actor_class = None
        try:
            with self._worker.lock:
                actor_class = pickle.loads(pickled_class)
        except Exception:
            logger.exception(
                "Failed to load actor class %s.".format(class_name))
            # The actor class failed to be unpickled, create a fake actor
            # class instead (just to produce error messages and to prevent
            # the driver from hanging).
            actor_class = self._create_fake_actor_class(
                class_name, actor_method_names)
            # If an exception was thrown when the actor was imported, we record
            # the traceback and notify the scheduler of the failure.
            traceback_str = ray.utils.format_error_message(
                traceback.format_exc())
            # Log the error message.
            push_error_to_driver(
                self._worker, ray_constants.REGISTER_ACTOR_PUSH_ERROR,
                "Failed to unpickle actor class '{}' for actor ID {}. "
                "Traceback:\n{}".format(class_name,
                                        self._worker.actor_id.hex(),
                                        traceback_str), driver_id)
            # TODO(rkn): In the future, it might make sense to have the worker
            # exit here. However, currently that would lead to hanging if
            # someone calls ray.get on a method invoked on the actor.

        # The below line is necessary. Because in the driver process,
        # if the function is defined in the file where the python script
        # was started from, its module is `__main__`.
        # However in the worker process, the `__main__` module is a
        # different module, which is `default_worker.py`
        actor_class.__module__ = module_name
        return actor_class
示例#5
0
    def fetch_and_register_remote_function(self, key):
        """Import a remote function."""
        (driver_id_str, function_id_str, function_name, serialized_function,
         num_return_vals, module, resources,
         max_calls) = self._worker.redis_client.hmget(key, [
             "driver_id", "function_id", "name", "function", "num_return_vals",
             "module", "resources", "max_calls"
         ])
        function_id = ray.FunctionID(function_id_str)
        driver_id = ray.DriverID(driver_id_str)
        function_name = decode(function_name)
        max_calls = int(max_calls)
        module = decode(module)

        # This is a placeholder in case the function can't be unpickled. This
        # will be overwritten if the function is successfully registered.
        def f():
            raise Exception("This function was not imported properly.")

        self._function_execution_info[driver_id][function_id] = (
            FunctionExecutionInfo(
                function=f, function_name=function_name, max_calls=max_calls))
        self._num_task_executions[driver_id][function_id] = 0

        try:
            function = pickle.loads(serialized_function)
        except Exception:
            # If an exception was thrown when the remote function was imported,
            # we record the traceback and notify the scheduler of the failure.
            traceback_str = format_error_message(traceback.format_exc())
            # Log the error message.
            push_error_to_driver(
                self._worker,
                ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR,
                traceback_str,
                driver_id=driver_id,
                data={
                    "function_id": function_id.binary(),
                    "function_name": function_name
                })
        else:
            # The below line is necessary. Because in the driver process,
            # if the function is defined in the file where the python script
            # was started from, its module is `__main__`.
            # However in the worker process, the `__main__` module is a
            # different module, which is `default_worker.py`
            function.__module__ = module
            self._function_execution_info[driver_id][function_id] = (
                FunctionExecutionInfo(
                    function=function,
                    function_name=function_name,
                    max_calls=max_calls))
            # Add the function to the function table.
            self._worker.redis_client.rpush(
                b"FunctionTable:" + function_id.binary(),
                self._worker.worker_id)
示例#6
0
    def fetch_and_register_actor(self, actor_class_key):
        """Import an actor.

        This will be called by the worker's import thread when the worker
        receives the actor_class export, assuming that the worker is an actor
        for that class.

        Args:
            actor_class_key: The key in Redis to use to fetch the actor.
        """
        actor_id = self._worker.actor_id
        (driver_id_str, class_name, module, pickled_class,
         actor_method_names) = self._worker.redis_client.hmget(
             actor_class_key, [
                 "driver_id", "class_name", "module", "class",
                 "actor_method_names"
             ])

        class_name = decode(class_name)
        module = decode(module)
        driver_id = ray.DriverID(driver_id_str)
        actor_method_names = json.loads(decode(actor_method_names))

        # In Python 2, json loads strings as unicode, so convert them back to
        # strings.
        if sys.version_info < (3, 0):
            actor_method_names = [
                method_name.encode("ascii")
                for method_name in actor_method_names
            ]

        # Create a temporary actor with some temporary methods so that if
        # the actor fails to be unpickled, the temporary actor can be used
        # (just to produce error messages and to prevent the driver from
        # hanging).
        class TemporaryActor(object):
            pass

        self._worker.actors[actor_id] = TemporaryActor()

        def temporary_actor_method(*xs):
            raise Exception(
                "The actor with name {} failed to be imported, "
                "and so cannot execute this method".format(class_name))

        # Register the actor method executors.
        for actor_method_name in actor_method_names:
            function_descriptor = FunctionDescriptor(module, actor_method_name,
                                                     class_name)
            function_id = function_descriptor.function_id
            temporary_executor = self._make_actor_method_executor(
                actor_method_name,
                temporary_actor_method,
                actor_imported=False)
            self._function_execution_info[driver_id][function_id] = (
                FunctionExecutionInfo(function=temporary_executor,
                                      function_name=actor_method_name,
                                      max_calls=0))
            self._num_task_executions[driver_id][function_id] = 0

        try:
            unpickled_class = pickle.loads(pickled_class)
            self._worker.actor_class = unpickled_class
        except Exception:
            # If an exception was thrown when the actor was imported, we record
            # the traceback and notify the scheduler of the failure.
            traceback_str = ray.utils.format_error_message(
                traceback.format_exc())
            # Log the error message.
            push_error_to_driver(
                self._worker, ray_constants.REGISTER_ACTOR_PUSH_ERROR,
                "Failed to unpickle actor class '{}' for actor ID {}. "
                "Traceback:\n{}".format(class_name, actor_id.hex(),
                                        traceback_str), driver_id)
            # TODO(rkn): In the future, it might make sense to have the worker
            # exit here. However, currently that would lead to hanging if
            # someone calls ray.get on a method invoked on the actor.
        else:
            # TODO(pcm): Why is the below line necessary?
            unpickled_class.__module__ = module
            self._worker.actors[actor_id] = unpickled_class.__new__(
                unpickled_class)

            actor_methods = inspect.getmembers(unpickled_class,
                                               predicate=is_function_or_method)
            for actor_method_name, actor_method in actor_methods:
                function_descriptor = FunctionDescriptor(
                    module, actor_method_name, class_name)
                function_id = function_descriptor.function_id
                executor = self._make_actor_method_executor(
                    actor_method_name, actor_method, actor_imported=True)
                self._function_execution_info[driver_id][function_id] = (
                    FunctionExecutionInfo(function=executor,
                                          function_name=actor_method_name,
                                          max_calls=0))