Example #1
    def fetch_and_execute_function_to_run(self, key):
        """Run on arbitrary function on the worker."""
        (job_id, serialized_function) = self._internal_kv_multiget(
            key, ["job_id", "function"])

        if self.worker.mode == ray.SCRIPT_MODE:
            return

        if ray_constants.ISOLATE_EXPORTS and \
                job_id != self.worker.current_job_id.binary():
            return

        try:
            # FunctionActorManager may call pickle.loads concurrently;
            # importing the same module from different threads can
            # deadlock.
            with self.worker.function_actor_manager.lock:
                # Deserialize the function.
                function = pickle.loads(serialized_function)
            # Run the function.
            function({"worker": self.worker})
        except Exception:
            # If an exception was thrown when the function was run, we record
            # the traceback and notify the scheduler of the failure.
            traceback_str = traceback.format_exc()
            # Log the error message.
            ray._private.utils.push_error_to_driver(
                self.worker,
                ray_constants.FUNCTION_TO_RUN_PUSH_ERROR,
                traceback_str,
                job_id=ray.JobID(job_id))
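
For context, this method is the worker-side half of an export. In the Ray releases these examples come from, the driver pushed such functions with `run_function_on_all_workers`; a minimal sketch, assuming that (since-removed) API:

import ray

ray.init()

def warm_up(worker_info):
    # Receives the {"worker": <Worker>} dict that
    # fetch_and_execute_function_to_run builds above.
    print("running on worker", worker_info["worker"].worker_id.hex())

# Older Ray API: serializes warm_up into the key-value store, after
# which each worker deserializes and runs it via the method above.
ray.worker.global_worker.run_function_on_all_workers(warm_up)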
Example #2
    def error_messages(self, job_id=None):
        """Get the error messages for all drivers or a specific driver.

        Args:
            job_id: The specific job to get the errors for. If this is
                None, then this method retrieves the errors for all jobs.

        Returns:
            A list of the error messages for the specified driver if one was
                given, or a dictionary mapping from job ID to a list of error
                messages for that driver otherwise.
        """
        self._check_connected()

        if job_id is not None:
            assert isinstance(job_id, ray.JobID)
            return self._error_messages(job_id)

        error_table_keys = self.redis_client.keys(
            gcs_utils.TablePrefix_ERROR_INFO_string + "*")
        job_ids = [
            key[len(gcs_utils.TablePrefix_ERROR_INFO_string):]
            for key in error_table_keys
        ]

        return {
            binary_to_hex(job_id): self._error_messages(ray.JobID(job_id))
            for job_id in job_ids
        }
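
A hedged usage sketch, assuming `state` is a connected instance of the surrounding class (`ray.state.GlobalState` in these Ray versions) and reusing the `hex_to_binary` helper seen in Example #7; the job ID value is illustrative:

# All jobs: {job_id_hex: [error_message, ...], ...}
errors_by_job = state.error_messages()

# A single job (job IDs are 4 bytes in these versions).
job = ray.JobID(hex_to_binary("02000000"))
errors_for_job = state.error_messages(job_id=job)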
Example #3
    def fetch_and_register_remote_function(self, key):
        """Import a remote function."""
        (job_id_str, function_id_str, function_name, serialized_function,
         module,
         max_calls) = self._worker.redis_client.hmget(key, [
             "job_id", "function_id", "function_name", "function", "module",
             "max_calls"
         ])
        function_id = ray.FunctionID(function_id_str)
        job_id = ray.JobID(job_id_str)
        function_name = decode(function_name)
        max_calls = int(max_calls)
        module = decode(module)

        # This function is called by ImportThread. The operation needs
        # to be atomic; otherwise there is a race condition in which
        # another thread could use the placeholder function before the
        # real function is ready.
        with self.lock:
            self._num_task_executions[job_id][function_id] = 0

            try:
                function = pickle.loads(serialized_function)
            except Exception:

                def f(*args, **kwargs):
                    raise RuntimeError(
                        "This function was not imported properly.")

                # Use a placeholder function when unpickling fails.
                self._function_execution_info[job_id][function_id] = (
                    FunctionExecutionInfo(function=f,
                                          function_name=function_name,
                                          max_calls=max_calls))
                # If an exception was thrown when the remote function was
                # imported, we record the traceback and notify the scheduler
                # of the failure.
                traceback_str = format_error_message(traceback.format_exc())
                # Log the error message.
                push_error_to_driver(
                    self._worker,
                    ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR,
                    "Failed to unpickle the remote function '{}' with "
                    "function ID {}. Traceback:\n{}".format(
                        function_name, function_id.hex(), traceback_str),
                    job_id=job_id)
            else:
                # The line below is necessary: in the driver process, a
                # function defined in the file the Python script was
                # started from has module `__main__`, but in the worker
                # process `__main__` is a different module,
                # `default_worker.py`.
                function.__module__ = module
                self._function_execution_info[job_id][function_id] = (
                    FunctionExecutionInfo(function=function,
                                          function_name=function_name,
                                          max_calls=max_calls))
                # Add the function to the function table.
                self._worker.redis_client.rpush(
                    b"FunctionTable:" + function_id.binary(),
                    self._worker.worker_id)
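
For reference, a sketch of the export record this method consumes. The field names come from the `hmget` call above; the key layout and the driver-side `hmset` write are assumptions, since the producer is not shown in these examples:

# Hypothetical driver-side export (fields match the hmget above).
redis_client.hmset(
    b"RemoteFunction:" + job_id.binary() + b":" + function_id.binary(),
    {
        "job_id": job_id.binary(),
        "function_id": function_id.binary(),
        "function_name": "my_module.my_func",
        "function": pickle.dumps(my_func),
        "module": "my_module",
        "max_calls": 0,
    })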
Example #4
    def _load_actor_class_from_gcs(self, job_id,
                                   actor_creation_function_descriptor):
        """Load actor class from GCS."""
        key = (b"ActorClass:" + job_id.binary() + b":" +
               actor_creation_function_descriptor.function_id.binary())
        # Wait for the actor class key to have been imported by the
        # import thread. TODO(rkn): It shouldn't be possible to end
        # up in an infinite loop here, but we should push an error to
        # the driver if too much time is spent here.
        while key not in self.imported_actor_classes:
            try:
                # If we're in the process of deserializing an ActorHandle
                # and we hold the function_manager lock, we may be blocking
                # the import_thread from loading the actor class. Use cv.wait
                # to temporarily yield control to the import thread.
                self.cv.wait()
            except RuntimeError:
                # We don't hold the function_manager lock, so fall back
                # to polling with a short sleep.
                time.sleep(0.001)

        # Fetch raw data from GCS.
        vals = self._worker.gcs_client.internal_kv_get(
            key, KV_NAMESPACE_FUNCTION_TABLE)
        fields = [
            "job_id", "class_name", "module", "class", "actor_method_names"
        ]
        if vals is None:
            vals = {}
        else:
            vals = pickle.loads(vals)
        (job_id_str, class_name, module, pickled_class,
         actor_method_names) = (vals.get(field) for field in fields)

        class_name = ensure_str(class_name)
        module_name = ensure_str(module)
        job_id = ray.JobID(job_id_str)
        actor_method_names = json.loads(ensure_str(actor_method_names))

        actor_class = None
        try:
            with self.lock:
                actor_class = pickle.loads(pickled_class)
        except Exception:
            logger.debug("Failed to load actor class %s.", class_name)
            # If an exception was thrown when the actor was imported, we record
            # the traceback and notify the scheduler of the failure.
            traceback_str = format_error_message(traceback.format_exc())
            # The actor class failed to be unpickled, create a fake actor
            # class instead (just to produce error messages and to prevent
            # the driver from hanging).
            actor_class = self._create_fake_actor_class(
                class_name, actor_method_names, traceback_str)

        # The line below is necessary: in the driver process, a class
        # defined in the file the Python script was started from has
        # module `__main__`, but in the worker process `__main__` is a
        # different module, `default_worker.py`.
        actor_class.__module__ = module_name
        return actor_class
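
The `except RuntimeError` fallback in the wait loop relies on standard-library behavior: `threading.Condition.wait()` raises `RuntimeError` when the calling thread does not hold the underlying lock. A minimal demonstration:

import threading

cv = threading.Condition()
try:
    cv.wait()  # wait() without a prior acquire()
except RuntimeError as exc:
    print(exc)  # "cannot wait on un-acquired lock"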
Example #5
    def _load_actor_class_from_gcs(self, job_id, function_descriptor):
        """Load actor class from GCS."""
        key = (b"ActorClass:" + job_id.binary() + b":" +
               function_descriptor.function_id.binary())
        # Wait for the actor class key to have been imported by the
        # import thread. TODO(rkn): It shouldn't be possible to end
        # up in an infinite loop here, but we should push an error to
        # the driver if too much time is spent here.
        while key not in self.imported_actor_classes:
            time.sleep(0.001)

        # Fetch raw data from GCS.
        (job_id_str, class_name, module, pickled_class,
         actor_method_names) = self._worker.redis_client.hmget(
             key,
             ["job_id", "class_name", "module", "class", "actor_method_names"])

        class_name = ensure_str(class_name)
        module_name = ensure_str(module)
        job_id = ray.JobID(job_id_str)
        actor_method_names = json.loads(ensure_str(actor_method_names))

        actor_class = None
        try:
            with self.lock:
                actor_class = pickle.loads(pickled_class)
        except Exception:
            logger.exception(
                "Failed to load actor class %s.", class_name)
            # The actor class failed to be unpickled, create a fake actor
            # class instead (just to produce error messages and to prevent
            # the driver from hanging).
            actor_class = self._create_fake_actor_class(
                class_name, actor_method_names)
            # If an exception was thrown when the actor was imported, we record
            # the traceback and notify the scheduler of the failure.
            traceback_str = ray.utils.format_error_message(
                traceback.format_exc())
            # Log the error message.
            push_error_to_driver(
                self._worker,
                ray_constants.REGISTER_ACTOR_PUSH_ERROR,
                "Failed to unpickle actor class '{}' for actor ID {}. "
                "Traceback:\n{}".format(class_name,
                                        self._worker.actor_id.hex(),
                                        traceback_str),
                job_id=job_id)
            # TODO(rkn): In the future, it might make sense to have the worker
            # exit here. However, currently that would lead to hanging if
            # someone calls ray.get on a method invoked on the actor.

        # The line below is necessary: in the driver process, a class
        # defined in the file the Python script was started from has
        # module `__main__`, but in the worker process `__main__` is a
        # different module, `default_worker.py`.
        actor_class.__module__ = module_name
        return actor_class
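
`_create_fake_actor_class` itself is not shown in these examples. A minimal sketch of the pattern, matching the three-argument form used in Examples #4 and #6, under the assumption that it builds a class whose methods all raise so the driver fails loudly instead of hanging:

def make_fake_actor_class(class_name, actor_method_names, traceback_str):
    # Hypothetical stand-in for _create_fake_actor_class: every actor
    # method raises with the original import traceback.
    def method(self, *args, **kwargs):
        raise RuntimeError(
            f"The actor class {class_name} failed to import on the "
            f"worker:\n{traceback_str}")

    return type(class_name, (object,),
                {name: method for name in actor_method_names})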
Example #6
    def _load_actor_class_from_gcs(self, job_id,
                                   actor_creation_function_descriptor):
        """Load actor class from GCS."""
        key = make_function_table_key(
            b"ActorClass",
            job_id,
            actor_creation_function_descriptor.function_id.binary(),
        )

        # Fetch raw data from GCS.
        vals = self._worker.gcs_client.internal_kv_get(
            key, KV_NAMESPACE_FUNCTION_TABLE)
        fields = [
            "job_id", "class_name", "module", "class", "actor_method_names"
        ]
        if vals is None:
            vals = {}
        else:
            vals = pickle.loads(vals)
        (job_id_str, class_name, module, pickled_class,
         actor_method_names) = (vals.get(field) for field in fields)

        class_name = ensure_str(class_name)
        module_name = ensure_str(module)
        job_id = ray.JobID(job_id_str)
        actor_method_names = json.loads(ensure_str(actor_method_names))

        actor_class = None
        try:
            with self.lock:
                actor_class = pickle.loads(pickled_class)
        except Exception:
            logger.debug("Failed to load actor class %s.", class_name)
            # If an exception was thrown when the actor was imported, we record
            # the traceback and notify the scheduler of the failure.
            traceback_str = format_error_message(traceback.format_exc())
            # The actor class failed to be unpickled, create a fake actor
            # class instead (just to produce error messages and to prevent
            # the driver from hanging).
            actor_class = self._create_fake_actor_class(
                class_name, actor_method_names, traceback_str)

        # The line below is necessary: in the driver process, a class
        # defined in the file the Python script was started from has
        # module `__main__`, but in the worker process `__main__` is a
        # different module, `default_worker.py`.
        actor_class.__module__ = module_name
        return actor_class
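
The GCS-backed variants (this one and Example #9) read a single pickled dict rather than a Redis hash. A hedged sketch of the corresponding write, assuming `internal_kv_put` is symmetric with the `internal_kv_get` call above and that `Counter` is some user-defined actor class:

record = pickle.dumps({
    "job_id": job_id.binary(),
    "class_name": "Counter",
    "module": "__main__",
    "class": pickle.dumps(Counter),
    "actor_method_names": json.dumps(["increment", "read"]),
})
gcs_client.internal_kv_put(key, record, True, KV_NAMESPACE_FUNCTION_TABLE)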
Example #7
    def _job_table(self, job_id):
        """Fetch and parse the job table information for a single job ID.

        Args:
            job_id: A job ID or hex string to get information about.

        Returns:
            A dictionary with information about the job ID in question.
        """
        # Allow the argument to be either a JobID or a hex string.
        if not isinstance(job_id, ray.JobID):
            assert isinstance(job_id, str)
            job_id = ray.JobID(hex_to_binary(job_id))

        # Return information about a single job ID.
        message = self.redis_client.execute_command(
            "RAY.TABLE_LOOKUP", gcs_utils.TablePrefix.Value("JOB"), "",
            job_id.binary())

        if message is None:
            return {}

        gcs_entry = gcs_utils.GcsEntry.FromString(message)

        assert len(gcs_entry.entries) > 0

        job_info = {}

        for i in range(len(gcs_entry.entries)):
            entry = gcs_utils.JobTableData.FromString(gcs_entry.entries[i])
            assert entry.job_id == job_id.binary()
            job_info["JobID"] = job_id.hex()
            job_info["NodeManagerAddress"] = entry.node_manager_address
            job_info["DriverPid"] = entry.driver_pid
            if entry.is_dead:
                job_info["StopTime"] = entry.timestamp
            else:
                job_info["StartTime"] = entry.timestamp

        return job_info
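
For a live job, the returned dictionary has this shape (keys taken from the assignments above; the values are illustrative):

job_info = {
    "JobID": "02000000",
    "NodeManagerAddress": "192.168.1.10",
    "DriverPid": 12345,
    "StartTime": 1617181920,  # becomes "StopTime" once is_dead is set
}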
Example #8
def compute_job_id_from_driver(driver_id):
    assert isinstance(driver_id, ray.WorkerID)
    return ray.JobID(driver_id.binary()[0:ray.JobID.size()])
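
This works because a driver's `WorkerID` is laid out with the job ID in its leading `ray.JobID.size()` bytes, so slicing recovers it. An illustrative check with made-up byte values:

job_bytes = b"\x02\x00\x00\x00"  # a 4-byte JobID
driver_id = ray.WorkerID(
    job_bytes + b"\xab" * (ray.WorkerID.size() - len(job_bytes)))
assert compute_job_id_from_driver(driver_id) == ray.JobID(job_bytes)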
Example #9
    def fetch_and_register_remote_function(self, key):
        """Import a remote function."""
        vals = self._worker.gcs_client.internal_kv_get(
            key, KV_NAMESPACE_FUNCTION_TABLE)
        if vals is None:
            vals = {}
        else:
            vals = pickle.loads(vals)
        fields = [
            "job_id",
            "function_id",
            "function_name",
            "function",
            "module",
            "max_calls",
        ]
        (
            job_id_str,
            function_id_str,
            function_name,
            serialized_function,
            module,
            max_calls,
        ) = (vals.get(field) for field in fields)

        function_id = ray.FunctionID(function_id_str)
        job_id = ray.JobID(job_id_str)
        max_calls = int(max_calls)

        # This function is called by ImportThread. The operation needs
        # to be atomic; otherwise there is a race condition in which
        # another thread could use the placeholder function before the
        # real function is ready.
        with self.lock:
            self._num_task_executions[function_id] = 0

            try:
                function = pickle.loads(serialized_function)
            except Exception:

                # If an exception was thrown when the remote function was
                # imported, we record the traceback and notify the scheduler
                # of the failure.
                traceback_str = format_error_message(traceback.format_exc())

                def f(*args, **kwargs):
                    raise RuntimeError(
                        "The remote function failed to import on the "
                        "worker. This may be because needed library "
                        "dependencies are not installed in the worker "
                        "environment:\n\n{}".format(traceback_str))

                # Use a placeholder function when unpickling fails.
                self._function_execution_info[
                    function_id] = FunctionExecutionInfo(
                        function=f,
                        function_name=function_name,
                        max_calls=max_calls)

                # Log the error message. Log at DEBUG level to avoid overly
                # spamming the log on import failure. The user gets the error
                # via the RuntimeError message above.
                logger.debug("Failed to unpickle the remote function "
                             f"'{function_name}' with "
                             f"function ID {function_id.hex()}. "
                             f"Job ID:{job_id}."
                             f"Traceback:\n{traceback_str}. ")
            else:
                # The line below is necessary: in the driver process, a
                # function defined in the file the Python script was
                # started from has module `__main__`, but in the worker
                # process `__main__` is a different module,
                # `default_worker.py`.
                function.__module__ = module
                self._function_execution_info[
                    function_id] = FunctionExecutionInfo(
                        function=function,
                        function_name=function_name,
                        max_calls=max_calls)
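
`FunctionExecutionInfo`, used throughout these examples, is a plain record of the deserialized function plus its metadata; in the Ray source it is a namedtuple along these lines:

from collections import namedtuple

FunctionExecutionInfo = namedtuple(
    "FunctionExecutionInfo", ["function", "function_name", "max_calls"])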
Example #10
    def fetch_and_register_remote_function(self, key):
        """Import a remote function."""
        (job_id_str, function_id_str, function_name, serialized_function,
         module, max_calls) = self._worker.redis_client.hmget(
             key, [
                 "job_id", "function_id", "function_name", "function",
                 "module", "max_calls"
             ])

        if ray_constants.ISOLATE_EXPORTS and \
                job_id_str != self._worker.current_job_id.binary():
            # A worker only executes tasks from the assigned job.
            # TODO(jjyao): If fetching unrelated remote functions
            # becomes a perf issue, we can also consider having export
            # queue per job.
            return

        function_id = ray.FunctionID(function_id_str)
        job_id = ray.JobID(job_id_str)
        function_name = decode(function_name)
        max_calls = int(max_calls)
        module = decode(module)

        # This function is called by ImportThread. The operation needs
        # to be atomic; otherwise there is a race condition in which
        # another thread could use the placeholder function before the
        # real function is ready.
        with self.lock:
            self._num_task_executions[function_id] = 0

            try:
                function = pickle.loads(serialized_function)
            except Exception:

                # If an exception was thrown when the remote function was
                # imported, we record the traceback and notify the scheduler
                # of the failure.
                traceback_str = format_error_message(traceback.format_exc())

                def f(*args, **kwargs):
                    raise RuntimeError(
                        "The remote function failed to import on the "
                        "worker. This may be because needed library "
                        "dependencies are not installed in the worker "
                        "environment:\n\n{}".format(traceback_str))

                # Use a placeholder function when unpickling fails.
                self._function_execution_info[function_id] = (
                    FunctionExecutionInfo(
                        function=f,
                        function_name=function_name,
                        max_calls=max_calls))

                # Log the error message. Log at DEBUG level to avoid overly
                # spamming the log on import failure. The user gets the error
                # via the RuntimeError message above.
                logger.debug("Failed to unpickle the remote function "
                             f"'{function_name}' with "
                             f"function ID {function_id.hex()}. "
                             f"Job ID:{job_id}."
                             f"Traceback:\n{traceback_str}. ")
            else:
                # The line below is necessary: in the driver process, a
                # function defined in the file the Python script was
                # started from has module `__main__`, but in the worker
                # process `__main__` is a different module,
                # `default_worker.py`.
                function.__module__ = module
                self._function_execution_info[function_id] = (
                    FunctionExecutionInfo(
                        function=function,
                        function_name=function_name,
                        max_calls=max_calls))
                # Add the function to the function table.
                self._worker.redis_client.rpush(
                    b"FunctionTable:" + function_id.binary(),
                    self._worker.worker_id)
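
The final `rpush` appends this worker's ID to a per-function list, so the table doubles as a registry of which workers have imported the function. A hedged read-side sketch:

# Each worker that imports the function pushes its worker_id, so the
# list length is the number of successful registrations.
worker_ids = redis_client.lrange(
    b"FunctionTable:" + function_id.binary(), 0, -1)
print(f"{len(worker_ids)} workers have registered {function_id.hex()}")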