Exemplo n.º 1
0
def request_resources(num_cpus: Optional[int] = None,
                      bundles: Optional[List[dict]] = None) -> None:
    """Publish a resource demand for the autoscaler to satisfy.

    Intended to be called (for example, on a node) ahead of a burst of
    ray.remote submissions so the needed resources become available quickly.
    Every call overwrites the previously stored request with a new one built
    from the arguments.

    Args:
        num_cpus (int): Number of CPUs the cluster should scale to have
            available; encoded as that many ``{"CPU": 1}`` shapes. The
            request persists until the next request_resources() call.
        bundles (List[ResourceDict]): Resource shapes the cluster should be
            able to fit; appended after the CPU shapes. The request persists
            until the next request_resources() call.

    Raises:
        RuntimeError: If Ray has not been initialized.
    """
    if not ray.is_initialized():
        raise RuntimeError("Ray is not initialized yet")

    # Falsy arguments (None, 0, empty list) contribute nothing.
    shapes = []
    if num_cpus:
        shapes.extend([{"CPU": 1}] * num_cpus)
    if bundles:
        shapes.extend(bundles)

    payload = json.dumps(shapes)
    _internal_kv_put(
        AUTOSCALER_RESOURCE_REQUEST_CHANNEL, payload, overwrite=True)
Exemplo n.º 2
0
    def _handle_failure(self, error):
        """Report a monitor-loop failure and optionally tear down workers.

        Logs the active exception, records the failure message in the
        internal KV store (when it is initialized) and publishes it as a
        MONITOR_DIED_ERROR through a GCS publisher.

        Args:
            error: The failure being reported; it is interpolated into the
                message that gets stored and published.
        """
        logger.exception("Error in monitor loop")

        if self.autoscaler is not None:
            # Workers are only taken down together with the monitor when the
            # fate-sharing environment flag is set to exactly "1".
            if os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1":
                self.autoscaler.kill_workers()
                # Take down autoscaler workers if necessary.
                self.destroy_autoscaler_workers()

        # Push the error to all current and future drivers.
        failure_text = f"The autoscaler failed with the following error:\n{error}"
        if _internal_kv_initialized():
            _internal_kv_put(
                ray_constants.DEBUG_AUTOSCALING_ERROR, failure_text, overwrite=True
            )

        publisher = GcsPublisher(address=args.gcs_address)
        from ray._private.utils import publish_error_to_driver

        publish_error_to_driver(
            ray_constants.MONITOR_DIED_ERROR,
            failure_text,
            gcs_publisher=publisher,
        )
Exemplo n.º 3
0
def register_actor(name, actor_handle):
    """Register a named actor under a string key.

    Args:
        name: The name of the named actor.
        actor_handle: The actor object to be associated with this name.

    Raises:
        TypeError: If ``name`` is not a string or ``actor_handle`` is not an
            ``ActorHandle``.
        ValueError: If an actor is already registered under ``name``.
    """
    if not isinstance(name, str):
        raise TypeError("The name argument must be a string.")
    if not isinstance(actor_handle, ray.actor.ActorHandle):
        raise TypeError("The actor_handle argument must be an ActorHandle "
                        "object.")
    actor_name = _calculate_key(name)
    duplicate_message = "An actor with name={} already exists".format(name)

    # First check if the actor already exists. A ValueError from get_actor
    # is treated as "not registered yet".
    try:
        get_actor(name)
    except ValueError:
        exists = False
    else:
        exists = True

    if exists:
        raise ValueError(duplicate_message)

    # Add the actor to Redis if it does not already exist. The put is issued
    # without overwrite, so a concurrent registration that slipped in after
    # the check above is not clobbered; its outcome is reported instead of
    # being silently ignored.
    # NOTE(review): relies on _internal_kv_put returning a truthy value when
    # the key was already present - confirm. A None/falsy return keeps the
    # previous behavior.
    already_exists = _internal_kv_put(actor_name, pickle.dumps(actor_handle))
    if already_exists:
        raise ValueError(duplicate_message)
Exemplo n.º 4
0
    def _run(self):
        """Drive the monitor: refresh state, autoscale, publish status.

        Loops forever. Every iteration ends with a sleep of
        AUTOSCALER_UPDATE_INTERVAL_S seconds.
        """

        while True:
            # Refresh the monitor's view before reporting on it.
            self.update_raylet_map()
            self.update_load_metrics()
            self.update_resource_requests()

            load_summary = self.load_metrics.summary()
            report = {"load_metrics_report": load_summary._asdict()}

            # Run an autoscaling pass, when an autoscaler is configured, and
            # attach its summary to the report.
            if self.autoscaler:
                self.autoscaler.update()
                autoscaler_summary = self.autoscaler.summary()
                report["autoscaler_report"] = autoscaler_summary._asdict()

            serialized = json.dumps(report)
            if _internal_kv_initialized():
                _internal_kv_put(
                    DEBUG_AUTOSCALING_STATUS, serialized, overwrite=True)

            # Pause for one autoscaler update interval before the next pass.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
Exemplo n.º 5
0
def connect_ray_pdb(host=None, port=None, patch_stdstreams=False, quiet=None):
    """
    Opens a remote PDB on first available port.

    The debugger's address and the calling location are advertised in the
    internal KV store under a fresh ``RAY_PDB_<uuid>`` key while the debugger
    is listening; the key is removed again once ``listen()`` finishes, whether
    it returns or raises.

    Args:
        host: Interface to bind. Defaults to $REMOTE_PDB_HOST, or
            "127.0.0.1" when that variable is unset.
        port: Port to bind. Defaults to $REMOTE_PDB_PORT, or 0 when unset.
        patch_stdstreams: Forwarded unchanged to RemotePdb.
        quiet: Forwarded to RemotePdb. Defaults to True when
            $REMOTE_PDB_QUIET holds any non-empty string.

    Returns:
        The RemotePdb instance, after ``listen()`` has returned.
    """
    if host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))
    rdb = RemotePdb(host=host,
                    port=port,
                    patch_stdstreams=patch_stdstreams,
                    quiet=quiet)
    sockname = rdb._listen_socket.getsockname()
    pdb_address = "{}:{}".format(sockname[0], sockname[1])
    # Entry 0 is this function and entry 1 its direct caller; the reported
    # location is the frame above that.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(traceback.format_exception(*sys.exc_info()))
    }
    kv_key = "RAY_PDB_{}".format(uuid.uuid4())
    _internal_kv_put(kv_key, json.dumps(data), overwrite=True)
    try:
        rdb.listen()
    finally:
        # Always withdraw the advertisement, so a failed or interrupted
        # listen() does not leave a stale breakpoint entry behind.
        _internal_kv_del(kv_key)

    return rdb
Exemplo n.º 6
0
    def _run(self):
        """Run the monitor loop until the stop event (if any) is set.

        Each pass refreshes metrics, resource requests and the event summary,
        optionally runs the autoscaler, publishes a JSON status blob and then
        sleeps for AUTOSCALER_UPDATE_INTERVAL_S seconds.
        """
        # The stop condition is re-evaluated at the top of every pass.
        while not (self.stop_event and self.stop_event.is_set()):
            self.update_load_metrics()
            self.update_resource_requests()
            self.update_event_summary()

            load_summary = self.load_metrics.summary()
            report = {
                "load_metrics_report": load_summary._asdict(),
                "time": time.time(),
                "monitor_pid": os.getpid()
            }

            # Run an autoscaling pass when an autoscaler is configured.
            if self.autoscaler:
                self.autoscaler.update()
                autoscaler_summary = self.autoscaler.summary()
                report["autoscaler_report"] = autoscaler_summary._asdict()

                # Emit the summarized events with the event-summary prefix,
                # then start collecting afresh.
                for event_msg in self.event_summarizer.summary():
                    logger.info("{}{}".format(
                        ray_constants.LOG_PREFIX_EVENT_SUMMARY, event_msg))
                self.event_summarizer.clear()

            serialized = json.dumps(report)
            if _internal_kv_initialized():
                _internal_kv_put(
                    DEBUG_AUTOSCALING_STATUS, serialized, overwrite=True)

            # Pause for one autoscaler update interval before the next pass.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
Exemplo n.º 7
0
    def _handle_failure(self, error):
        """Report a monitor-loop failure and optionally tear down workers.

        Logs the active exception, records the failure message in the
        internal KV store (when it is initialized) and publishes it as a
        MONITOR_DIED_ERROR via Redis and, when available, a GCS publisher.

        Args:
            error: The failure being reported; it is interpolated into the
                message that gets stored and published.
        """
        logger.exception("Error in monitor loop")

        if self.autoscaler is not None:
            # Workers are only taken down together with the monitor when the
            # fate-sharing environment flag is set to exactly "1".
            if os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "") == "1":
                self.autoscaler.kill_workers()
                # Take down autoscaler workers if necessary.
                self.destroy_autoscaler_workers()

        # Push the error to all current and future drivers.
        failure_text = (
            f"The autoscaler failed with the following error:\n{error}")
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_ERROR, failure_text, overwrite=True)

        redis_client = ray._private.services.create_redis_client(
            self.redis_address, password=self.redis_password)

        # Use the configured GCS address if there is one; otherwise look the
        # address up through Redis when GCS pubsub is enabled. With neither,
        # no GCS publisher is passed on.
        if args.gcs_address:
            gcs_publisher = GcsPublisher(address=args.gcs_address)
        elif gcs_pubsub_enabled():
            gcs_publisher = GcsPublisher(
                address=get_gcs_address_from_redis(redis_client))
        else:
            gcs_publisher = None

        from ray._private.utils import publish_error_to_driver
        publish_error_to_driver(
            ray_constants.MONITOR_DIED_ERROR,
            failure_text,
            redis_client=redis_client,
            gcs_publisher=gcs_publisher)
Exemplo n.º 8
0
 def crashed_get_node_id():
     """Return this node's id, or hang forever on the designated node.

     When the current node id equals ``crashed_worker_node_id``, a marker is
     written to the internal KV store and the call never returns.
     """
     if ray.get_runtime_context().node_id != crashed_worker_node_id:
         return ray.get_runtime_context().node_id

     # Leave a marker in the KV store, then block indefinitely.
     internal_kv._internal_kv_put("crashed_get_node_id",
                                  "crashed_worker_node_id")
     while True:
         time.sleep(1)
Exemplo n.º 9
0
    def __init__(self, interval_s=1, total_steps=3):
        """Initialize the step bookkeeping and publish the starting step.

        Args:
            interval_s: Stored on the instance as ``interval_s``.
            total_steps: Stored on the instance as ``total_steps``.
        """
        self.interval_s = interval_s
        self.stopped = False
        self.current_step = 1
        self.total_steps = total_steps

        # Publish the initial step under a key derived from the id that the
        # global worker's core worker reports for this actor.
        core_worker = ray._private.worker.global_worker.core_worker
        kv_key = f"JOB:{core_worker.get_actor_id()}"
        ray_kv._internal_kv_put(kv_key, self.current_step, overwrite=True)
Exemplo n.º 10
0
    def put_status(self, job_id: str, status: Union[JobStatus, JobStatusInfo]):
        """Persist a job's status in the internal KV store.

        The status is pickled and stored under ``JOB_STATUS_KEY`` formatted
        with ``job_id``, in the job KV namespace.

        Args:
            job_id: Job identifier substituted into ``JOB_STATUS_KEY``.
            status: Either a ``JobStatusInfo``, or a bare ``JobStatus`` which
                is wrapped in a ``JobStatusInfo`` before being stored.

        Raises:
            AssertionError: If ``status`` is neither of the accepted types.
        """
        if isinstance(status, JobStatus):
            status = JobStatusInfo(status=status)
        elif not isinstance(status, JobStatusInfo):
            # Raised explicitly instead of through ``assert`` so the check
            # still runs under ``python -O``; the exception type callers see
            # is unchanged.
            raise AssertionError("status must be JobStatus or JobStatusInfo.")

        _internal_kv_put(self.JOB_STATUS_KEY.format(job_id=job_id),
                         pickle.dumps(status),
                         namespace=ray_constants.KV_NAMESPACE_JOB)
Exemplo n.º 11
0
    def _run(self):
        """Run the monitor loop.

        Each pass refreshes load metrics, resource requests and the event
        summary, runs the autoscaler once load metrics are available, and
        publishes a JSON status blob to the internal KV store. An exception
        raised during a pass is logged and the pass retried when
        ``retry_on_failure`` is set; otherwise it propagates. The loop ends
        once ``stop_event`` is set.
        """
        while True:
            try:
                if self.stop_event and self.stop_event.is_set():
                    break

                self.update_load_metrics()
                self.update_resource_requests()
                self.update_event_summary()

                load_report = asdict(self.load_metrics.summary())
                status = {
                    "load_metrics_report": load_report,
                    "time": time.time(),
                    "monitor_pid": os.getpid(),
                }

                if self.autoscaler:
                    if not self.load_metrics:
                        # load_metrics is falsy iff no resource messages have
                        # been collected from the GCS yet, which can happen at
                        # startup before the GCS has heard from the Raylets.
                        # Skip the autoscaler update until metrics arrive.
                        logger.info(
                            "Autoscaler has not yet received load metrics. Waiting."
                        )
                    else:
                        # Process autoscaling actions.
                        self.autoscaler.update()
                        report = self.autoscaler.summary()
                        if report:
                            status["autoscaler_report"] = asdict(report)

                        for event_msg in self.event_summarizer.summary():
                            # Every line needs the prefix for it to get pushed
                            # to the driver logs.
                            for line in event_msg.split("\n"):
                                logger.info(
                                    "{}{}".format(
                                        ray_constants.LOG_PREFIX_EVENT_SUMMARY, line
                                    )
                                )
                        self.event_summarizer.clear()

                serialized = json.dumps(status)
                if _internal_kv_initialized():
                    _internal_kv_put(
                        ray_constants.DEBUG_AUTOSCALING_STATUS,
                        serialized,
                        overwrite=True,
                    )
            except Exception:
                # Without retry_on_failure the failure ends the monitor.
                if not self.retry_on_failure:
                    raise
                logger.exception("Monitor: Execution exception. Trying again...")

            # Pause for one autoscaler update interval before the next pass.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
Exemplo n.º 12
0
def connect_ray_pdb(
    host=None,
    port=None,
    patch_stdstreams=False,
    quiet=None,
    breakpoint_uuid=None,
    debugger_external=False,
):
    """
    Opens a remote PDB on first available port.

    The debugger's address, the calling location, a timestamp and the job id
    are advertised in the PDB namespace of the internal KV store under
    ``RAY_PDB_<breakpoint_uuid>`` while the debugger is listening; the key is
    removed again once ``listen()`` finishes, whether it returns or raises.

    Args:
        host: Interface to bind. Must be unset when ``debugger_external`` is
            true. Otherwise defaults to $REMOTE_PDB_HOST, or "127.0.0.1".
        port: Port to bind. Defaults to $REMOTE_PDB_PORT, or 0 when unset.
        patch_stdstreams: Forwarded unchanged to RemotePdb.
        quiet: Forwarded to RemotePdb. Defaults to True when
            $REMOTE_PDB_QUIET holds any non-empty string.
        breakpoint_uuid: Identifier for this breakpoint; a random hex uuid
            is generated when it is falsy.
        debugger_external: When true, bind to "0.0.0.0" and advertise the
            global worker's node IP instead of "localhost".

    Returns:
        The RemotePdb instance, after ``listen()`` has returned.
    """
    if debugger_external:
        assert not host, "Cannot specify both host and debugger_external"
        host = "0.0.0.0"
    elif host is None:
        host = os.environ.get("REMOTE_PDB_HOST", "127.0.0.1")
    if port is None:
        port = int(os.environ.get("REMOTE_PDB_PORT", "0"))
    if quiet is None:
        quiet = bool(os.environ.get("REMOTE_PDB_QUIET", ""))
    if not breakpoint_uuid:
        breakpoint_uuid = uuid.uuid4().hex
    if debugger_external:
        ip_address = ray.worker.global_worker.node_ip_address
    else:
        ip_address = "localhost"
    rdb = RemotePdb(
        breakpoint_uuid=breakpoint_uuid,
        host=host,
        port=port,
        ip_address=ip_address,
        patch_stdstreams=patch_stdstreams,
        quiet=quiet,
    )
    sockname = rdb._listen_socket.getsockname()
    # Advertise the chosen IP together with the port the socket is bound to.
    pdb_address = "{}:{}".format(ip_address, sockname[1])
    # Entry 0 is this function and entry 1 its direct caller; the reported
    # location is the frame above that.
    parentframeinfo = inspect.getouterframes(inspect.currentframe())[2]
    data = {
        "proctitle": setproctitle.getproctitle(),
        "pdb_address": pdb_address,
        "filename": parentframeinfo.filename,
        "lineno": parentframeinfo.lineno,
        "traceback": "\n".join(traceback.format_exception(*sys.exc_info())),
        "timestamp": time.time(),
        "job_id": ray.get_runtime_context().job_id.hex(),
    }
    kv_key = "RAY_PDB_{}".format(breakpoint_uuid)
    _internal_kv_put(
        kv_key,
        json.dumps(data),
        overwrite=True,
        namespace=ray_constants.KV_NAMESPACE_PDB,
    )
    try:
        rdb.listen()
    finally:
        # Always withdraw the advertisement, so a failed or interrupted
        # listen() does not leave a stale breakpoint entry behind.
        _internal_kv_del(kv_key, namespace=ray_constants.KV_NAMESPACE_PDB)

    return rdb
Exemplo n.º 13
0
def _store_package_in_gcs(gcs_key: str, data: bytes) -> int:
    """Store a package blob in the internal KV store under ``gcs_key``.

    Args:
        gcs_key: Key to store the blob under.
        data: The package contents.

    Returns:
        The number of bytes in ``data``.

    Raises:
        RuntimeError: If ``data`` is GCS_STORAGE_MAX_SIZE bytes or larger.
    """
    num_bytes = len(data)
    if num_bytes >= GCS_STORAGE_MAX_SIZE:
        raise RuntimeError(
            "working_dir package exceeds the maximum size of 100MiB. You "
            "can exclude large files using the 'excludes' option to the "
            "runtime_env.")

    _internal_kv_put(gcs_key, data)
    return num_bytes
Exemplo n.º 14
0
def _put_library_usage(library_usage: str):
    """Record ``library_usage`` as a key in the usage-stats KV namespace.

    The key is the library-usage prefix followed by ``library_usage`` and the
    stored value is the empty string. Best effort: any exception raised while
    building or writing the entry is logged at debug level and swallowed.
    The internal KV store must already be initialized (asserted).
    """
    assert _internal_kv_initialized()
    try:
        usage_key = f"{usage_constant.LIBRARY_USAGE_PREFIX}{library_usage}"
        _internal_kv_put(
            usage_key, "", namespace=usage_constant.USAGE_STATS_NAMESPACE)
    except Exception as e:
        logger.debug(f"Failed to put library usage, {e}")
Exemplo n.º 15
0
Arquivo: rpdb.py Projeto: ckw017/ray
 def do_remote(self, arg):
     """remote
     Skip into the next remote call.
     """
     breakpoint_id = self._breakpoint_uuid

     # Mark the global worker so the next task drops into the debugger.
     ray.worker.global_worker.debugger_breakpoint = breakpoint_id

     # Leave a marker telling the debug loop to connect to the next task.
     continue_key = "RAY_PDB_CONTINUE_{}".format(breakpoint_id)
     _internal_kv_put(continue_key, "")

     # Restore this session's state and drop the connection before
     # continuing execution.
     self.__restore()
     self.handle.connection.close()
     return Pdb.do_continue(self, arg)
Exemplo n.º 16
0
 def log_info_string(self, nodes):
     """Build the cluster status text, publish it and log it at debug level.

     Args:
         nodes: Passed to ``info_string`` and to the resource demand
             scheduler's ``debug_string``.
     """
     # Entries are evaluated top to bottom and concatenated as-is.
     sections = [
         "Cluster status: ",
         self.info_string(nodes),
         "\n",
         self.load_metrics.info_string(),
         "\n",
         self.resource_demand_scheduler.debug_string(
             nodes, self.pending_launches.breakdown(),
             self.load_metrics.get_resource_utilization()),
     ]
     status_text = "".join(sections)

     # Publishing is skipped when the internal KV store is not ready yet.
     if _internal_kv_initialized():
         _internal_kv_put(
             DEBUG_AUTOSCALING_STATUS, status_text, overwrite=True)
     logger.debug(status_text)
Exemplo n.º 17
0
    def put(self, key, val):
        """Store ``val`` under ``key``, replacing any existing value.

        Args:
            key (str): Name to store the value under; run through
                ``_format_key`` before it is written.
            val (bytes): Payload to store.

        Raises:
            TypeError: If ``key`` is not a str or ``val`` is not bytes.
        """
        # Validate both arguments before touching the store.
        key_is_str = isinstance(key, str)
        if not key_is_str:
            raise TypeError("key must be a string, got: {}.".format(type(key)))
        val_is_bytes = isinstance(val, bytes)
        if not val_is_bytes:
            raise TypeError("val must be bytes, got: {}.".format(type(val)))

        storage_key = self._format_key(key)
        ray_kv._internal_kv_put(storage_key, val, overwrite=True)
Exemplo n.º 18
0
def legacy_log_info_string(autoscaler, nodes):
    """Build the legacy cluster status text, publish it and log it.

    Args:
        autoscaler: Object providing ``load_metrics``,
            ``resource_demand_scheduler`` and ``pending_launches``.
        nodes: Passed to ``info_string`` and to the scheduler's
            ``debug_string``.
    """
    # Entries are evaluated top to bottom and concatenated as-is.
    sections = [
        "Cluster status: ",
        info_string(autoscaler, nodes),
        "\n",
        autoscaler.load_metrics.info_string(),
        "\n",
        autoscaler.resource_demand_scheduler.debug_string(
            nodes,
            autoscaler.pending_launches.breakdown(),
            autoscaler.load_metrics.get_resource_utilization(),
        ),
    ]
    status_text = "".join(sections)

    # Publishing is skipped when the internal KV store is not ready yet.
    if _internal_kv_initialized():
        _internal_kv_put(
            DEBUG_AUTOSCALING_STATUS_LEGACY, status_text, overwrite=True
        )
    logger.debug(status_text)
Exemplo n.º 19
0
 def update(self):
     """Run one autoscaling pass, tolerating a bounded number of failures.

     Any exception from the pass is logged, published to the internal KV
     store (when initialized) and counted. It is re-raised only once the
     failure count exceeds ``max_failures``.
     """
     try:
         self.reset(errors_fatal=False)
         self._update()
     except Exception as e:
         logger.exception("StandardAutoscaler: Error during autoscaling.")
         if _internal_kv_initialized():
             error_text = str(e)
             _internal_kv_put(
                 DEBUG_AUTOSCALING_ERROR, error_text, overwrite=True)

         self.num_failures += 1
         if not self.num_failures > self.max_failures:
             # Still within the failure budget: swallow and carry on.
             return
         logger.critical("StandardAutoscaler: Too many errors, abort.")
         raise e
Exemplo n.º 20
0
 def do_remote(self, arg):
     """remote
     Skip into the next remote call.
     """
     breakpoint_id = self._breakpoint_uuid

     # Mark the global worker so the next task drops into the debugger.
     ray.worker.global_worker.debugger_breakpoint = breakpoint_id

     # Leave a marker, tagged with the current job id, telling the debug
     # loop to connect to the next task.
     job_id_hex = ray.get_runtime_context().job_id.hex()
     payload = json.dumps({
         "job_id": job_id_hex,
     })
     continue_key = "RAY_PDB_CONTINUE_{}".format(breakpoint_id)
     _internal_kv_put(continue_key, payload)

     # Restore this session's state and drop the connection before
     # continuing execution.
     self.__restore()
     self.handle.connection.close()
     return Pdb.do_continue(self, arg)
Exemplo n.º 21
0
    def put(self, key: str, val: bytes) -> bool:
        """Put the key-value pair into the store.

        Any existing value for the key is overwritten. The entry is written
        to the Serve KV namespace under the key produced by
        ``get_storage_key``.

        Args:
            key (str)
            val (bytes)

        Returns:
            The result of the underlying ``_internal_kv_put`` call, so the
            method honors its declared ``bool`` return type instead of always
            returning None.
            NOTE(review): presumably this flags whether the key already
            existed - confirm against ``_internal_kv_put``.

        Raises:
            TypeError: If ``key`` is not a str or ``val`` is not bytes.
        """
        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))
        if not isinstance(val, bytes):
            raise TypeError("val must be bytes, got: {}.".format(type(val)))

        return ray_kv._internal_kv_put(
            self.get_storage_key(key),
            val,
            overwrite=True,
            namespace=ray_constants.KV_NAMESPACE_SERVE)
Exemplo n.º 22
0
def _store_package_in_gcs(
        pkg_uri: str,
        data: bytes,
        logger: Optional[logging.Logger] = default_logger) -> int:
    """Store package data in the internal KV store under ``pkg_uri``.

    Args:
        pkg_uri: Key to store the data under.
        data: The package's bytes.
        logger: Logger used for progress messages. ``None`` falls back to
            the module's default logger.

    Returns:
        The number of bytes in ``data``.

    Raises:
        RuntimeError: If ``data`` is GCS_STORAGE_MAX_SIZE bytes or larger.
    """
    # The parameter is declared Optional, so tolerate an explicit None
    # instead of failing on the first ``logger.info`` call.
    if logger is None:
        logger = default_logger

    file_size = len(data)
    size_str = _mib_string(file_size)
    if file_size >= GCS_STORAGE_MAX_SIZE:
        raise RuntimeError(
            f"Package size ({size_str}) exceeds the maximum size of "
            f"{_mib_string(GCS_STORAGE_MAX_SIZE)}. You can exclude large "
            "files using the 'excludes' option to the runtime_env.")

    logger.info(f"Pushing file package '{pkg_uri}' ({size_str}) to "
                "Ray cluster...")
    _internal_kv_put(pkg_uri, data)
    logger.info(f"Successfully pushed file package '{pkg_uri}'.")
    return file_size
Exemplo n.º 23
0
    def _handle_failure(self, error):
        """Report a monitor-loop failure and tear down autoscaler workers.

        Logs the active exception, records the failure message in the
        internal KV store (when it is initialized) and pushes it to drivers
        through Redis as a MONITOR_DIED_ERROR.

        Args:
            error: The failure being reported; it is interpolated into the
                message that gets stored and pushed.
        """
        logger.exception("Error in monitor loop")

        autoscaler = self.autoscaler
        if autoscaler is not None:
            autoscaler.kill_workers()
            # Take down autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

        # Push the error to all current and future drivers.
        failure_text = (
            f"The autoscaler failed with the following error:\n{error}")
        if _internal_kv_initialized():
            _internal_kv_put(
                DEBUG_AUTOSCALING_ERROR, failure_text, overwrite=True)

        redis_client = ray._private.services.create_redis_client(
            args.redis_address, password=args.redis_password)
        from ray.utils import push_error_to_driver_through_redis
        push_error_to_driver_through_redis(
            redis_client, ray_constants.MONITOR_DIED_ERROR, failure_text)
Exemplo n.º 24
0
    def _run(self):
        """Run the monitor loop.

        Each pass refreshes load metrics, resource requests and the event
        summary, optionally runs the autoscaler, and publishes a JSON status
        blob to the internal KV store. An exception raised during a pass is
        logged and the loop carries on. The loop ends once ``stop_event`` is
        set.
        """
        while True:
            try:
                if self.stop_event and self.stop_event.is_set():
                    break

                self.update_load_metrics()
                self.update_resource_requests()
                self.update_event_summary()

                load_report = asdict(self.load_metrics.summary())
                status = {
                    "load_metrics_report": load_report,
                    "time": time.time(),
                    "monitor_pid": os.getpid(),
                }

                # Run an autoscaling pass when an autoscaler is configured.
                if self.autoscaler:
                    self.autoscaler.update()
                    autoscaler_summary = self.autoscaler.summary()
                    status["autoscaler_report"] = asdict(autoscaler_summary)

                    for event_msg in self.event_summarizer.summary():
                        # Every line needs the prefix for it to get pushed to
                        # the driver logs.
                        for line in event_msg.split("\n"):
                            logger.info(
                                "{}{}".format(
                                    ray_constants.LOG_PREFIX_EVENT_SUMMARY, line
                                )
                            )
                    self.event_summarizer.clear()

                serialized = json.dumps(status)
                if _internal_kv_initialized():
                    _internal_kv_put(
                        ray_constants.DEBUG_AUTOSCALING_STATUS,
                        serialized,
                        overwrite=True,
                    )
            except Exception:
                logger.exception("Monitor: Execution exception. Trying again...")

            # Pause for one autoscaler update interval before the next pass.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
Exemplo n.º 25
0
def _register_actor(name, actor_handle):
    """Store a pickled actor handle under the key derived from ``name``.

    Args:
        name: Name to register the actor under; must be a string.
        actor_handle: The ``ActorHandle`` to associate with ``name``.

    Raises:
        TypeError: If ``name`` is not a string or ``actor_handle`` is not an
            ``ActorHandle``.
        ValueError: If ``_get_actor(name)`` succeeds, i.e. the name is
            already taken.
    """
    if not isinstance(name, str):
        raise TypeError("The name argument must be a string.")
    if not isinstance(actor_handle, ray.actor.ActorHandle):
        raise TypeError("The actor_handle argument must be an ActorHandle "
                        "object.")
    actor_name = _calculate_key(name)

    # Probe for an existing registration; a ValueError from the lookup is
    # treated as "not registered yet".
    try:
        _get_actor(name)
    except ValueError:
        name_taken = False
    else:
        name_taken = True

    if name_taken:
        raise ValueError("An actor with name={} already exists".format(name))

    # NOTE(review): the write uses overwrite=True, so a registration that
    # appears between the probe above and this put would be replaced -
    # confirm this is intended.
    pickled_handle = pickle.dumps(actor_handle)
    _internal_kv_put(actor_name, pickled_handle, overwrite=True)
Exemplo n.º 26
0
    def run(self):
        """Advance through the steps, publishing progress to the KV store.

        Sleeps ``interval_s`` seconds per step and writes the new step number
        after each one. Stops early when ``stopped`` is set. On exit, marks
        itself stopped and publishes "DONE".

        Returns:
            The string "DONE".
        """
        core_worker = ray.worker.global_worker.core_worker
        worker_id = core_worker.get_actor_id()

        while self.current_step <= self.total_steps:
            if self.stopped:
                print("Stop called or reached final step.")
                break

            print(f"Sleeping {self.interval_s} secs to executing "
                  f"step {self.current_step}")
            time.sleep(self.interval_s)
            self.current_step += 1
            ray_kv._internal_kv_put(
                f"JOB:{worker_id}", self.current_step, overwrite=True)

        self.stopped = True
        ray_kv._internal_kv_put(f"JOB:{worker_id}", "DONE", overwrite=True)
        return "DONE"
Exemplo n.º 27
0
 def update(self):
     """Run one autoscaling pass, tolerating a bounded number of failures.

     Any exception from the pass is logged and published to the internal KV
     store (when initialized). It counts against ``max_failures`` unless it
     is a ``MaxRetryError`` on a "kubernetes" provider. The exception is
     re-raised once the failure count exceeds ``max_failures``.
     """
     try:
         self.reset(errors_fatal=False)
         self._update()
     except Exception as e:
         logger.exception("StandardAutoscaler: Error during autoscaling.")
         if _internal_kv_initialized():
             error_text = str(e)
             _internal_kv_put(
                 DEBUG_AUTOSCALING_ERROR, error_text, overwrite=True)

         # Don't abort the autoscaler if the K8s API server is down.
         # https://github.com/ray-project/ray/issues/12255
         provider_type = self.config["provider"]["type"]
         k8s_api_unreachable = (provider_type == "kubernetes"
                                and isinstance(e, MaxRetryError))
         if not k8s_api_unreachable:
             self.num_failures += 1

         if not self.num_failures > self.max_failures:
             # Still within the failure budget: swallow and carry on.
             return
         logger.critical("StandardAutoscaler: Too many errors, abort.")
         raise e
Exemplo n.º 28
0
def _store_package_in_gcs(
    pkg_uri: str,
    data: bytes,
    logger: Optional[logging.Logger] = default_logger,
) -> int:
    """Stores package data in the Global Control Store (GCS).

    Args:
        pkg_uri: The GCS key to store the data in.
        data: The serialized package's bytes to store in the GCS.
        logger (Optional[logging.Logger]): The logger used by this function.
            ``None`` falls back to the module's default logger.

    Return:
        int: Size of data

    Raises:
        RuntimeError: If the upload to the GCS fails.
        ValueError: If the data's size exceeds GCS_STORAGE_MAX_SIZE.
    """
    # The parameter is declared Optional, so tolerate an explicit None
    # instead of failing on the first ``logger.info`` call.
    if logger is None:
        logger = default_logger

    file_size = len(data)
    size_str = _mib_string(file_size)
    if file_size >= GCS_STORAGE_MAX_SIZE:
        raise ValueError(
            f"Package size ({size_str}) exceeds the maximum size of "
            f"{_mib_string(GCS_STORAGE_MAX_SIZE)}. You can exclude large "
            "files using the 'excludes' option to the runtime_env."
        )

    logger.info(f"Pushing file package '{pkg_uri}' ({size_str}) to Ray cluster...")
    try:
        _internal_kv_put(pkg_uri, data)
    except Exception as e:
        # Wrap any failure with the key and a short preview of the payload,
        # keeping the original exception as the cause.
        raise RuntimeError(
            "Failed to store package in the GCS.\n"
            f"  - GCS URI: {pkg_uri}\n"
            f"  - Package data ({size_str}): {data[:15]}...\n"
        ) from e
    logger.info(f"Successfully pushed file package '{pkg_uri}'.")
    return file_size
Exemplo n.º 29
0
def _put_library_usage(library_usage: str):
    """Record that ``library_usage`` was used.

    The usage is written as a key (with an empty value) to the usage-stats
    namespace of the internal KV store and, in script mode, also recorded in
    the Ray temp directory. Both writes are best effort: failures are logged
    at debug level and swallowed. The internal KV store must already be
    initialized (asserted).

    Args:
        library_usage: Name of the library whose usage is being recorded.
    """
    assert _internal_kv_initialized()
    try:
        _internal_kv_put(
            f"{usage_constant.LIBRARY_USAGE_PREFIX}{library_usage}",
            "",
            namespace=usage_constant.USAGE_STATS_NAMESPACE,
        )
    except Exception as e:
        logger.debug(f"Failed to put library usage, {e}")

    # Record the library usage to the temp (e.g., /tmp/ray) folder.
    # Note that although we always write this file, it is not
    # reported when the usage stats is disabled.
    if ray._private.worker.global_worker.mode == ray.SCRIPT_MODE:
        try:
            lib_usage_recorder = LibUsageRecorder(
                ray._private.utils.get_ray_temp_dir())
            lib_usage_recorder.put_lib_usage(library_usage)
        except Exception as e:
            # The recorder targets the Ray temp dir, so the message names
            # the temp folder rather than the home folder.
            logger.debug(
                f"Failed to write a library usage to the temp folder, {e}")
Exemplo n.º 30
0
    def _run(self):
        """Run the monitor.

        Subscribes to the autoscaler resource-request channel and then loops
        forever: refresh the raylet map and load metrics, optionally run the
        autoscaler, publish a JSON status blob, process a round of messages
        and sleep for AUTOSCALER_UPDATE_INTERVAL_S seconds.
        """

        self.subscribe(ray.ray_constants.AUTOSCALER_RESOURCE_REQUEST_CHANNEL)

        while True:
            # Refresh the monitor's view before reporting on it.
            self.update_raylet_map()
            self.update_load_metrics()

            load_summary = self.load_metrics.summary()
            report = {"load_metrics_report": load_summary._asdict()}

            # Run an autoscaling pass, when an autoscaler is configured, and
            # attach its summary to the report.
            if self.autoscaler:
                self.autoscaler.update()
                autoscaler_summary = self.autoscaler.summary()
                report["autoscaler_report"] = autoscaler_summary._asdict()

            serialized = json.dumps(report)
            if _internal_kv_initialized():
                _internal_kv_put(
                    DEBUG_AUTOSCALING_STATUS, serialized, overwrite=True)

            # Handle one round of messages from the subscription channels.
            self.process_messages()

            # Pause for one autoscaler update interval before the next pass.
            time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)
Exemplo n.º 31
0
def register_actor(name, actor_handle):
    """Register a named actor under a string key.

    Args:
        name: The name of the named actor.
        actor_handle: The actor object to be associated with this name.

    Raises:
        TypeError: If ``name`` is not a string or ``actor_handle`` is not an
            ``ActorHandle``.
        ValueError: If the put reports that the name already exists.
    """
    if not isinstance(name, str):
        raise TypeError("The name argument must be a string.")
    if not isinstance(actor_handle, ray.actor.ActorHandle):
        raise TypeError("The actor_handle argument must be an ActorHandle "
                        "object.")

    # Add the actor to Redis if it does not already exist; the put's return
    # value tells us whether the key was already taken.
    name_taken = _internal_kv_put(
        _calculate_key(name), pickle.dumps(actor_handle))
    if not name_taken:
        return

    # The registration failed, so erase the new actor handle that was added
    # when pickling the actor handle.
    actor_handle._ray_new_actor_handles.pop()
    raise ValueError(
        "Error: the actor with name={} already exists".format(name))
Exemplo n.º 32
0
 def _ack_reads(self, offset):
     """Store ``offset`` under ``read_ack_key`` when ``max_size`` is positive.

     Args:
         offset: Value written (overwriting any previous one) to the
             internal KV store.
     """
     if not self.max_size > 0:
         # Nothing is recorded unless max_size is positive.
         return
     internal_kv._internal_kv_put(self.read_ack_key, offset, overwrite=True)
Exemplo n.º 33
0
 def flush_values(self):
     """Write every pending value to the internal KV store, then clear them.

     Pending entries are keyed by ``(category, key)`` pairs. Each one is
     stored, overwriting any existing value, under the name produced by
     ``_make_key``.
     """
     pending = self._to_flush
     for composite_key, value in pending.items():
         category, key = composite_key
         _internal_kv_put(_make_key(category, key), value, overwrite=True)
     pending.clear()