def get(self, key: str) -> Optional[bytes]:
    """Fetch the bytes stored under ``key`` from the S3 bucket.

    Args:
        key (str): Logical key; it is translated to the S3 object key
            with ``self.get_storage_key``.

    Returns:
        The object body as bytes, or None when S3 answers ``NoSuchKey``.

    Raises:
        TypeError: If ``key`` is not a string.
        ClientError: Any client error other than ``NoSuchKey``; it is
            logged and then re-raised.
    """
    if not isinstance(key, str):
        raise TypeError(f"key must be a string, got: {type(key)}.")

    try:
        s3_object = self._s3.get_object(
            Bucket=self._bucket, Key=self.get_storage_key(key)
        )
        return s3_object["Body"].read()
    except ClientError as e:
        error_info = e.response["Error"]
        if error_info["Code"] != "NoSuchKey":
            message = error_info["Message"]
            logger.error(
                "Encountered ClientError while calling get() "
                f"in RayExternalKVStore: {message}"
            )
            raise e
        # A missing key is an expected outcome, not a failure.
        logger.warning(f"No such key in s3 for key = {key}")
        return None
def _process_update(self, updates: Dict[str, UpdatedObject]):
    """Handle the result of one long-poll request and notify listeners.

    Args:
        updates: Normally a mapping from key to the update received for
            that key. The error raised by the poll (``RayActorError`` or
            ``RayTaskError``) may be passed in its place and is handled
            here.
    """
    if isinstance(updates, ray.exceptions.RayActorError):
        # This can happen during shutdown where the controller is
        # intentionally killed, the client should just gracefully
        # exit.
        logger.debug("LongPollClient failed to connect to host. "
                     "Shutting down.")
        return

    if isinstance(updates, ray.exceptions.RayTaskError):
        # This can happen during shutdown where the controller doesn't
        # contain this key, we will just repull.
        # NOTE(simon): should we repull or just wait in the long poll
        # host?
        if not isinstance(updates.as_instanceof_cause(), ValueError):
            logger.error("LongPollHost errored\n" + updates.traceback_str)
        self._poll_next()
        return

    # Before we process the updates and calling callbacks, kick off
    # another poll so we can pipeline the polling and processing.
    self._poll_next()
    logger.debug("LongPollClient received updates for keys: "
                 f"{list(updates.keys())}.")
    for key, update in updates.items():
        self.object_snapshots[key] = update.object_snapshot
        self.snapshot_ids[key] = update.snapshot_id
        callback = self.key_listeners[key]
        if self.event_loop is None:
            callback(update.object_snapshot)
        else:
            # Closures are late-binding: a plain lambda would look up
            # `callback` and `update` only when the event loop runs it,
            # by which time the loop has moved on and every scheduled
            # call would see the last key's values. Bind the current
            # values as defaults instead.
            def invoke(callback=callback, snapshot=update.object_snapshot):
                callback(snapshot)

            self.event_loop.call_soon_threadsafe(invoke)
def run_one_wrk_trial(
    trial_length: str,
    num_connections: int,
    http_host: str,
    http_port: str,
    endpoint: str = "",
) -> str:
    """Run a single ``wrk`` trial and return its standard output.

    Args:
        trial_length: Value passed to wrk's ``-d`` option.
        num_connections: Value passed to wrk's ``-c`` option.
        http_host: Host part of the target URL.
        http_port: Port part of the target URL.
        endpoint: Path appended after ``http://host:port/``.

    Returns:
        The decoded stdout of the ``wrk`` process. Anything written to
        stderr is logged as an error.
    """
    proc = subprocess.Popen(
        [
            "wrk",
            "-c",
            str(num_connections),
            "-t",
            str(NUM_CPU_PER_NODE),
            "-d",
            trial_length,
            "--latency",
            f"http://{http_host}:{http_port}/{endpoint}",
        ],
        stdout=PIPE,
        stderr=PIPE,
    )
    # communicate() drains both pipes while waiting for the process to
    # exit. Calling wait() first can deadlock when the child blocks on a
    # full pipe buffer that nobody is reading.
    out, err = proc.communicate()
    err_text = err.decode()
    if err_text != "":
        logger.error(err_text)
    return out.decode()
def _process_update(self, updates: Dict[str, UpdatedObject]):
    """Consume one long-poll result and fan it out to the key listeners.

    Args:
        updates: Mapping from key to the update received for it. A
            ``RayActorError`` or ``RayTaskError`` instance may be passed
            instead; the former ends processing, the latter is logged and
            followed by another poll.
    """
    if isinstance(updates, ray.exceptions.RayActorError):
        # Expected during shutdown, when the controller is killed on
        # purpose: exit quietly.
        logger.debug("LongPollClient failed to connect to host. Shutting down.")
        return

    if isinstance(updates, ray.exceptions.RayTaskError):
        # The controller hit an error (a bug or some undesired state);
        # report it and keep polling.
        logger.error("LongPollHost errored\n" + updates.traceback_str)
        self._poll_next()
        return

    logger.debug(
        f"LongPollClient {self} received updates for keys: "
        f"{list(updates.keys())}."
    )
    for key, update in updates.items():
        self.object_snapshots[key] = update.object_snapshot
        self.snapshot_ids[key] = update.snapshot_id

        # Default arguments capture this iteration's listener and
        # snapshot; closures would otherwise bind late.
        # https://docs.python-guide.org/writing/gotchas/#late-binding-closures  # noqa: E501
        def notify(listener=self.key_listeners[key],
                   snapshot=update.object_snapshot):
            listener(snapshot)
            self._on_callback_completed(trigger_at=len(updates))

        if self.event_loop is not None:
            self.event_loop.call_soon_threadsafe(notify)
        else:
            notify()
def _schedule_to_event_loop(self, callback):
    """Hand ``callback`` to the event loop, or stop the client.

    The callback is scheduled with ``call_soon_threadsafe`` only while
    ``self.event_loop`` is running. If it is not running (for example
    when a cached client is reused across loops), an error is logged and
    ``self.is_running`` is set to False.

    Args:
        callback: Zero-argument callable to schedule.
    """
    if not self.event_loop.is_running():
        logger.error("The event loop is closed, shutting down long poll client.")
        self.is_running = False
        return

    self.event_loop.call_soon_threadsafe(callback)
def _process_update(self, updates: Dict[str, UpdatedObject]):
    """Consume one long-poll result and fan it out to the key listeners.

    Args:
        updates: Mapping from key to the update received for it. An
            exception instance may be passed instead: ``RayActorError``
            and ``ConnectionError`` stop the client, ``RayTaskError`` is
            logged and followed by another poll.
    """
    if isinstance(updates, ray.exceptions.RayActorError):
        # Expected during shutdown, when the controller is killed on
        # purpose: exit quietly.
        logger.debug("LongPollClient failed to connect to host. Shutting down.")
        self.is_running = False
        return

    if isinstance(updates, ConnectionError):
        logger.warning("LongPollClient connection failed, shutting down.")
        self.is_running = False
        return

    if isinstance(updates, ray.exceptions.RayTaskError):
        cause = updates.as_instanceof_cause()
        if not isinstance(cause, asyncio.TimeoutError):
            # The controller hit an error (a bug or some undesired
            # state).
            logger.error("LongPollHost errored\n" + updates.traceback_str)
        else:
            logger.debug("LongPollClient polling timed out. Retrying.")
        self._poll_next()
        return

    logger.debug(
        f"LongPollClient {self} received updates for keys: "
        f"{list(updates.keys())}."
    )
    for key, update in updates.items():
        self.object_snapshots[key] = update.object_snapshot
        self.snapshot_ids[key] = update.snapshot_id

        # Default arguments capture this iteration's listener and
        # snapshot; closures would otherwise bind late.
        # https://docs.python-guide.org/writing/gotchas/#late-binding-closures  # noqa: E501
        def notify(listener=self.key_listeners[key],
                   snapshot=update.object_snapshot):
            listener(snapshot)
            self._on_callback_completed(trigger_at=len(updates))

        if self.event_loop is None:
            notify()
        elif self.event_loop.is_running():
            # Only schedule while the loop is running; it may not be if
            # a cached client is reused across loops.
            self.event_loop.call_soon_threadsafe(notify)
        else:
            logger.error(
                "The event loop is closed, shutting down long poll client."
            )
            self.is_running = False
async def run_control_loop(self) -> None:
    """Periodically update the HTTP state and then the backend state.

    Each iteration takes ``self.write_lock``, calls ``update()`` on
    ``self.http_state`` and ``self.backend_state`` in that order, and
    logs (without propagating) any exception either call raises. The
    coroutine then sleeps for ``CONTROL_LOOP_PERIOD_S`` seconds and
    repeats forever.
    """
    managers = (("HTTP", "http_state"), ("backend", "backend_state"))
    while True:
        async with self.write_lock:
            for label, attr_name in managers:
                try:
                    getattr(self, attr_name).update()
                except Exception as e:
                    logger.error(f"Exception updating {label} state: {e}")

        await asyncio.sleep(CONTROL_LOOP_PERIOD_S)
def delete(self, key: str):
    """Remove the S3 object stored under ``key``.

    Args:
        key (str): Logical key; it is translated to the S3 object key
            with ``self.get_storage_key``.

    Raises:
        TypeError: If ``key`` is not a string.
        ClientError: Any error reported by the S3 client; it is logged
            and then re-raised.
    """
    if not isinstance(key, str):
        raise TypeError(f"key must be a string, got: {type(key)}.")

    try:
        self._s3.delete_object(
            Bucket=self._bucket,
            Key=self.get_storage_key(key),
        )
    except ClientError as e:
        message = e.response["Error"]["Message"]
        logger.error(
            "Encountered ClientError while calling delete() "
            f"in RayExternalKVStore: {message}"
        )
        raise e
def delete(self, key: str):
    """Remove the blob stored under ``key`` from the bucket.

    A ``NotFound`` error is logged and swallowed, so deleting a key that
    is not present does not raise.

    Args:
        key (str): Logical key; it is translated to the blob name with
            ``self.get_storage_key``.

    Raises:
        TypeError: If ``key`` is not a string.
    """
    if not isinstance(key, str):
        raise TypeError(f"key must be a string, got: {type(key)}.")

    try:
        blob_name = self.get_storage_key(key)
        self._bucket.blob(blob_name=blob_name).delete()
    except NotFound:
        logger.error(
            "Encountered ClientError while calling delete() "
            "in RayExternalKVStore - "
            f"Blob {blob_name} was not found!"
        )
def _process_update(self, updates: Dict[str, UpdatedObject]):
    """Consume one long-poll result and fan it out to the key listeners.

    Args:
        updates: Mapping from key to the update received for it. An
            exception instance may be passed instead: ``RayActorError``
            and ``ConnectionError`` stop the client, ``RayTaskError`` is
            logged and followed by another poll.
    """
    if isinstance(updates, (ray.exceptions.RayActorError, ConnectionError)):
        if isinstance(updates, ray.exceptions.RayActorError):
            # Expected during shutdown, when the controller is killed on
            # purpose: exit quietly.
            logger.debug(
                "LongPollClient failed to connect to host. Shutting down."
            )
        else:
            logger.warning("LongPollClient connection failed, shutting down.")
        self.is_running = False
        return

    if isinstance(updates, ray.exceptions.RayTaskError):
        cause = updates.as_instanceof_cause()
        if not isinstance(cause, asyncio.TimeoutError):
            # The controller hit an error (a bug or some undesired
            # state).
            logger.error("LongPollHost errored\n" + updates.traceback_str)
        else:
            logger.debug("LongPollClient polling timed out. Retrying.")
        # The next poll must be issued from the event loop so it works
        # in Ray Client.
        # See https://github.com/ray-project/ray/issues/20971
        self._schedule_to_event_loop(self._poll_next)
        return

    logger.debug(
        f"LongPollClient {self} received updates for keys: "
        f"{list(updates.keys())}."
    )
    for key, update in updates.items():
        self.object_snapshots[key] = update.object_snapshot
        self.snapshot_ids[key] = update.snapshot_id

        # Default arguments capture this iteration's listener and
        # snapshot; closures would otherwise bind late.
        # https://docs.python-guide.org/writing/gotchas/#late-binding-closures  # noqa: E501
        def notify(listener=self.key_listeners[key],
                   snapshot=update.object_snapshot):
            listener(snapshot)
            self._on_callback_completed(trigger_at=len(updates))

        self._schedule_to_event_loop(notify)
def put(self, key: str, val: bytes) -> bool:
    """Store ``val`` under ``key`` in the S3 bucket.

    Args:
        key (str): Logical key; it is translated to the S3 object key
            with ``self.get_storage_key``.
        val (bytes): Payload to upload.

    Raises:
        TypeError: If ``key`` is not a string or ``val`` is not bytes.
        ClientError: Any error reported by the S3 client; it is logged
            and then re-raised.
    """
    if not isinstance(key, str):
        raise TypeError(f"key must be a string, got: {type(key)}.")
    if not isinstance(val, bytes):
        raise TypeError(f"val must be bytes, got: {type(val)}.")

    try:
        self._s3.put_object(
            Body=val,
            Bucket=self._bucket,
            Key=self.get_storage_key(key),
        )
    except ClientError as e:
        message = e.response["Error"]["Message"]
        logger.error(
            "Encountered ClientError while calling put() "
            f"in RayExternalKVStore: {message}"
        )
        raise e
def put(self, key: str, val: bytes) -> bool:
    """Upload ``val`` as the blob stored under ``key``.

    Args:
        key (str): Logical key; it is translated to the blob name with
            ``self.get_storage_key``.
        val (bytes): Payload to upload.

    Raises:
        TypeError: If ``key`` is not a string or ``val`` is not bytes.
        Exception: Any error raised while creating or uploading the blob;
            it is logged and then re-raised.
    """
    if not isinstance(key, str):
        raise TypeError(f"key must be a string, got: {type(key)}.")
    if not isinstance(val, bytes):
        raise TypeError(f"val must be bytes, got: {type(val)}.")

    try:
        target_blob = self._bucket.blob(blob_name=self.get_storage_key(key))
        target_blob.upload_from_file(io.BytesIO(val), num_retries=5)
    except Exception as e:
        message = str(e)
        logger.error(
            "Encountered ClientError while calling put() "
            f"in RayExternalKVStore: {message}"
        )
        raise e
def _scale_backend_replicas(
        self,
        backend_tag: BackendTag,
        num_replicas: int,
) -> bool:
    """Scale the given backend to the number of replicas.

    NOTE: this does not actually start or stop the replicas, but instead
    adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP.
    The caller is responsible for then first writing a checkpoint and then
    actually starting/stopping the intended replicas. This avoids
    inconsistencies with starting/stopping a replica and then crashing
    before writing a checkpoint.

    Args:
        backend_tag: Tag of the backend to scale. It must already be a key
            of ``self._backend_metadata``.
        num_replicas: Desired number of replicas; must be >= 0.

    Returns:
        False when the number of replicas in the SHOULD_START, STARTING
        and RUNNING states already equals ``num_replicas`` (nothing is
        changed); True otherwise.
    """
    logger.debug("Scaling backend '{}' to {} replicas".format(
        backend_tag, num_replicas))
    assert (backend_tag in self._backend_metadata
            ), "Backend {} is not registered.".format(backend_tag)
    assert num_replicas >= 0, ("Number of replicas must be"
                               " greater than or equal to 0.")

    # Only SHOULD_START, STARTING and RUNNING replicas are counted as
    # current; other states are not included in this sum.
    current_num_replicas = sum([
        len(self._replicas[backend_tag][ReplicaState.SHOULD_START]),
        len(self._replicas[backend_tag][ReplicaState.STARTING]),
        len(self._replicas[backend_tag][ReplicaState.RUNNING]),
    ])

    # Positive: replicas to add. Negative: replicas to remove.
    delta_num_replicas = num_replicas - current_num_replicas

    backend_info: BackendInfo = self._backend_metadata[backend_tag]
    if delta_num_replicas == 0:
        return False

    elif delta_num_replicas > 0:
        # One resource requirement per replica to be added; the result is
        # consumed below with all()/sum(), i.e. one truthy/falsy entry per
        # requested replica.
        can_schedule = try_schedule_resources_on_nodes(requirements=[
            backend_info.replica_config.resource_dict
            for _ in range(delta_num_replicas)
        ])

        # A failed resource check is only reported; the replicas are still
        # queued below.
        if _RESOURCE_CHECK_ENABLED and not all(can_schedule):
            num_possible = sum(can_schedule)
            logger.error(
                "Cannot scale backend {} to {} replicas. Ray Serve tried "
                "to add {} replicas but the resources only allows {} "
                "to be added. This is not a problem if the cluster is "
                "autoscaling. To fix this, consider scaling to replica to "
                "{} or add more resources to the cluster. You can check "
                "avaiable resources with ray.nodes().".format(
                    backend_tag, num_replicas, delta_num_replicas,
                    num_possible, current_num_replicas + num_possible))

        logger.debug("Adding {} replicas to backend {}".format(
            delta_num_replicas, backend_tag))
        for _ in range(delta_num_replicas):
            # Replica tags are "<backend_tag>#<random letters>".
            replica_tag = "{}#{}".format(backend_tag, get_random_letters())
            self._replicas[backend_tag][ReplicaState.SHOULD_START].append(
                BackendReplica(self._controller_name, self._detached,
                               replica_tag, backend_tag))

    elif delta_num_replicas < 0:
        logger.debug("Removing {} replicas from backend '{}'".format(
            -delta_num_replicas, backend_tag))
        # NOTE(review): delta_num_replicas is negative in this branch, so
        # this check passes for any non-negative target -- confirm intent.
        assert self._target_replicas[backend_tag] >= delta_num_replicas

        for _ in range(-delta_num_replicas):
            replica_state_dict = self._replicas[backend_tag]
            # Pick the first non-empty list, preferring replicas that have
            # progressed the least: SHOULD_START, then STARTING, then
            # RUNNING.
            list_to_use = replica_state_dict[ReplicaState.SHOULD_START] \
                or replica_state_dict[ReplicaState.STARTING] \
                or replica_state_dict[ReplicaState.RUNNING]

            assert len(list_to_use), replica_state_dict
            # pop() takes the last element of the chosen list.
            replica_to_stop = list_to_use.pop()

            graceful_timeout_s = (backend_info.backend_config.
                                  experimental_graceful_shutdown_timeout_s)

            replica_to_stop.set_should_stop(graceful_timeout_s)
            self._replicas[backend_tag][ReplicaState.SHOULD_STOP].append(
                replica_to_stop)

    return True