예제 #1
0
파일: head.py 프로젝트: nikitavemuri/ray
 async def _gcs_check_alive(self):
     """Ping the GCS once and track consecutive RPC failures.

     Bumps the check-alive sequence number, issues a CheckAlive RPC to
     the GCS heartbeat stub, and resets the RPC-error counter on a
     healthy reply. An AioRpcError increments the counter; once it
     exceeds the configured limit, the dashboard process terminates
     itself. Any other failure is logged and swallowed.
     """
     try:
         self._gcs_check_alive_seq += 1
         request = gcs_service_pb2.CheckAliveRequest(
             seq=self._gcs_check_alive_seq)
         reply = await self._gcs_heartbeat_info_stub.CheckAlive(
             request, timeout=2)
         if reply.status.code != 0:
             raise Exception(
                 f"Failed to CheckAlive: {reply.status.message}")
         # A successful round trip clears the consecutive-error count.
         self._gcs_rpc_error_counter = 0
     except aiogrpc.AioRpcError:
         logger.exception(
             "Got AioRpcError when checking GCS is alive, seq=%s.",
             self._gcs_check_alive_seq)
         self._gcs_rpc_error_counter += 1
         max_errors = dashboard_consts.GCS_CHECK_ALIVE_MAX_COUNT_OF_RPC_ERROR
         if self._gcs_rpc_error_counter > max_errors:
             logger.error(
                 "Dashboard suicide, the GCS RPC error count %s > %s",
                 self._gcs_rpc_error_counter, max_errors)
             # TODO(fyrestone): Do not use ray.state in
             # PrometheusServiceDiscoveryWriter.
             # os._exit() is used instead of sys.exit() to avoid hanging
             # during ray shutdown(). Please refer to:
             # https://github.com/ray-project/ray/issues/16328
             os._exit(-1)
     except Exception:
         logger.exception("Error checking GCS is alive, seq=%s.",
                          self._gcs_check_alive_seq)
예제 #2
0
def check_health(address: str, timeout=2) -> bool:
    """Checks Ray cluster health, before / without actually connecting to the
    cluster via ray.init().

    Args:
        address: Ray cluster / GCS address string, e.g. ip:port.
        timeout: request timeout in seconds for the CheckAlive RPC.
    Returns:
        Returns True if the cluster is running and has matching Ray version.
        Returns False if no service is running.
        Raises an exception otherwise.
    """
    req = gcs_service_pb2.CheckAliveRequest()
    try:
        channel = create_gcs_channel(address)
        stub = gcs_service_pb2_grpc.HeartbeatInfoGcsServiceStub(channel)
        resp = stub.CheckAlive(req, timeout=timeout)
    except grpc.RpcError:
        # Nothing is listening at `address` (or it is unreachable).
        return False
    if resp.status.code != GcsCode.OK:
        raise RuntimeError(
            f"GCS running at {address} is unhealthy: {resp.status}")
    # BUG FIX: proto3 string fields are never None -- an unset field is the
    # empty string -- so the original `resp.ray_version is None` check was
    # dead code and the fallback below could never fire. Older GCS servers
    # (<= 1.12) leave ray_version unset, i.e. "".
    if not resp.ray_version:
        resp.ray_version = "<= 1.12"
    if resp.ray_version != ray.__version__:
        raise RuntimeError(f"Ray cluster at {address} has version "
                           f"{resp.ray_version}, but this process is running "
                           f"Ray version {ray.__version__}.")
    return True
예제 #3
0
    async def check_alive(self,
                          node_ips: List[bytes],
                          timeout: Optional[float] = None) -> List[bool]:
        """Ask the GCS whether each raylet in ``node_ips`` is alive.

        Args:
            node_ips: Raylet addresses to probe.
            timeout: Optional RPC timeout in seconds.
        Returns:
            One bool per input address, True when that raylet is alive.
        Raises:
            RuntimeError: if the GCS itself reports a non-OK status.
        """
        request = gcs_service_pb2.CheckAliveRequest(raylet_address=node_ips)
        reply = await self._heartbeat_info_stub.CheckAlive(
            request, timeout=timeout)
        if reply.status.code != GcsCode.OK:
            raise RuntimeError(
                f"GCS running at {self._channel.address} is unhealthy: {reply.status}"
            )
        return list(reply.raylet_alive)
예제 #4
0
 async def _check_once(self) -> bool:
     """Perform a single GCS liveness probe.

     Returns:
         True if the CheckAlive RPC succeeded with an OK (zero) status
         code; False on a non-zero status or on an RPC error (e.g.
         deadline exceeded).
     """
     request = gcs_service_pb2.CheckAliveRequest()
     try:
         reply = await self.gcs_heartbeat_info_stub.CheckAlive(
             request, timeout=dashboard_consts.GCS_CHECK_ALIVE_RPC_TIMEOUT)
         if reply.status.code != 0:
             # BUG FIX: logger.exception() outside an `except` block appends
             # a bogus "NoneType: None" traceback to the log record. There is
             # no exception here -- only a bad status -- so use logger.error().
             logger.error(
                 f"Failed to CheckAlive: {reply.status.message}")
             return False
     except aiogrpc.AioRpcError:  # Deadline Exceeded
         logger.exception("Got AioRpcError when checking GCS is alive")
         return False
     return True