def __init__(
    self,
    namespace: str = None,
):
    """Create the store and connect a GCS client for the current runtime.

    Args:
        namespace: Optional string recorded on ``self.namespace``; ``None``
            (or any falsy value) is recorded as the empty string.

    Raises:
        TypeError: If ``namespace`` is neither ``None`` nor a ``str``.
    """
    if namespace is not None and not isinstance(namespace, str):
        # Message previously read "must a string"; fixed the grammar.
        raise TypeError(
            "namespace must be a string, got: {}.".format(type(namespace))
        )

    # The GCS address is taken from the runtime context of the calling
    # worker/driver.
    self.gcs_client = GcsClient(address=ray.get_runtime_context().gcs_address)
    self.timeout = RAY_SERVE_KV_TIMEOUT_S
    self.namespace = namespace or ""
def try_create_gcs_client(
    address: Optional[str], redis_password: Optional[str]
) -> Optional[GcsClient]:
    """Create a GCS client for the given (possibly unspecified) address.

    The address is first normalized with ``canonicalize_bootstrap_address``.
    Depending on ``use_gcs_for_bootstrap()``, the client is then created
    either directly from that address or by going through Redis.

    Args:
        address: Bootstrap address from the command line, or ``None``.
        redis_password: Redis password; only consulted on the Redis
            bootstrap path, where ``None`` falls back to
            ``ray.ray_constants.REDIS_DEFAULT_PASSWORD``.

    Returns:
        The newly created ``GcsClient``.
    """
    bootstrap_address = canonicalize_bootstrap_address(address)

    if not use_gcs_for_bootstrap():
        # Redis bootstrap: the GCS address is discovered through Redis.
        password = redis_password
        if password is None:
            password = ray.ray_constants.REDIS_DEFAULT_PASSWORD
        return GcsClient.connect_to_gcs_by_redis_address(
            bootstrap_address, password
        )

    return GcsClient(address=bootstrap_address)
def serve_proxier(
    connection_str: str,
    address: Optional[str],
    *,
    redis_password: Optional[str] = None,
    session_dir: Optional[str] = None,
    runtime_env_agent_port: int = 0,
):
    """Start the proxying gRPC server and return a handle to it.

    Args:
        connection_str: Address the gRPC server is bound to.
        address: Cluster bootstrap address (GCS or Redis, depending on
            ``use_gcs_for_bootstrap()``); may be ``None``.
        redis_password: Password used on the Redis bootstrap path.
        session_dir: Forwarded to ``ProxyManager``.
        runtime_env_agent_port: Forwarded to ``ProxyManager``.

    Returns:
        A ``ClientServerHandle`` bundling the three servicers and the
        started gRPC server.
    """
    # The internal KV is used to upload and download working_dir, so it has
    # to be initialized before ray.init is called within the RayletServicers.
    # NOTE(edoakes): redis_address and redis_password should only be None in
    # tests.
    bootstrap_client = None
    if use_gcs_for_bootstrap():
        if address is not None:
            bootstrap_client = GcsClient(address=address)
    elif address is not None and redis_password is not None:
        bootstrap_client = GcsClient.connect_to_gcs_by_redis_address(
            address, redis_password
        )
    if bootstrap_client is not None:
        ray.experimental.internal_kv._initialize_internal_kv(bootstrap_client)

    executor = futures.ThreadPoolExecutor(max_workers=CLIENT_SERVER_MAX_THREADS)
    server = grpc.server(executor, options=GRPC_OPTIONS)

    proxy_manager = ProxyManager(
        address,
        session_dir=session_dir,
        redis_password=redis_password,
        runtime_env_agent_port=runtime_env_agent_port,
    )

    task_servicer = RayletServicerProxy(None, proxy_manager)
    data_servicer = DataServicerProxy(proxy_manager)
    logs_servicer = LogstreamServicerProxy(proxy_manager)

    # Register every servicer on the server before it is started.
    ray_client_pb2_grpc.add_RayletDriverServicer_to_server(task_servicer, server)
    ray_client_pb2_grpc.add_RayletDataStreamerServicer_to_server(
        data_servicer, server
    )
    ray_client_pb2_grpc.add_RayletLogStreamerServicer_to_server(
        logs_servicer, server
    )

    add_port_to_grpc_server(server, connection_str)
    server.start()

    return ClientServerHandle(
        task_servicer=task_servicer,
        data_servicer=data_servicer,
        logs_servicer=logs_servicer,
        grpc_server=server,
    )
def get_file_discovery_content(self):
    """Return the content for Prometheus service discovery.

    The result is a JSON string holding a single entry labelled
    ``job=ray`` whose targets are the metrics export address of every
    alive node, plus the autoscaler metrics address when one is stored in
    the GCS KV under ``AutoscalerMetricsAddress``.
    """
    targets = []
    for node in ray.nodes():
        if node["alive"] is True:
            targets.append(
                "{}:{}".format(
                    node["NodeManagerAddress"], node["MetricsExportPort"]
                )
            )

    kv_client = GcsClient(address=self.gcs_address)
    autoscaler_addr = kv_client.internal_kv_get(b"AutoscalerMetricsAddress", None)
    if autoscaler_addr:
        # The KV stores bytes; the target list holds text.
        targets.append(autoscaler_addr.decode("utf-8"))

    discovery = [{"labels": {"job": "ray"}, "targets": targets}]
    return json.dumps(discovery)
def __init__(self, group_name: str):
    """Record the group name and job id, and set up the internal KV.

    Args:
        group_name: Stored on ``self._group_name``.
    """
    self._group_name = group_name
    self._job_id = ray.get_runtime_context().job_id

    # Connect to the GCS of the node this process belongs to, allowing up to
    # 10 reconnect retries, and use that client for the internal KV.
    self._gcs_client = GcsClient(
        address=ray._private.worker._global_node.gcs_address,
        nums_reconnect_retry=10,
    )
    internal_kv._initialize_internal_kv(self._gcs_client)
def try_create_gcs_client(
    address: Optional[str], redis_password: Optional[str]
) -> Optional[GcsClient]:
    """Create a GCS client for the given (possibly unspecified) address.

    Args:
        address: Bootstrap address from the command line, or ``None``; it
            is normalized with ``canonicalize_bootstrap_address``.
        redis_password: Accepted but not used by this implementation.

    Returns:
        A ``GcsClient`` connected to the canonical address.
    """
    return GcsClient(address=canonicalize_bootstrap_address(address))
def list_state_cli_group(ctx):
    """Look up the dashboard API server address and store it on ``ctx``.

    The address is read from the GCS internal KV (dashboard namespace) with
    up to 20 retries and saved as ``ctx.obj["api_server_url"]``.

    Raises:
        ValueError: If the address cannot be obtained from GCS.
    """
    client = GcsClient(
        address=services.canonicalize_bootstrap_address(None),
        nums_reconnect_retry=0,
    )
    ray.experimental.internal_kv._initialize_internal_kv(client)

    raw_url = ray._private.utils.internal_kv_get_with_retry(
        client,
        ray_constants.DASHBOARD_ADDRESS,
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        num_retries=20,
    )
    if raw_url is None:
        message = (
            "Couldn't obtain the API server address from GCS. It is likely that "
            "the GCS server is down. Check gcs_server.[out | err] to see if it is "
            "still alive."
        )
        raise ValueError(message)

    assert use_gcs_for_bootstrap()

    ctx.ensure_object(dict)
    ctx.obj["api_server_url"] = f"http://{raw_url.decode()}"
def __init__(self):
    """Connect to the local node's GCS and initialize the internal KV."""
    # nums_reconnect_retry=0: the client is created without reconnect retries.
    self._gcs_client = GcsClient(
        address=ray.worker._global_node.gcs_address,
        nums_reconnect_retry=0,
    )
    internal_kv._initialize_internal_kv(self._gcs_client)
def __init__(
    self,
    address: str,
    autoscaling_config: Union[str, Callable[[], Dict[str, Any]]],
    redis_password: Optional[str] = None,
    prefix_cluster_info: bool = False,
    monitor_ip: Optional[str] = None,
    stop_event: Optional[Event] = None,
    retry_on_failure: bool = True,
):
    """Set up GCS stubs, the internal KV and the metrics server.

    Args:
        address: GCS address, used for the gRPC channel and the GCS client.
        autoscaling_config: Stored on ``self.autoscaling_config``.
        redis_password: Deprecated; a warning is logged when it is given.
        prefix_cluster_info: Stored on ``self.prefix_cluster_info``.
        monitor_ip: When set, ``<monitor_ip>:<AUTOSCALER_METRIC_PORT>`` is
            published in the GCS KV and (if ``prometheus_client`` is
            available) the metrics HTTP server is started.
        stop_event: Can be used to signal graceful exit from monitor loop.
        retry_on_failure: Stored on ``self.retry_on_failure``.
    """
    gcs_address = address
    options = (("grpc.enable_http_proxy", 0),)
    gcs_channel = ray._private.utils.init_grpc_channel(gcs_address, options)
    # TODO: Use gcs client for this
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)
    )
    self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
        gcs_channel
    )

    if redis_password is not None:
        logger.warning("redis_password has been deprecated.")

    # Set the GCS client and mode so _internal_kv works for autoscaler.
    worker = ray.worker.global_worker
    gcs_client = GcsClient(address=gcs_address)
    if monitor_ip:
        # Publish the metrics address exactly once. The original issued
        # this same overwriting put a second time right after
        # _initialize_internal_kv, which was a redundant RPC.
        monitor_addr = f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        gcs_client.internal_kv_put(
            b"AutoscalerMetricsAddress", monitor_addr.encode(), True, None
        )
    _initialize_internal_kv(gcs_client)
    worker.mode = 0

    head_node_ip = gcs_address.split(":")[0]

    self.load_metrics = LoadMetrics()
    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    # Can be used to signal graceful exit from monitor loop.
    self.stop_event = stop_event  # type: Optional[Event]
    self.retry_on_failure = retry_on_failure
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None
    # If set, we are in a manually created cluster (non-autoscaling) and
    # simply mirroring what the GCS tells us the cluster node types are.
    self.readonly_config = None

    self.prom_metrics = AutoscalerPrometheusMetrics()
    if monitor_ip and prometheus_client:
        # If monitor_ip wasn't passed in, then don't attempt to start the
        # metric server to keep behavior identical to before metrics were
        # introduced
        try:
            logger.info(
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT
                )
            )
            prometheus_client.start_http_server(
                port=AUTOSCALER_METRIC_PORT,
                # Bind to loopback only when the head node itself is on
                # loopback; otherwise bind to all interfaces.
                addr="127.0.0.1" if head_node_ip == "127.0.0.1" else "",
                registry=self.prom_metrics.registry,
            )
        except Exception:
            # Best effort: a failing metrics server must not stop the monitor.
            logger.exception(
                "An exception occurred while starting the metrics server."
            )
    elif not prometheus_client:
        logger.warning(
            "`prometheus_client` not found, so metrics will not be exported."
        )

    logger.info("Monitor: Started")
class RayInternalKVStore(KVStoreBase):
    """Wraps ray's internal_kv with a namespace to avoid collisions.

    Supports string keys and bytes values, caller must handle serialization.
    """

    def __init__(
        self,
        namespace: str = None,
    ):
        """Create the store and connect a GCS client for the current runtime.

        Args:
            namespace: Optional prefix for storage keys; ``None`` is treated
                as the empty string.

        Raises:
            TypeError: If ``namespace`` is neither ``None`` nor a ``str``.
        """
        if namespace is not None and not isinstance(namespace, str):
            # Message previously read "must a string"; fixed the grammar.
            raise TypeError(
                "namespace must be a string, got: {}.".format(type(namespace))
            )

        self.gcs_client = GcsClient(address=ray.get_runtime_context().gcs_address)
        self.timeout = RAY_SERVE_KV_TIMEOUT_S
        self.namespace = namespace or ""

    @staticmethod
    def _to_kv_store_error(err):
        """Return a KVStoreError for ``err`` if it exposes ``code()``, else None."""
        code_getter = getattr(err, "code", None)
        if callable(code_getter):
            return KVStoreError(code_getter())
        return None

    def get_storage_key(self, key: str) -> str:
        """Return ``key`` prefixed with this store's namespace."""
        return "{ns}-{key}".format(ns=self.namespace, key=key)

    def put(self, key: str, val: bytes) -> bool:
        """Put the key-value pair into the store.

        Args:
            key (str)
            val (bytes)

        Returns:
            The result of the underlying ``internal_kv_put`` call.

        Raises:
            TypeError: If ``key`` is not a ``str`` or ``val`` is not ``bytes``.
            KVStoreError: If the GCS call fails with an error carrying a code.
        """
        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))
        if not isinstance(val, bytes):
            raise TypeError("val must be bytes, got: {}.".format(type(val)))

        try:
            return self.gcs_client.internal_kv_put(
                self.get_storage_key(key).encode(),
                val,
                overwrite=True,
                namespace=ray_constants.KV_NAMESPACE_SERVE,
                timeout=self.timeout,
            )
        except Exception as e:
            kv_error = self._to_kv_store_error(e)
            if kv_error is None:
                # No status code to translate: surface the real failure
                # instead of an AttributeError from calling e.code().
                raise
            raise kv_error from e

    def get(self, key: str) -> Optional[bytes]:
        """Get the value associated with the given key from the store.

        Args:
            key (str)

        Returns:
            The bytes value. If the key wasn't found, returns None.

        Raises:
            TypeError: If ``key`` is not a ``str``.
            KVStoreError: If the GCS call fails with an error carrying a code.
        """
        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))

        try:
            return self.gcs_client.internal_kv_get(
                self.get_storage_key(key).encode(),
                namespace=ray_constants.KV_NAMESPACE_SERVE,
                timeout=self.timeout,
            )
        except Exception as e:
            kv_error = self._to_kv_store_error(e)
            if kv_error is None:
                # See put(): do not mask errors that carry no status code.
                raise
            raise kv_error from e

    def delete(self, key: str):
        """Delete the value associated with the given key from the store.

        Args:
            key (str)

        Returns:
            The result of the underlying ``internal_kv_del`` call.

        Raises:
            TypeError: If ``key`` is not a ``str``.
            KVStoreError: If the GCS call fails with an error carrying a code.
        """
        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))

        try:
            return self.gcs_client.internal_kv_del(
                self.get_storage_key(key).encode(),
                False,
                namespace=ray_constants.KV_NAMESPACE_SERVE,
                timeout=self.timeout,
            )
        except Exception as e:
            kv_error = self._to_kv_store_error(e)
            if kv_error is None:
                # See put(): do not mask errors that carry no status code.
                raise
            raise kv_error from e
async def run(self):
    """Run the dashboard head until its gRPC server terminates.

    Connects to Redis, resolves the GCS address, starts the gRPC and HTTP
    servers, publishes the dashboard addresses in Redis and then runs all
    module coroutines alongside the background tasks.
    """
    # One aioredis client is shared by all modules.
    try:
        self.aioredis_client = await dashboard_utils.get_aioredis_client(
            self.redis_address,
            self.redis_password,
            dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
            dashboard_consts.RETRY_REDIS_CONNECTION_TIMES,
        )
    except (socket.gaierror, ConnectionError):
        logger.error(
            "Dashboard head exiting: " "Failed to connect to redis at %s",
            self.redis_address,
        )
        sys.exit(-1)

    # One http session is shared by all modules.
    # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
    session_kwargs = {}
    if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
        session_kwargs["loop"] = asyncio.get_event_loop()
    self.http_session = aiohttp.ClientSession(**session_kwargs)

    # Wait until GCS is ready, then wire up everything that needs its address.
    gcs_address = await get_gcs_address_with_retry(self.aioredis_client)
    self.gcs_client = GcsClient(gcs_address)
    self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
        gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True
    )
    self.health_check_thread = GCSHealthCheckThread(gcs_address)
    self.health_check_thread.start()

    # Start a grpc asyncio server.
    await self.server.start()

    async def _async_notify():
        """Notify signals from queue."""
        while True:
            co = await dashboard_utils.NotifyQueue.get()
            try:
                await co
            except Exception:
                logger.exception(f"Error notifying coroutine {co}")

    modules = self._load_modules()

    # Http server should be initialized after all modules loaded.
    # working_dir uploads for job submission can be up to 100MiB.
    app = aiohttp.web.Application(client_max_size=100 * 1024**2)
    app.add_routes(routes=routes.bound_routes())

    runner = aiohttp.web.AppRunner(app)
    await runner.setup()

    # Try consecutive ports until one binds or the retries are exhausted.
    bind_error = None
    for _attempt in range(1 + self.http_port_retries):
        try:
            site = aiohttp.web.TCPSite(runner, self.http_host, self.http_port)
            await site.start()
        except OSError as err:
            bind_error = err
            self.http_port += 1
            logger.warning("Try to use port %s: %s", self.http_port, err)
        else:
            break
    else:
        raise Exception(
            f"Failed to find a valid port for dashboard after "
            f"{self.http_port_retries} retries: {bind_error}"
        )

    bound_host, http_port, *_ = site._server.sockets[0].getsockname()
    # An unspecified bind address (e.g. 0.0.0.0) is replaced with our own ip.
    if ipaddress.ip_address(bound_host).is_unspecified:
        http_host = self.ip
    else:
        http_host = bound_host
    logger.info("Dashboard head http address: %s:%s", http_host, http_port)

    # Write the dashboard head port to redis.
    await self.aioredis_client.set(
        ray_constants.REDIS_KEY_DASHBOARD, f"{http_host}:{http_port}"
    )
    await self.aioredis_client.set(
        dashboard_consts.REDIS_KEY_DASHBOARD_RPC, f"{self.ip}:{self.grpc_port}"
    )

    # Dump registered http routes.
    dump_routes = []
    for route in app.router.routes():
        if route.method != hdrs.METH_HEAD:
            dump_routes.append(route)
    for route in dump_routes:
        logger.info(route)
    logger.info("Registered %s routes.", len(dump_routes))

    # Freeze signal after all modules loaded.
    dashboard_utils.SignalManager.freeze()

    concurrent_tasks = [
        self._gcs_check_alive(),
        _async_notify(),
        DataOrganizer.purge(),
        DataOrganizer.organize(),
    ]
    module_tasks = [m.run(self.server) for m in modules]
    await asyncio.gather(*concurrent_tasks, *module_tasks)
    await self.server.wait_for_termination()
async def run(self):
    """Run the dashboard agent until its modules (and gRPC server) finish.

    Starts the gRPC server when one was created, connects to GCS, loads the
    modules, optionally starts the HTTP server, publishes the agent ports in
    the internal KV, registers with the raylet and then runs all module
    coroutines. On non-Windows platforms a parent-watchdog task is run too.
    """

    async def _check_parent():
        """Check if raylet is dead and fate-share if it is."""
        try:
            curr_proc = psutil.Process()
            while True:
                parent = curr_proc.parent()
                if parent is None or parent.pid == 1 or self.ppid != parent.pid:
                    logger.error("Raylet is dead, exiting.")
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS
                )
        except Exception:
            logger.error("Failed to check parent PID, exiting.")
            sys.exit(1)

    if sys.platform not in ["win32", "cygwin"]:
        check_parent_task = create_task(_check_parent())

    # Start a grpc asyncio server.
    if self.server:
        await self.server.start()

    self.gcs_client = GcsClient(address=self.gcs_address)
    modules = self._load_modules()

    # Setup http server if necessary.
    if not self.minimal:
        # If the agent is not minimal it should start the http server
        # to communicate with the dashboard in a head node.
        # Http server is not started in the minimal version because
        # it requires additional dependencies that are not
        # included in the minimal ray package.
        try:
            self.http_server = await self._configure_http_server(modules)
        except Exception:
            # TODO(SongGuyang): Catch the exception here because there is
            # port conflict issue which brought from static port. We should
            # remove this after we find better port resolution.
            logger.exception(
                "Failed to start http server. Agent will stay alive but "
                "disable the http service."
            )

    # Write the dashboard agent port to kv.
    # TODO: Use async version if performance is an issue
    # -1 should indicate that http server is not started.
    http_port = -1 if not self.http_server else self.http_server.http_port
    internal_kv._internal_kv_put(
        f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
        json.dumps([http_port, self.grpc_port]),
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # Register agent to agent manager.
    raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
        self.aiogrpc_raylet_channel
    )
    await raylet_stub.RegisterAgent(
        agent_manager_pb2.RegisterAgentRequest(
            agent_id=self.agent_id,
            agent_port=self.grpc_port,
            agent_ip_address=self.ip,
        )
    )

    tasks = [m.run(self.server) for m in modules]
    if sys.platform not in ["win32", "cygwin"]:
        tasks.append(check_parent_task)
    await asyncio.gather(*tasks)

    # self.server is treated as optional above (it is only started when
    # truthy), so guard here as well. Otherwise a missing server would raise
    # AttributeError and skip the http server cleanup below.
    if self.server:
        await self.server.wait_for_termination()

    if self.http_server:
        await self.http_server.cleanup()
async def run(self):
    """Run the dashboard head until its gRPC server terminates.

    Connects to GCS (optionally subscribing to GCS pubsub), starts the gRPC
    and HTTP servers, publishes the dashboard addresses in the GCS KV and
    then runs all module coroutines alongside the background tasks.
    """
    # One http session is shared by all modules.
    # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
    session_kwargs = {}
    if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
        session_kwargs["loop"] = asyncio.get_event_loop()
    self.http_session = aiohttp.ClientSession(**session_kwargs)

    gcs_address = await self.get_gcs_address()

    # Dashboard will handle connection failure automatically
    self.gcs_client = GcsClient(address=gcs_address, nums_reconnect_retry=0)
    internal_kv._initialize_internal_kv(self.gcs_client)
    self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
        gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True
    )

    if gcs_pubsub_enabled():
        self.gcs_error_subscriber = GcsAioErrorSubscriber(address=gcs_address)
        self.gcs_log_subscriber = GcsAioLogSubscriber(address=gcs_address)
        await self.gcs_error_subscriber.subscribe()
        await self.gcs_log_subscriber.subscribe()

    self.health_check_thread = GCSHealthCheckThread(gcs_address)
    self.health_check_thread.start()

    # Start a grpc asyncio server.
    await self.server.start()

    async def _async_notify():
        """Notify signals from queue."""
        while True:
            co = await dashboard_utils.NotifyQueue.get()
            try:
                await co
            except Exception:
                logger.exception(f"Error notifying coroutine {co}")

    modules = self._load_modules()

    # Http server should be initialized after all modules loaded.
    # working_dir uploads for job submission can be up to 100MiB.
    app = aiohttp.web.Application(client_max_size=100 * 1024**2)
    app.add_routes(routes=routes.bound_routes())

    runner = aiohttp.web.AppRunner(app)
    await runner.setup()

    # Try consecutive ports until one binds or the retries are exhausted.
    bind_error = None
    for _attempt in range(1 + self.http_port_retries):
        try:
            site = aiohttp.web.TCPSite(runner, self.http_host, self.http_port)
            await site.start()
        except OSError as err:
            bind_error = err
            self.http_port += 1
            logger.warning("Try to use port %s: %s", self.http_port, err)
        else:
            break
    else:
        raise Exception(
            f"Failed to find a valid port for dashboard after "
            f"{self.http_port_retries} retries: {bind_error}"
        )

    bound_host, http_port, *_ = site._server.sockets[0].getsockname()
    # An unspecified bind address (e.g. 0.0.0.0) is replaced with our own ip.
    if ipaddress.ip_address(bound_host).is_unspecified:
        http_host = self.ip
    else:
        http_host = bound_host
    logger.info("Dashboard head http address: %s:%s", http_host, http_port)

    # TODO: Use async version if performance is an issue
    # Write the dashboard head port to gcs kv.
    internal_kv._internal_kv_put(
        ray_constants.DASHBOARD_ADDRESS,
        f"{http_host}:{http_port}",
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )
    internal_kv._internal_kv_put(
        dashboard_consts.DASHBOARD_RPC_ADDRESS,
        f"{self.ip}:{self.grpc_port}",
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # Dump registered http routes.
    dump_routes = []
    for route in app.router.routes():
        if route.method != hdrs.METH_HEAD:
            dump_routes.append(route)
    for route in dump_routes:
        logger.info(route)
    logger.info("Registered %s routes.", len(dump_routes))

    # Freeze signal after all modules loaded.
    dashboard_utils.SignalManager.freeze()

    concurrent_tasks = [
        self._gcs_check_alive(),
        _async_notify(),
        DataOrganizer.purge(),
        DataOrganizer.organize(),
    ]
    module_tasks = [m.run(self.server) for m in modules]
    await asyncio.gather(*concurrent_tasks, *module_tasks)
    await self.server.wait_for_termination()
async def run(self):
    """Run the dashboard agent until its gRPC server terminates.

    Depending on ``use_gcs_for_bootstrap()`` the GCS address comes either
    from ``self.gcs_address`` or from Redis. The agent then loads its
    modules, optionally starts the HTTP server, publishes its ports in the
    internal KV, registers with the raylet and runs all module coroutines.
    """

    async def _check_parent():
        """Check if raylet is dead and fate-share if it is."""
        try:
            this_process = psutil.Process()
            while True:
                parent = this_process.parent()
                raylet_alive = (
                    parent is not None
                    and parent.pid != 1
                    and self.ppid == parent.pid
                )
                if not raylet_alive:
                    logger.error("Raylet is dead, exiting.")
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS
                )
        except Exception:
            logger.error("Failed to check parent PID, exiting.")
            sys.exit(1)

    if sys.platform not in ["win32", "cygwin"]:
        check_parent_task = create_task(_check_parent())

    if not use_gcs_for_bootstrap():
        # Create an aioredis client for all modules.
        try:
            self.aioredis_client = await dashboard_utils.get_aioredis_client(
                self.redis_address,
                self.redis_password,
                dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                dashboard_consts.RETRY_REDIS_CONNECTION_TIMES,
            )
        except (socket.gaierror, ConnectionRefusedError):
            logger.error(
                "Dashboard agent exiting: " "Failed to connect to redis at %s",
                self.redis_address,
            )
            sys.exit(-1)

    # Start a grpc asyncio server.
    await self.server.start()

    if use_gcs_for_bootstrap():
        self.gcs_client = GcsClient(address=self.gcs_address)
    else:
        # Redis bootstrap: the GCS address is stored (as bytes) in Redis.
        raw_gcs_address = await self.aioredis_client.get(
            dashboard_consts.GCS_SERVER_ADDRESS
        )
        self.gcs_client = GcsClient(address=raw_gcs_address.decode())

    modules = self._load_modules()

    # Setup http server if necessary.
    if not self.minimal:
        # If the agent is not minimal it should start the http server
        # to communicate with the dashboard in a head node.
        # Http server is not started in the minimal version because
        # it requires additional dependencies that are not
        # included in the minimal ray package.
        self.http_server = await self._configure_http_server(modules)

    # Write the dashboard agent ports to the internal kv.
    # TODO: Use async version if performance is an issue
    # -1 should indicate that http server is not started.
    if self.http_server:
        http_port = self.http_server.http_port
    else:
        http_port = -1
    internal_kv._internal_kv_put(
        f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
        json.dumps([http_port, self.grpc_port]),
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # Register agent to agent manager.
    raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
        self.aiogrpc_raylet_channel
    )
    register_request = agent_manager_pb2.RegisterAgentRequest(
        agent_pid=os.getpid(),
        agent_port=self.grpc_port,
        agent_ip_address=self.ip,
    )
    await raylet_stub.RegisterAgent(register_request)

    tasks = [m.run(self.server) for m in modules]
    if sys.platform not in ["win32", "cygwin"]:
        tasks.append(check_parent_task)
    await asyncio.gather(*tasks)

    await self.server.wait_for_termination()

    if self.http_server:
        await self.http_server.cleanup()
async def run(self):
    """Run the dashboard agent, including its own aiohttp server.

    Depending on ``use_gcs_for_bootstrap()`` the GCS address comes either
    from ``self.gcs_address`` or from Redis. The agent starts the gRPC
    server, loads its modules, serves their routes over HTTP with CORS
    enabled, publishes its ports in the internal KV, registers with the
    raylet and runs all module coroutines.
    """

    async def _check_parent():
        """Check if raylet is dead and fate-share if it is."""
        try:
            this_process = psutil.Process()
            while True:
                parent = this_process.parent()
                raylet_alive = (
                    parent is not None
                    and parent.pid != 1
                    and self.ppid == parent.pid
                )
                if not raylet_alive:
                    logger.error("Raylet is dead, exiting.")
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS
                )
        except Exception:
            logger.error("Failed to check parent PID, exiting.")
            sys.exit(1)

    if sys.platform not in ["win32", "cygwin"]:
        check_parent_task = create_task(_check_parent())

    if not use_gcs_for_bootstrap():
        # Create an aioredis client for all modules.
        try:
            self.aioredis_client = await dashboard_utils.get_aioredis_client(
                self.redis_address,
                self.redis_password,
                dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                dashboard_consts.RETRY_REDIS_CONNECTION_TIMES,
            )
        except (socket.gaierror, ConnectionRefusedError):
            logger.error(
                "Dashboard agent exiting: " "Failed to connect to redis at %s",
                self.redis_address,
            )
            sys.exit(-1)

    # One http session is shared by all modules.
    # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
    session_kwargs = {}
    if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
        session_kwargs["loop"] = asyncio.get_event_loop()
    self.http_session = aiohttp.ClientSession(**session_kwargs)

    # Start a grpc asyncio server.
    await self.server.start()

    if use_gcs_for_bootstrap():
        self.gcs_client = GcsClient(address=self.gcs_address)
    else:
        # Redis bootstrap: the GCS address is stored (as bytes) in Redis.
        raw_gcs_address = await self.aioredis_client.get(
            dashboard_consts.GCS_SERVER_ADDRESS
        )
        self.gcs_client = GcsClient(address=raw_gcs_address.decode())

    modules = self._load_modules()

    # Http server should be initialized after all modules loaded.
    app = aiohttp.web.Application()
    app.add_routes(routes=routes.bound_routes())

    # Enable CORS on all routes.
    cors_defaults = {
        "*": aiohttp_cors.ResourceOptions(
            allow_credentials=True,
            expose_headers="*",
            allow_methods="*",
            allow_headers=("Content-Type", "X-Header"),
        )
    }
    cors = aiohttp_cors.setup(app, defaults=cors_defaults)
    # Iterate over a snapshot of the routes while CORS is attached to each.
    for route in list(app.router.routes()):
        cors.add(route)

    runner = aiohttp.web.AppRunner(app)
    await runner.setup()

    # Bind to loopback only when the agent itself is on loopback.
    if self.ip == "127.0.0.1":
        bind_host = "127.0.0.1"
    else:
        bind_host = "0.0.0.0"
    site = aiohttp.web.TCPSite(runner, bind_host, self.listen_port)
    await site.start()

    http_host, http_port, *_ = site._server.sockets[0].getsockname()
    logger.info("Dashboard agent http address: %s:%s", http_host, http_port)

    # Dump registered http routes.
    dump_routes = []
    for route in app.router.routes():
        if route.method != hdrs.METH_HEAD:
            dump_routes.append(route)
    for route in dump_routes:
        logger.info(route)
    logger.info("Registered %s routes.", len(dump_routes))

    # Write the dashboard agent ports to the internal kv.
    # TODO: Use async version if performance is an issue
    internal_kv._internal_kv_put(
        f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
        json.dumps([http_port, self.grpc_port]),
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # Register agent to agent manager.
    raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
        self.aiogrpc_raylet_channel
    )
    register_request = agent_manager_pb2.RegisterAgentRequest(
        agent_pid=os.getpid(),
        agent_port=self.grpc_port,
        agent_ip_address=self.ip,
    )
    await raylet_stub.RegisterAgent(register_request)

    tasks = [m.run(self.server) for m in modules]
    if sys.platform not in ["win32", "cygwin"]:
        tasks.append(check_parent_task)
    await asyncio.gather(*tasks)

    await self.server.wait_for_termination()
    # Wait for finish signal.
    await runner.cleanup()
async def run(self):
    """Run the dashboard head until its gRPC server terminates.

    Connects to GCS, subscribes to the GCS error and log channels, starts
    the gRPC server (and the HTTP server unless running minimal), publishes
    the dashboard addresses in the GCS KV and then runs all module
    coroutines alongside the background tasks.
    """
    gcs_address = self.gcs_address

    # Dashboard will handle connection failure automatically
    self.gcs_client = GcsClient(address=gcs_address, nums_reconnect_retry=0)
    internal_kv._initialize_internal_kv(self.gcs_client)
    self.aiogrpc_gcs_channel = ray._private.utils.init_grpc_channel(
        gcs_address, GRPC_CHANNEL_OPTIONS, asynchronous=True
    )

    self.gcs_error_subscriber = GcsAioErrorSubscriber(address=gcs_address)
    self.gcs_log_subscriber = GcsAioLogSubscriber(address=gcs_address)
    await self.gcs_error_subscriber.subscribe()
    await self.gcs_log_subscriber.subscribe()

    self.health_check_thread = GCSHealthCheckThread(gcs_address)
    self.health_check_thread.start()

    # Start a grpc asyncio server.
    await self.server.start()

    async def _async_notify():
        """Notify signals from queue."""
        while True:
            co = await dashboard_utils.NotifyQueue.get()
            try:
                await co
            except Exception:
                logger.exception(f"Error notifying coroutine {co}")

    modules = self._load_modules()

    # Default to the configured address; when the http server is started the
    # address it actually reports takes precedence.
    http_host = self.http_host
    http_port = self.http_port
    if not self.minimal:
        self.http_server = await self._configure_http_server(modules)
        http_host, http_port = self.http_server.get_address()

    # TODO: Use async version if performance is an issue
    # Write the dashboard head addresses to gcs kv.
    internal_kv._internal_kv_put(
        ray_constants.DASHBOARD_ADDRESS,
        f"{http_host}:{http_port}",
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )
    internal_kv._internal_kv_put(
        dashboard_consts.DASHBOARD_RPC_ADDRESS,
        f"{self.ip}:{self.grpc_port}",
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # Freeze signal after all modules loaded.
    dashboard_utils.SignalManager.freeze()

    concurrent_tasks = [
        self._gcs_check_alive(),
        _async_notify(),
        DataOrganizer.purge(),
        DataOrganizer.organize(),
    ]
    module_tasks = [m.run(self.server) for m in modules]
    await asyncio.gather(*concurrent_tasks, *module_tasks)
    await self.server.wait_for_termination()

    if self.http_server:
        await self.http_server.cleanup()
def __init__(
    self,
    node_ip_address,
    dashboard_agent_port,
    gcs_address,
    minimal,
    temp_dir=None,
    session_dir=None,
    runtime_env_dir=None,
    log_dir=None,
    metrics_export_port=None,
    node_manager_port=None,
    listen_port=0,
    object_store_name=None,
    raylet_name=None,
    logging_params=None,
    disable_metrics_collection: bool = False,
):
    """Initialize the DashboardAgent object.

    Stores the configuration as public attributes, opens an async gRPC
    channel to the raylet, creates the agent's own gRPC server (disabling
    it if the port cannot be added) and sets up the GCS clients and the
    internal KV.
    """
    # Public attributes are accessible for all agent modules.
    self.ip = node_ip_address
    self.minimal = minimal

    assert gcs_address is not None
    self.gcs_address = gcs_address

    self.temp_dir = temp_dir
    self.session_dir = session_dir
    self.runtime_env_dir = runtime_env_dir
    self.log_dir = log_dir
    self.dashboard_agent_port = dashboard_agent_port
    self.metrics_export_port = metrics_export_port
    self.node_manager_port = node_manager_port
    self.listen_port = listen_port
    self.object_store_name = object_store_name
    self.raylet_name = raylet_name
    self.logging_params = logging_params
    self.node_id = os.environ["RAY_NODE_ID"]
    self.metrics_collection_disabled = disable_metrics_collection

    # TODO(edoakes): RAY_RAYLET_PID isn't properly set on Windows. This is
    # only used for fate-sharing with the raylet and we need a different
    # fate-sharing mechanism for Windows anyways.
    if sys.platform not in ["win32", "cygwin"]:
        self.ppid = int(os.environ["RAY_RAYLET_PID"])
        assert self.ppid > 0
        logger.info("Parent pid is %s", self.ppid)

    # Setup raylet channel
    self.aiogrpc_raylet_channel = ray._private.utils.init_grpc_channel(
        f"{self.ip}:{self.node_manager_port}",
        ray_constants.GLOBAL_GRPC_OPTIONS,
        asynchronous=True,
    )

    # Setup grpc server
    self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0),))
    # Bind to loopback only when the agent itself is on loopback.
    if self.ip == "127.0.0.1":
        grpc_ip = "127.0.0.1"
    else:
        grpc_ip = "0.0.0.0"
    try:
        self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
            self.server, f"{grpc_ip}:{self.dashboard_agent_port}"
        )
    except Exception:
        # TODO(SongGuyang): Catch the exception here because there is
        # port conflict issue which brought from static port. We should
        # remove this after we find better port resolution.
        logger.exception(
            "Failed to add port to grpc server. Agent will stay alive but "
            "disable the grpc service."
        )
        self.server = None
        self.grpc_port = None
    else:
        logger.info("Dashboard agent grpc address: %s:%s", grpc_ip, self.grpc_port)

    # If the agent is started as non-minimal version, http server should
    # be configured to communicate with the dashboard in a head node.
    self.http_server = None

    # Used by the agent and sub-modules.
    # TODO(architkulkarni): Remove gcs_client once the agent exclusively uses
    # gcs_aio_client and not gcs_client.
    self.gcs_client = GcsClient(address=self.gcs_address)
    _initialize_internal_kv(self.gcs_client)
    assert _internal_kv_initialized()
    self.gcs_aio_client = GcsAioClient(address=self.gcs_address)
    self.publisher = GcsAioPublisher(address=self.gcs_address)
def __init__(
    self,
    redis_address,
    autoscaling_config,
    redis_password=None,
    prefix_cluster_info=False,
    monitor_ip=None,
    stop_event: Optional[Event] = None,
):
    """Set up Redis, the GCS stubs, the internal KV and the metrics server.

    Args:
        redis_address: ``"<ip>:<port>"`` of the Redis server.
        autoscaling_config: Stored on ``self.autoscaling_config``.
        redis_password: Password used for every Redis connection.
        prefix_cluster_info: Stored on ``self.prefix_cluster_info``.
        monitor_ip: When set, ``<monitor_ip>:<AUTOSCALER_METRIC_PORT>`` is
            published in Redis and (if ``prometheus_client`` is available)
            the metrics HTTP server is started.
        stop_event: Can be used to signal graceful exit from monitor loop.
    """
    # Initialize the Redis clients.
    ray.state.state._initialize_global_state(
        redis_address, redis_password=redis_password
    )
    self.redis = ray._private.services.create_redis_client(
        redis_address, password=redis_password
    )
    if monitor_ip:
        self.redis.set(
            "AutoscalerMetricsAddress", f"{monitor_ip}:{AUTOSCALER_METRIC_PORT}"
        )

    # Splitting into exactly two parts also rejects malformed addresses.
    head_node_ip, _redis_port = redis_address.split(":")

    # Initialize the gcs stub for getting all node resource usage.
    gcs_address = self.redis.get("GcsServerAddress").decode("utf-8")
    gcs_channel = ray._private.utils.init_grpc_channel(
        gcs_address, (("grpc.enable_http_proxy", 0),)
    )
    # TODO: Use gcs client for this
    self.gcs_node_resources_stub = (
        gcs_service_pb2_grpc.NodeResourceInfoGcsServiceStub(gcs_channel)
    )
    self.gcs_node_info_stub = gcs_service_pb2_grpc.NodeInfoGcsServiceStub(
        gcs_channel
    )

    # Set the redis client and mode so _internal_kv works for autoscaler.
    worker = ray.worker.global_worker
    worker.redis_client = self.redis
    _initialize_internal_kv(GcsClient.create_from_redis(self.redis))
    worker.mode = 0

    self.redis_address = redis_address
    self.redis_password = redis_password

    if os.environ.get("RAY_FAKE_CLUSTER"):
        local_ip = FAKE_HEAD_NODE_ID
    else:
        local_ip = head_node_ip
    self.load_metrics = LoadMetrics(local_ip=local_ip)

    self.last_avail_resources = None
    self.event_summarizer = EventSummarizer()
    self.prefix_cluster_info = prefix_cluster_info
    # Can be used to signal graceful exit from monitor loop.
    self.stop_event = stop_event  # type: Optional[Event]
    self.autoscaling_config = autoscaling_config
    self.autoscaler = None
    # If set, we are in a manually created cluster (non-autoscaling) and
    # simply mirroring what the GCS tells us the cluster node types are.
    self.readonly_config = None

    self.prom_metrics = AutoscalerPrometheusMetrics()
    if monitor_ip and prometheus_client:
        # If monitor_ip wasn't passed in, then don't attempt to start the
        # metric server to keep behavior identical to before metrics were
        # introduced
        try:
            logger.info(
                "Starting autoscaler metrics server on port {}".format(
                    AUTOSCALER_METRIC_PORT
                )
            )
            # Bind to loopback only when the head node itself is on loopback.
            if head_node_ip == "127.0.0.1":
                metrics_addr = "127.0.0.1"
            else:
                metrics_addr = ""
            prometheus_client.start_http_server(
                port=AUTOSCALER_METRIC_PORT,
                addr=metrics_addr,
                registry=self.prom_metrics.registry,
            )
        except Exception:
            logger.exception(
                "An exception occurred while starting the metrics server."
            )
    elif not prometheus_client:
        logger.warning(
            "`prometheus_client` not found, so metrics will " "not be exported."
        )

    logger.info("Monitor: Started")
def test_external_storage_namespace_isolation(shutdown_only):
    """KV data written under one external storage namespace is invisible
    under another, and is still there when the first namespace is reused."""

    def _start_cluster(storage_namespace):
        # Start a fresh cluster with the given external storage namespace and
        # return a GCS client connected to it.
        context = ray.init(
            namespace="a",
            _system_config={"external_storage_namespace": storage_namespace},
        )
        return GcsClient(address=context.address_info["address"])

    # Namespace c1: write a value and read it back.
    gcs_client = _start_cluster("c1")
    assert gcs_client.internal_kv_put(b"ABC", b"DEF", True, None) == 1
    assert gcs_client.internal_kv_get(b"ABC", None) == b"DEF"
    ray.shutdown()

    # Namespace c2: the key from c1 is not visible; write a different value.
    gcs_client = _start_cluster("c2")
    assert gcs_client.internal_kv_get(b"ABC", None) is None
    assert gcs_client.internal_kv_put(b"ABC", b"XYZ", True, None) == 1
    assert gcs_client.internal_kv_get(b"ABC", None) == b"XYZ"
    ray.shutdown()

    # Back to c1: the original value is still there, untouched by c2.
    gcs_client = _start_cluster("c1")
    assert gcs_client.internal_kv_get(b"ABC", None) == b"DEF"