async def run(self):
    """Bring the dashboard agent up and serve until the gRPC server stops.

    In order: schedule the parent-process watchdog (skipped on win32 and
    cygwin), connect to redis unless bootstrapping through GCS, create the
    shared aiohttp client session, start the gRPC server, build the GCS
    client, load the agent modules, start the CORS-enabled HTTP server,
    store the agent's HTTP/gRPC ports in the internal KV, register with the
    raylet's agent manager and finally run every module.

    Calls ``sys.exit(-1)`` when the redis connection cannot be established.
    """
    watch_parent = sys.platform not in ["win32", "cygwin"]

    async def _check_parent():
        """Poll the parent process; exit this process once it is gone."""
        try:
            this_process = psutil.Process()
            while True:
                raylet = this_process.parent()
                still_our_parent = (
                    raylet is not None
                    and raylet.pid != 1
                    and self.ppid == raylet.pid
                )
                if not still_our_parent:
                    logger.error("Raylet is dead, exiting.")
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.
                    DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
        except Exception:
            logger.error("Failed to check parent PID, exiting.")
            sys.exit(1)

    if watch_parent:
        check_parent_task = create_task(_check_parent())

    if not use_gcs_for_bootstrap():
        # One aioredis client is shared by all modules.
        try:
            self.aioredis_client = await dashboard_utils.get_aioredis_client(
                self.redis_address,
                self.redis_password,
                dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                dashboard_consts.RETRY_REDIS_CONNECTION_TIMES,
            )
        except (socket.gaierror, ConnectionRefusedError):
            logger.error(
                "Dashboard agent exiting: "
                "Failed to connect to redis at %s", self.redis_address)
            sys.exit(-1)

    # One http session is shared by all modules.
    # aiohttp<4.0.0 takes a 'loop' argument, aiohttp>=4.0.0 no longer does.
    session_kwargs = {}
    if LooseVersion(aiohttp.__version__) < LooseVersion("4.0.0"):
        session_kwargs["loop"] = asyncio.get_event_loop()
    self.http_session = aiohttp.ClientSession(**session_kwargs)

    # Start the grpc asyncio server.
    await self.server.start()

    # Resolve the GCS address (from redis, or from our own attribute when
    # bootstrapping through GCS) and build the client from it.
    if use_gcs_for_bootstrap():
        resolved_gcs_address = self.gcs_address
    else:
        gcs_address = await self.aioredis_client.get(
            dashboard_consts.GCS_SERVER_ADDRESS)
        resolved_gcs_address = gcs_address.decode()
    self.gcs_client = GcsClient(address=resolved_gcs_address)

    modules = self._load_modules()

    # The http server is built only after all modules are loaded.
    app = aiohttp.web.Application()
    app.add_routes(routes=routes.bound_routes())

    # Enable CORS on all routes.
    cors_defaults = {
        "*": aiohttp_cors.ResourceOptions(
            allow_credentials=True,
            expose_headers="*",
            allow_methods="*",
            allow_headers=("Content-Type", "X-Header"),
        )
    }
    cors = aiohttp_cors.setup(app, defaults=cors_defaults)
    for route in list(app.router.routes()):
        cors.add(route)

    runner = aiohttp.web.AppRunner(app)
    await runner.setup()
    if self.ip == "127.0.0.1":
        bind_host = "127.0.0.1"
    else:
        bind_host = "0.0.0.0"
    site = aiohttp.web.TCPSite(runner, bind_host, self.listen_port)
    await site.start()
    http_host, http_port, *_ = site._server.sockets[0].getsockname()
    logger.info("Dashboard agent http address: %s:%s", http_host, http_port)

    # Dump registered http routes (HEAD routes are left out).
    dump_routes = []
    for registered in app.router.routes():
        if registered.method != hdrs.METH_HEAD:
            dump_routes.append(registered)
    for registered in dump_routes:
        logger.info(registered)
    logger.info("Registered %s routes.", len(dump_routes))

    # Store the agent's [http_port, grpc_port] pair in the internal KV.
    # TODO: Use async version if performance is an issue
    port_key = f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}"
    internal_kv._internal_kv_put(
        port_key,
        json.dumps([http_port, self.grpc_port]),
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD)

    # Register agent to agent manager.
    raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
        self.aiogrpc_raylet_channel)
    registration = agent_manager_pb2.RegisterAgentRequest(
        agent_pid=os.getpid(),
        agent_port=self.grpc_port,
        agent_ip_address=self.ip)
    await raylet_stub.RegisterAgent(registration)

    tasks = [module.run(self.server) for module in modules]
    if watch_parent:
        tasks.append(check_parent_task)
    await asyncio.gather(*tasks)

    await self.server.wait_for_termination()
    # Wait for finish signal.
    await runner.cleanup()
async def run(self):
    """Start the dashboard agent and serve until the gRPC server stops.

    In order: schedule the parent-process watchdog, connect to redis,
    create the shared aiohttp client session, start the gRPC server, load
    the agent modules, start the CORS-enabled HTTP server on ``self.ip``
    with port 0 (the actual port is read back from the listening socket),
    write the agent's HTTP/gRPC ports to redis, register with the raylet's
    agent manager and run every module.

    Calls ``sys.exit(-1)`` when the redis connection cannot be established.
    """

    async def _check_parent():
        """Check if raylet is dead."""
        curr_proc = psutil.Process()
        while True:
            parent = curr_proc.parent()
            if parent is None or parent.pid == 1:
                logger.error("raylet is dead, agent will die because "
                             "it fate-shares with raylet.")
                sys.exit(0)
            await asyncio.sleep(
                dashboard_consts.
                DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)

    check_parent_task = create_task(_check_parent())

    # Create an aioredis client for all modules.
    try:
        self.aioredis_client = await dashboard_utils.get_aioredis_client(
            self.redis_address, self.redis_password,
            dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
            dashboard_consts.RETRY_REDIS_CONNECTION_TIMES)
    except (socket.gaierror, ConnectionRefusedError):
        logger.error(
            "Dashboard agent exiting: "
            "Failed to connect to redis at %s", self.redis_address)
        sys.exit(-1)

    # Create a http session for all modules.
    self.http_session = aiohttp.ClientSession(
        loop=asyncio.get_event_loop())

    # Start a grpc asyncio server.
    await self.server.start()

    modules = self._load_modules()

    # Http server should be initialized after all modules loaded.
    app = aiohttp.web.Application()
    app.add_routes(routes=routes.bound_routes())

    # Enable CORS on all routes.
    cors = aiohttp_cors.setup(
        app,
        defaults={
            "*": aiohttp_cors.ResourceOptions(
                allow_credentials=True,
                expose_headers="*",
                allow_methods="*",
                allow_headers=("Content-Type", "X-Header"),
            )
        })
    # Iterate over a snapshot of the routes while CORS is attached to each.
    for route in list(app.router.routes()):
        cors.add(route)

    runner = aiohttp.web.AppRunner(app)
    await runner.setup()
    site = aiohttp.web.TCPSite(runner, self.ip, 0)
    await site.start()
    # getsockname() yields (host, port) for IPv4 but
    # (host, port, flowinfo, scope_id) for IPv6; ignore any extra fields
    # so that an IPv6 bind address does not raise ValueError here.
    http_host, http_port, *_ = site._server.sockets[0].getsockname()
    logger.info("Dashboard agent http address: %s:%s", http_host, http_port)

    # Dump registered http routes.
    dump_routes = [
        r for r in app.router.routes() if r.method != hdrs.METH_HEAD
    ]
    for r in dump_routes:
        logger.info(r)
    logger.info("Registered %s routes.", len(dump_routes))

    # Write the dashboard agent port to redis.
    await self.aioredis_client.set(
        f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
        json.dumps([http_port, self.grpc_port]))

    # Register agent to agent manager.
    raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
        self.aiogrpc_raylet_channel)
    await raylet_stub.RegisterAgent(
        agent_manager_pb2.RegisterAgentRequest(
            agent_pid=os.getpid(),
            agent_port=self.grpc_port,
            agent_ip_address=self.ip))

    await asyncio.gather(check_parent_task,
                         *(m.run(self.server) for m in modules))
    await self.server.wait_for_termination()
    # Wait for finish signal.
    await runner.cleanup()
async def run(self):
    """Start the dashboard agent and serve until the gRPC server stops.

    In order: schedule the parent-process watchdog (skipped on win32 and
    cygwin), start the gRPC server if there is one, build the GCS client,
    load the agent modules, start the HTTP server unless ``self.minimal``
    is set, store the agent's HTTP/gRPC ports in the internal KV, register
    with the raylet's agent manager and run every module.

    A failure to start the HTTP server is logged and tolerated; the stored
    HTTP port is then -1.
    """

    async def _check_parent():
        """Check if raylet is dead and fate-share if it is."""
        try:
            curr_proc = psutil.Process()
            while True:
                parent = curr_proc.parent()
                if (parent is None or parent.pid == 1
                        or self.ppid != parent.pid):
                    logger.error("Raylet is dead, exiting.")
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.
                    DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS
                )
        except Exception:
            # Keep the traceback: without it the reason for the agent's
            # exit cannot be diagnosed from the log.
            logger.exception("Failed to check parent PID, exiting.")
            sys.exit(1)

    if sys.platform not in ["win32", "cygwin"]:
        check_parent_task = create_task(_check_parent())

    # Start a grpc asyncio server.
    if self.server:
        await self.server.start()

    self.gcs_client = GcsClient(address=self.gcs_address)
    modules = self._load_modules()

    # Setup http server if necessary.
    if not self.minimal:
        # If the agent is not minimal it should start the http server
        # to communicate with the dashboard in a head node.
        # Http server is not started in the minimal version because
        # it requires additional dependencies that are not
        # included in the minimal ray package.
        try:
            self.http_server = await self._configure_http_server(modules)
        except Exception:
            # TODO(SongGuyang): Catch the exception here because there is
            # port conflict issue which brought from static port. We should
            # remove this after we find better port resolution.
            logger.exception(
                "Failed to start http server. Agent will stay alive but "
                "disable the http service."
            )

    # Write the dashboard agent port to kv.
    # TODO: Use async version if performance is an issue
    # -1 should indicate that http server is not started.
    http_port = -1 if not self.http_server else self.http_server.http_port
    internal_kv._internal_kv_put(
        f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
        json.dumps([http_port, self.grpc_port]),
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # Register agent to agent manager.
    raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
        self.aiogrpc_raylet_channel
    )
    await raylet_stub.RegisterAgent(
        agent_manager_pb2.RegisterAgentRequest(
            agent_id=self.agent_id,
            agent_port=self.grpc_port,
            agent_ip_address=self.ip,
        )
    )

    tasks = [m.run(self.server) for m in modules]
    if sys.platform not in ["win32", "cygwin"]:
        tasks.append(check_parent_task)
    await asyncio.gather(*tasks)

    # self.server is optional (its start above is guarded the same way);
    # only wait on it when it exists instead of failing on None.
    if self.server:
        await self.server.wait_for_termination()

    if self.http_server:
        await self.http_server.cleanup()
async def run(self):
    """Start the dashboard agent and serve until the gRPC server stops.

    In order: schedule the parent-process watchdog (skipped on win32 and
    cygwin), start the gRPC server if there is one, load the agent modules,
    start the HTTP server unless ``self.minimal`` is set, store the agent's
    HTTP/gRPC ports in the internal KV, register with the raylet's agent
    manager and run every module.

    When the watchdog finds the parent gone it inspects the tail of
    ``raylet.out`` in ``self.log_dir``; unless a SIGTERM message is found
    there, the termination is published to the driver as an error before
    the agent exits.
    """

    def _describe_raylet_termination():
        """Build the termination message from the tail of the raylet log.

        Returns:
            A ``(msg, is_error)`` tuple. ``is_error`` is False only when
            the log tail contains the raylet's SIGTERM message.
        """
        log_path = os.path.join(self.log_dir, "raylet.out")
        msg = f"Raylet is terminated: ip={self.ip}, id={self.node_id}. "
        try:
            # Read in binary mode: seeking to a computed offset is only
            # well defined for binary files, and the offset may fall in
            # the middle of a multibyte character.
            with open(log_path, "rb") as f:
                # Seek to _RAYLET_LOG_MAX_TAIL_SIZE from the end if the
                # file is larger than that.
                f.seek(0, io.SEEK_END)
                pos = max(0, f.tell() - _RAYLET_LOG_MAX_TAIL_SIZE)
                f.seek(pos, io.SEEK_SET)
                tail = f.read().decode("utf-8", errors="replace")
            # Split the remaining logs into lines, keeping line endings.
            raylet_logs = tail.splitlines(keepends=True)
        except Exception as e:
            logger.exception("Failed to read Raylet logs at %s", log_path)
            msg += f"Failed to read Raylet logs at {log_path}: {e}!"
            return msg, True

        # Assume the SIGTERM message must exist within the last
        # _RAYLET_LOG_MAX_TAIL_SIZE of the log file.
        if any("Raylet received SIGTERM" in line for line in raylet_logs):
            return msg + "Termination is graceful.", False

        msg += (
            "Termination is unexpected. Possible reasons "
            "include: (1) SIGKILL by the user or system "
            "OOM killer, (2) Invalid memory access from "
            "Raylet causing SIGSEGV or SIGBUS, "
            "(3) Other termination signals. "
            f"Last {_RAYLET_LOG_MAX_PUBLISH_LINES} lines "
            "of the Raylet logs:\n")
        msg += " " + " ".join(
            raylet_logs[-_RAYLET_LOG_MAX_PUBLISH_LINES:])
        return msg, True

    async def _check_parent():
        """Check if raylet is dead and fate-share if it is."""
        try:
            curr_proc = psutil.Process()
            while True:
                parent = curr_proc.parent()
                if (parent is None or parent.pid == 1
                        or self.ppid != parent.pid):
                    msg, is_error = _describe_raylet_termination()
                    if is_error:
                        logger.error(msg)
                        # TODO: switch to async if necessary.
                        ray._private.utils.publish_error_to_driver(
                            ray_constants.RAYLET_DIED_ERROR,
                            msg,
                            gcs_publisher=GcsPublisher(
                                address=self.gcs_address),
                        )
                    else:
                        # Logged exactly once for the graceful case.
                        logger.info(msg)
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.
                    DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
        except Exception:
            # Keep the traceback so an unexpected watchdog failure can be
            # diagnosed from the log.
            logger.exception("Failed to check parent PID, exiting.")
            sys.exit(1)

    if sys.platform not in ["win32", "cygwin"]:
        check_parent_task = create_task(_check_parent())

    # Start a grpc asyncio server.
    if self.server:
        await self.server.start()

    modules = self._load_modules()

    # Setup http server if necessary.
    if not self.minimal:
        # If the agent is not minimal it should start the http server
        # to communicate with the dashboard in a head node.
        # Http server is not started in the minimal version because
        # it requires additional dependencies that are not
        # included in the minimal ray package.
        try:
            self.http_server = await self._configure_http_server(modules)
        except Exception:
            # TODO(SongGuyang): Catch the exception here because there is
            # port conflict issue which brought from static port. We should
            # remove this after we find better port resolution.
            logger.exception(
                "Failed to start http server. Agent will stay alive but "
                "disable the http service.")

    # Write the dashboard agent port to kv.
    # TODO: Use async version if performance is an issue
    # -1 should indicate that http server is not started.
    http_port = -1 if not self.http_server else self.http_server.http_port
    internal_kv._internal_kv_put(
        f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}",
        json.dumps([http_port, self.grpc_port]),
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # Register agent to agent manager.
    raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
        self.aiogrpc_raylet_channel)
    await raylet_stub.RegisterAgent(
        agent_manager_pb2.RegisterAgentRequest(
            agent_pid=os.getpid(),
            agent_port=self.grpc_port,
            agent_ip_address=self.ip,
        ))

    tasks = [m.run(self.server) for m in modules]
    if sys.platform not in ["win32", "cygwin"]:
        tasks.append(check_parent_task)
    await asyncio.gather(*tasks)

    # self.server is optional (its start above is guarded the same way);
    # only wait on it when it exists instead of failing on None.
    if self.server:
        await self.server.wait_for_termination()

    if self.http_server:
        await self.http_server.cleanup()
async def run(self):
    """Bring the dashboard agent up and serve until the gRPC server stops.

    In order: schedule the parent-process watchdog (skipped on win32 and
    cygwin), connect to redis unless bootstrapping through GCS, start the
    gRPC server, build the GCS client, load the agent modules, start the
    HTTP server unless ``self.minimal`` is set, store the agent's HTTP/gRPC
    ports in the internal KV, register with the raylet's agent manager and
    finally run every module.

    Calls ``sys.exit(-1)`` when the redis connection cannot be established.
    """
    watch_parent = sys.platform not in ["win32", "cygwin"]

    async def _check_parent():
        """Poll the parent process; exit this process once it is gone."""
        try:
            this_process = psutil.Process()
            while True:
                raylet = this_process.parent()
                still_our_parent = (
                    raylet is not None
                    and raylet.pid != 1
                    and self.ppid == raylet.pid
                )
                if not still_our_parent:
                    logger.error("Raylet is dead, exiting.")
                    sys.exit(0)
                await asyncio.sleep(
                    dashboard_consts.
                    DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_SECONDS)
        except Exception:
            logger.error("Failed to check parent PID, exiting.")
            sys.exit(1)

    if watch_parent:
        check_parent_task = create_task(_check_parent())

    if not use_gcs_for_bootstrap():
        # One aioredis client is shared by all modules.
        try:
            self.aioredis_client = await dashboard_utils.get_aioredis_client(
                self.redis_address,
                self.redis_password,
                dashboard_consts.CONNECT_REDIS_INTERNAL_SECONDS,
                dashboard_consts.RETRY_REDIS_CONNECTION_TIMES,
            )
        except (socket.gaierror, ConnectionRefusedError):
            logger.error(
                "Dashboard agent exiting: "
                "Failed to connect to redis at %s",
                self.redis_address,
            )
            sys.exit(-1)

    # Start the grpc asyncio server.
    await self.server.start()

    # Resolve the GCS address (from redis, or from our own attribute when
    # bootstrapping through GCS) and build the client from it.
    if use_gcs_for_bootstrap():
        resolved_gcs_address = self.gcs_address
    else:
        gcs_address = await self.aioredis_client.get(
            dashboard_consts.GCS_SERVER_ADDRESS)
        resolved_gcs_address = gcs_address.decode()
    self.gcs_client = GcsClient(address=resolved_gcs_address)

    modules = self._load_modules()

    # Setup http server if necessary.
    # A non-minimal agent starts the http server to communicate with the
    # dashboard in a head node. The minimal version skips it because it
    # requires additional dependencies that are not included in the
    # minimal ray package.
    if not self.minimal:
        self.http_server = await self._configure_http_server(modules)

    # Store the agent's [http_port, grpc_port] pair in the internal KV.
    # TODO: Use async version if performance is an issue
    # -1 should indicate that http server is not started.
    if self.http_server:
        http_port = self.http_server.http_port
    else:
        http_port = -1
    port_key = f"{dashboard_consts.DASHBOARD_AGENT_PORT_PREFIX}{self.node_id}"
    internal_kv._internal_kv_put(
        port_key,
        json.dumps([http_port, self.grpc_port]),
        namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
    )

    # Register agent to agent manager.
    raylet_stub = agent_manager_pb2_grpc.AgentManagerServiceStub(
        self.aiogrpc_raylet_channel)
    registration = agent_manager_pb2.RegisterAgentRequest(
        agent_pid=os.getpid(),
        agent_port=self.grpc_port,
        agent_ip_address=self.ip,
    )
    await raylet_stub.RegisterAgent(registration)

    tasks = [module.run(self.server) for module in modules]
    if watch_parent:
        tasks.append(check_parent_task)
    await asyncio.gather(*tasks)

    await self.server.wait_for_termination()

    if self.http_server:
        await self.http_server.cleanup()