def _handle_failure(self, error):
    """Report a failure of the monitor loop and optionally tear down workers.

    The failure is logged via ``logger.exception``. When an autoscaler is
    attached and the ``RAY_AUTOSCALER_FATESHARE_WORKERS`` environment
    variable equals ``"1"``, the autoscaler's workers are killed and
    ``destroy_autoscaler_workers`` is invoked. Afterwards the error message
    is stored in the internal KV store (if it is initialized) and published
    to drivers through a ``GcsPublisher``.

    Args:
        error: The error to report; it is interpolated into the message
            that is stored and published.

    Note:
        ``args.gcs_address`` is read from a name defined outside this
        function; it must be in scope when this method runs.
    """
    logger.exception("Error in monitor loop")

    if self.autoscaler is not None:
        fateshare_flag = os.environ.get(
            "RAY_AUTOSCALER_FATESHARE_WORKERS", "")
        if fateshare_flag == "1":
            self.autoscaler.kill_workers()
            # Take down autoscaler workers if necessary.
            self.destroy_autoscaler_workers()

    # Something went wrong, so push an error to all current and future
    # drivers.
    failure_message = (
        f"The autoscaler failed with the following error:\n{error}")
    if _internal_kv_initialized():
        _internal_kv_put(
            ray_constants.DEBUG_AUTOSCALING_ERROR,
            failure_message,
            overwrite=True,
        )

    publisher = GcsPublisher(address=args.gcs_address)
    # Imported at call time, after the publisher has been constructed.
    from ray._private.utils import publish_error_to_driver

    publish_error_to_driver(
        ray_constants.MONITOR_DIED_ERROR,
        failure_message,
        gcs_publisher=publisher,
    )
args.host, args.port, args.port_retries, args.gcs_address, log_dir=args.log_dir, temp_dir=args.temp_dir, session_dir=args.session_dir, minimal=args.minimal, ) loop = asyncio.get_event_loop() loop.run_until_complete(dashboard.run()) except Exception as e: traceback_str = ray._private.utils.format_error_message( traceback.format_exc()) message = (f"The dashboard on node {platform.uname()[1]} " f"failed with the following " f"error:\n{traceback_str}") if isinstance(e, dashboard_utils.FrontendNotFoundError): logger.warning(message) else: logger.error(message) raise e # Something went wrong, so push an error to all drivers. gcs_publisher = GcsPublisher(address=args.gcs_address) ray._private.utils.publish_error_to_driver( ray_constants.DASHBOARD_DIED_ERROR, message, gcs_publisher=gcs_publisher, )
except Exception as e: # All these env vars should be available because # they are provided by the parent raylet. restart_count = os.environ["RESTART_COUNT"] max_restart_count = os.environ["MAX_RESTART_COUNT"] raylet_pid = os.environ["RAY_RAYLET_PID"] node_ip = args.node_ip_address if restart_count >= max_restart_count: # Agent is failed to be started many times. # Push an error to all drivers, so that users can know the # impact of the issue. redis_client = None gcs_publisher = None if gcs_pubsub_enabled(): if use_gcs_for_bootstrap(): gcs_publisher = GcsPublisher(args.gcs_address) else: redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) gcs_publisher = GcsPublisher( address=get_gcs_address_from_redis(redis_client)) else: redis_client = ray._private.services.create_redis_client( args.redis_address, password=args.redis_password) traceback_str = ray._private.utils.format_error_message( traceback.format_exc()) message = ( f"(ip={node_ip}) " f"The agent on node {platform.uname()[1]} failed to " f"be restarted {max_restart_count} "
def test_subscribe_two_channels(ray_start_regular):
    """Tests concurrently subscribing to two channels work.

    Two background threads subscribe to the GCS error channel and the GCS
    log channel respectively and poll until ``num_messages`` items have been
    collected. The main thread publishes ``num_messages`` errors and log
    batches, then checks that both threads finished and that every message
    arrived in publication order.

    Args:
        ray_start_regular: Fixture value; indexed with ``"redis_address"``
            to locate the cluster's Redis instance.
    """
    address_info = ray_start_regular
    redis_client = ray._private.services.create_redis_client(
        address_info["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD,
    )
    gcs_server_addr = gcs_utils.get_gcs_address_from_redis(redis_client)

    num_messages = 100

    errors = []
    errors_subscribed = threading.Event()

    def receive_errors():
        subscriber = GcsErrorSubscriber(address=gcs_server_addr)
        subscriber.subscribe()
        # Tell the main thread that publishing may start.
        errors_subscribed.set()
        while len(errors) < num_messages:
            _, msg = subscriber.poll()
            errors.append(msg)

    logs = []
    logs_subscribed = threading.Event()

    def receive_logs():
        subscriber = GcsLogSubscriber(address=gcs_server_addr)
        subscriber.subscribe()
        # Tell the main thread that publishing may start.
        logs_subscribed.set()
        while len(logs) < num_messages:
            log_batch = subscriber.poll()
            logs.append(log_batch)

    # Daemon threads: if a message never arrives, the pollers stay blocked
    # in their loops; they must not keep the interpreter alive after the
    # assertions below have already reported the failure.
    t1 = threading.Thread(target=receive_errors, daemon=True)
    t1.start()

    t2 = threading.Thread(target=receive_logs, daemon=True)
    t2.start()

    # The two subscriptions still happen concurrently in their own threads,
    # but publishing waits until both subscribe() calls have returned.
    # Publishing earlier could drop the first messages and make the test
    # flaky.
    assert errors_subscribed.wait(timeout=10), \
        "error subscriber did not subscribe in time"
    assert logs_subscribed.wait(timeout=10), \
        "log subscriber did not subscribe in time"

    publisher = GcsPublisher(address=gcs_server_addr)
    for i in range(num_messages):
        publisher.publish_error(
            b"msg_id", ErrorTableData(error_message=f"error {i}"))
        publisher.publish_logs({
            "ip": "127.0.0.1",
            "pid": "gcs",
            "job": "0001",
            "is_err": False,
            "lines": [f"line {i}"],
            "actor_name": "test actor",
            "task_name": "test task",
        })

    t1.join(timeout=10)
    assert not t1.is_alive(), len(errors)
    assert len(errors) == num_messages, len(errors)

    t2.join(timeout=10)
    assert not t2.is_alive(), len(logs)
    assert len(logs) == num_messages, len(logs)

    # Messages must arrive complete and in publication order.
    for i in range(num_messages):
        assert errors[i].error_message == f"error {i}"
        assert logs[i]["lines"][0] == f"line {i}"