예제 #1
0
    def _handle_failure(self, error):
        """Report a monitor-loop failure and optionally tear down workers.

        Logs the active exception, kills the autoscaler's workers when an
        autoscaler exists and ``RAY_AUTOSCALER_FATESHARE_WORKERS`` is ``"1"``,
        records the failure message in the internal KV store (when it is
        initialized) and publishes the same message through a GCS publisher.

        Args:
            error: The error embedded in the message that is published.
        """
        logger.exception("Error in monitor loop")

        if self.autoscaler is not None:
            fateshare = os.environ.get("RAY_AUTOSCALER_FATESHARE_WORKERS", "")
            if fateshare == "1":
                self.autoscaler.kill_workers()
                # Take down autoscaler workers if necessary.
                self.destroy_autoscaler_workers()

        # Something went wrong: surface the failure to every current and
        # future driver.
        failure_message = (
            f"The autoscaler failed with the following error:\n{error}")

        if _internal_kv_initialized():
            _internal_kv_put(
                ray_constants.DEBUG_AUTOSCALING_ERROR,
                failure_message,
                overwrite=True,
            )

        # NOTE: `args` is not defined in this method; it is resolved from an
        # outer scope when the method runs.
        publisher = GcsPublisher(address=args.gcs_address)
        from ray._private.utils import publish_error_to_driver

        publish_error_to_driver(
            ray_constants.MONITOR_DIED_ERROR,
            failure_message,
            gcs_publisher=publisher,
        )
예제 #2
0
            args.host,
            args.port,
            args.port_retries,
            args.gcs_address,
            log_dir=args.log_dir,
            temp_dir=args.temp_dir,
            session_dir=args.session_dir,
            minimal=args.minimal,
        )
        loop = asyncio.get_event_loop()
        loop.run_until_complete(dashboard.run())
    except Exception as e:
        traceback_str = ray._private.utils.format_error_message(
            traceback.format_exc())
        message = (f"The dashboard on node {platform.uname()[1]} "
                   f"failed with the following "
                   f"error:\n{traceback_str}")
        if isinstance(e, dashboard_utils.FrontendNotFoundError):
            logger.warning(message)
        else:
            logger.error(message)
            raise e

        # Something went wrong, so push an error to all drivers.
        gcs_publisher = GcsPublisher(address=args.gcs_address)
        ray._private.utils.publish_error_to_driver(
            ray_constants.DASHBOARD_DIED_ERROR,
            message,
            gcs_publisher=gcs_publisher,
        )
예제 #3
0
    except Exception as e:
        # All these env vars should be available because
        # they are provided by the parent raylet.
        restart_count = os.environ["RESTART_COUNT"]
        max_restart_count = os.environ["MAX_RESTART_COUNT"]
        raylet_pid = os.environ["RAY_RAYLET_PID"]
        node_ip = args.node_ip_address
        if restart_count >= max_restart_count:
            # Agent is failed to be started many times.
            # Push an error to all drivers, so that users can know the
            # impact of the issue.
            redis_client = None
            gcs_publisher = None
            if gcs_pubsub_enabled():
                if use_gcs_for_bootstrap():
                    gcs_publisher = GcsPublisher(args.gcs_address)
                else:
                    redis_client = ray._private.services.create_redis_client(
                        args.redis_address, password=args.redis_password)
                    gcs_publisher = GcsPublisher(
                        address=get_gcs_address_from_redis(redis_client))
            else:
                redis_client = ray._private.services.create_redis_client(
                    args.redis_address, password=args.redis_password)

            traceback_str = ray._private.utils.format_error_message(
                traceback.format_exc())
            message = (
                f"(ip={node_ip}) "
                f"The agent on node {platform.uname()[1]} failed to "
                f"be restarted {max_restart_count} "
예제 #4
0
def test_subscribe_two_channels(ray_start_regular):
    """Tests concurrently subscribing to two channels work.

    One background thread subscribes to the error channel and another to the
    log channel while the main thread publishes ``num_messages`` entries to
    each. Both threads must finish within the join timeout, and each must
    have collected every message in publish order.
    """

    address_info = ray_start_regular
    redis = ray._private.services.create_redis_client(
        address_info["redis_address"],
        password=ray.ray_constants.REDIS_DEFAULT_PASSWORD)

    gcs_server_addr = gcs_utils.get_gcs_address_from_redis(redis)

    num_messages = 100

    errors = []

    def receive_errors():
        # Collect error messages until the expected count has arrived.
        subscriber = GcsErrorSubscriber(address=gcs_server_addr)
        subscriber.subscribe()
        while len(errors) < num_messages:
            _, msg = subscriber.poll()
            errors.append(msg)

    logs = []

    def receive_logs():
        # Collect log batches until the expected count has arrived.
        subscriber = GcsLogSubscriber(address=gcs_server_addr)
        subscriber.subscribe()
        while len(logs) < num_messages:
            log_batch = subscriber.poll()
            logs.append(log_batch)

    # Daemon threads: a receiver that never collects all messages keeps
    # looping after the join timeout below. A non-daemon thread in that state
    # would block interpreter exit once the assertion has failed.
    t1 = threading.Thread(target=receive_errors, daemon=True)
    t1.start()

    t2 = threading.Thread(target=receive_logs, daemon=True)
    t2.start()

    publisher = GcsPublisher(address=gcs_server_addr)
    for i in range(num_messages):
        publisher.publish_error(b"msg_id",
                                ErrorTableData(error_message=f"error {i}"))
        publisher.publish_logs({
            "ip": "127.0.0.1",
            "pid": "gcs",
            "job": "0001",
            "is_err": False,
            "lines": [f"line {i}"],
            "actor_name": "test actor",
            "task_name": "test task",
        })

    t1.join(timeout=10)
    assert not t1.is_alive(), len(errors)
    assert len(errors) == num_messages, len(errors)

    t2.join(timeout=10)
    assert not t2.is_alive(), len(logs)
    assert len(logs) == num_messages, len(logs)

    # Both lists were just asserted to hold exactly num_messages entries, so
    # zipping them checks every published message, in order.
    for i, (error, log_batch) in enumerate(zip(errors, logs)):
        assert error.error_message == f"error {i}"
        assert log_batch["lines"][0] == f"line {i}"