Example No. 1
def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
    """Wait until the Ray head container is ready. Then start the autoscaler."""
    _setup_logging()
    head_ip = get_node_ip_address()
    ray_address = f"{head_ip}:6379"
    while True:
        try:
            subprocess.check_call(
                ["ray", "health-check", "--address", ray_address])
            logger.info("The Ray head is ready. Starting the autoscaler.")
            break
        except subprocess.CalledProcessError:
            logger.warning("The Ray head is not yet ready.")
            logger.warning(f"Will check again in {BACKOFF_S} seconds.")
            time.sleep(BACKOFF_S)

    # autoscaling_config_producer reads the RayCluster CR from K8s and uses the CR
    # to output an autoscaling config.
    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace)

    Monitor(
        address=ray_address,
        # The `autoscaling_config` arg can be a dict or a `Callable: () -> dict`.
        # In this case, it's a callable.
        autoscaling_config=autoscaling_config_producer,
        monitor_ip=head_ip,
    ).run()
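
The autoscaling_config argument above is a zero-argument callable that returns the
autoscaling config as a dict. A minimal sketch of such a callable, using illustrative
placeholder keys rather than the exact schema AutoscalingConfigProducer derives from
the RayCluster CR:

def make_static_config_producer(cluster_name: str):
    """Build a () -> dict callable in the shape Monitor accepts.

    The keys below are illustrative placeholders only.
    """

    def produce_config() -> dict:
        return {
            "cluster_name": cluster_name,
            "max_workers": 10,
            "idle_timeout_minutes": 5,
        }

    return produce_config
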
Example No. 2
def _run_autoscaler(cluster_name: str,
                    cluster_namespace: str,
                    redis_password: str = ""):
    _setup_logging()
    head_ip = get_node_ip_address()

    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace)

    Monitor(
        address=f"{head_ip}:6379",
        redis_password=redis_password,
        # The `autoscaling_config` arg can be a dict or a `Callable: () -> dict`.
        # In this case, it's a callable.
        autoscaling_config=autoscaling_config_producer,
        monitor_ip=head_ip,
    ).run()
Example No. 3
def run_kuberay_autoscaler(cluster_name: str, cluster_namespace: str):
    """Wait until the Ray head container is ready. Then start the autoscaler."""
    head_ip = get_node_ip_address()
    ray_address = f"{head_ip}:6379"
    while True:
        try:
            # Autoscaler Ray version might not exactly match GCS version, so skip the
            # version check when checking GCS status.
            subprocess.check_call(
                [
                    "ray",
                    "health-check",
                    "--address",
                    ray_address,
                    "--skip-version-check",
                ]
            )
            # Logging is not ready yet. Print to stdout for now.
            print("The Ray head is ready. Starting the autoscaler.")
            break
        except subprocess.CalledProcessError:
            print("The Ray head is not yet ready.")
            print(f"Will check again in {BACKOFF_S} seconds.")
            time.sleep(BACKOFF_S)

    # The Ray head container sets up the log directory. Thus, we set up logging
    # only after the Ray head is ready.
    _setup_logging()

    # autoscaling_config_producer reads the RayCluster CR from K8s and uses the CR
    # to output an autoscaling config.
    autoscaling_config_producer = AutoscalingConfigProducer(
        cluster_name, cluster_namespace
    )

    Monitor(
        address=ray_address,
        # The `autoscaling_config` arg can be a dict or a `Callable: () -> dict`.
        # In this case, it's a callable.
        autoscaling_config=autoscaling_config_producer,
        monitor_ip=head_ip,
        # Let the autoscaler process exit after it hits 5 exceptions.
        # (See ray.autoscaler._private.constants.AUTOSCALER_MAX_NUM_FAILURES.)
        # Kubernetes will then restart the autoscaler container.
        retry_on_failure=False,
    ).run()
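
The readiness loop shared by these examples can be factored into a small helper. A
minimal sketch under the same assumptions as above; BACKOFF_S is replaced by a local
stand-in so the helper is self-contained:

import subprocess
import time

# Stand-in for the BACKOFF_S constant used in the examples above.
DEFAULT_BACKOFF_S = 5.0


def wait_for_gcs(ray_address: str, backoff_s: float = DEFAULT_BACKOFF_S) -> None:
    """Block until `ray health-check` reports the head's GCS as healthy."""
    while True:
        try:
            # --skip-version-check tolerates a minor version mismatch between
            # the autoscaler image and the GCS, as in Example No. 3.
            subprocess.check_call(
                [
                    "ray",
                    "health-check",
                    "--address",
                    ray_address,
                    "--skip-version-check",
                ]
            )
            return
        except subprocess.CalledProcessError:
            print(f"The Ray head is not yet ready. Retrying in {backoff_s} seconds.")
            time.sleep(backoff_s)
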
Example No. 4
def setup_logging():
    # The excerpt begins mid-function: the signature and the root-logger lookup
    # below are assumed so that the snippet reads as a complete function.
    root_logger = logging.getLogger("")
    root_logger.setLevel(logging.INFO)

    root_handler = logging.StreamHandler()
    root_handler.setLevel(logging.INFO)
    root_handler.setFormatter(logging.Formatter(ray_constants.LOGGER_FORMAT))

    root_logger.addHandler(root_handler)


if __name__ == "__main__":
    setup_logging()

    parser = argparse.ArgumentParser(description="Kuberay Autoscaler")
    parser.add_argument(
        "--redis-password",
        required=False,
        type=str,
        default=None,
        help="The password to use for Redis")
    args = parser.parse_args()

    cluster_name = yaml.safe_load(
        open(AUTOSCALING_CONFIG_PATH).read())["cluster_name"]
    head_ip = get_node_ip_address()
    Monitor(
        address=f"{head_ip}:6379",
        redis_password=args.redis_password,
        autoscaling_config=AUTOSCALING_CONFIG_PATH,
        monitor_ip=head_ip,
    ).run()
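
Example No. 4 reads the cluster name out of the autoscaling config file before handing
the same path to Monitor. A slightly more defensive version of that lookup, as a sketch
(the yaml import and the AUTOSCALING_CONFIG_PATH constant are the same ones used above):

def read_cluster_name(config_path: str) -> str:
    # Use a context manager so the file handle is closed, and fail with a
    # clear error if the expected key is missing from the config.
    with open(config_path) as f:
        config = yaml.safe_load(f)
    if "cluster_name" not in config:
        raise ValueError(f"'cluster_name' not found in {config_path}")
    return config["cluster_name"]

cluster_name = read_cluster_name(AUTOSCALING_CONFIG_PATH)
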
Example No. 5
def setup_monitor(address):
    # Positional arguments: the GCS address and the autoscaling config (None
    # here); the default Redis password is passed explicitly.
    monitor = Monitor(
        address, None, redis_password=ray_constants.REDIS_DEFAULT_PASSWORD)
    return monitor
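
setup_monitor only constructs the Monitor; the caller starts it afterwards. A minimal
usage sketch, where "127.0.0.1:6379" stands in for the head node's GCS address:

monitor = setup_monitor("127.0.0.1:6379")
monitor.run()
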
Example No. 6
    parser.add_argument(
        "--cluster-namespace",
        required=True,
        type=str,
        help="The Kubernetes namespace the Ray Cluster lives in.\n"
        "Should coincide with the `metadata.namespace` of the RayCluster CR.",
    )
    parser.add_argument(
        "--redis-password",
        required=False,
        type=str,
        default=None,
        help="The password to use for Redis",
    )
    args = parser.parse_args()

    head_ip = get_node_ip_address()

    autoscaling_config_producer = AutoscalingConfigProducer(
        args.cluster_name, args.cluster_namespace
    )

    Monitor(
        address=f"{head_ip}:6379",
        redis_password=args.redis_password,
        # The `autoscaling_config` arg can be a dict or a `Callable: () -> dict`.
        # In this case, it's a callable.
        autoscaling_config=autoscaling_config_producer,
        monitor_ip=head_ip,
    ).run()
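
This last excerpt uses args.cluster_name, so the parser presumably defines a
--cluster-name flag in the lines above the portion shown. A sketch of what that
definition likely looks like, mirroring the --cluster-namespace argument (the help
text is illustrative):

    # Assumed counterpart to the --cluster-namespace argument above.
    parser.add_argument(
        "--cluster-name",
        required=True,
        type=str,
        help="The name of the Ray Cluster.\n"
        "Should coincide with the `metadata.name` of the RayCluster CR.",
    )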