예제 #1
0
 def test_operator_redis_password(self):
     """Check that a Redis password in the head start commands is rejected.

     A head start command carrying ``--redis-password`` is expected to make
     ``check_redis_password_not_specified`` raise ``ValueError`` with the
     operator's explanatory message; the same command without that flag is
     expected to pass without raising.
     """
     stop_ray = "ray stop"
     plain_start = ("ulimit -n 65536; ray start --head --no-monitor"
                    " --dashboard-host 0.0.0.0")
     start_with_password = plain_start + " --redis-password 1234567"
     expected_error = ("name,namespace:The Ray Kubernetes Operator does"
                       " not support setting a custom Redis password in"
                       " Ray start commands.")

     # Password flag present: the check must raise.
     # NOTE: pytest applies `match` as a regular-expression search, so the
     # "." characters in the message match any character.
     rejected_config = {
         "head_start_ray_commands": [stop_ray, start_with_password]
     }
     with pytest.raises(ValueError, match=expected_error):
         check_redis_password_not_specified(rejected_config, "name",
                                            "namespace")

     # Password flag absent: the check must return without raising.
     accepted_config = {"head_start_ray_commands": [stop_ray, plain_start]}
     check_redis_password_not_specified(accepted_config, "name", "namespace")
예제 #2
0
def _create_or_update_cluster(cluster_cr_body,
                              name,
                              namespace,
                              memo,
                              restart_ray=False):
    """Bring the Ray cluster described by a RayCluster resource up to date.

    The custom resource is converted to an autoscaling config, validated,
    and handed to the RayCluster object kept on ``memo`` (created on first
    use). The cluster status is reported as updating before the work starts
    and as running once it has finished.

    Args:
        cluster_cr_body: The body of the K8s RayCluster resource describing
            a Ray cluster.
        name: The name of the Ray cluster.
        namespace: The K8s namespace in which the Ray cluster runs.
        memo: kopf memo state for this Ray cluster.
        restart_ray: Only restart cluster Ray processes if this is true.
    """
    # Translate the RayCluster custom resource into a Ray autoscaling config.
    autoscaling_config = operator_utils.cr_to_config(cluster_cr_body)

    # A custom Redis password is not supported by the K8s operator, so make
    # sure the user did not put one in the Ray start commands.
    operator_utils.check_redis_password_not_specified(
        autoscaling_config, name, namespace)

    # Reuse the RayCluster object stored on the memo; build and store one
    # if this is the first time we see this cluster.
    cluster = memo.get("ray_cluster")
    if cluster is None:
        cluster = memo.ray_cluster = RayCluster(autoscaling_config)

    # Report through status.phase that a "create-or-update" has started.
    cluster_status_q.put((name, namespace, STATUS_UPDATING))

    # Hand the (possibly changed) autoscaling config to the cluster object
    # so the Ray autoscaler picks it up.
    cluster.set_config(autoscaling_config)

    # Launch or update the Ray cluster. Ray processes are only restarted
    # when the caller asked for it via `restart_ray`.
    cluster.create_or_update(restart_ray=restart_ray)

    # Report through status.phase that the head is up and the monitor is
    # running.
    cluster_status_q.put((name, namespace, STATUS_RUNNING))
예제 #3
0
def cluster_action(event_type: str, cluster_cr: Dict[str, Any],
                   cluster_name: str, cluster_namespace: str) -> None:
    """React to an event on a RayCluster custom resource.

    Args:
        event_type: Kind of event; "ADDED", "MODIFIED" and "DELETED" are
            handled, anything else is ignored.
        cluster_cr: The RayCluster custom resource the event refers to.
        cluster_name: Name of the cluster. It is replaced by the
            "cluster_name" entry of the converted config.
        cluster_namespace: The K8s namespace of the cluster.
    """
    config = operator_utils.cr_to_config(cluster_cr)
    # The name recorded in the converted config takes precedence over the
    # one passed in by the caller.
    cluster_name = config["cluster_name"]
    key = (cluster_name, cluster_namespace)

    if event_type == "ADDED":
        operator_utils.check_redis_password_not_specified(config, key)

        cluster_status_q.put((cluster_name, cluster_namespace, "Running"))

        new_cluster = RayCluster(config)
        # Remember the resource generation so later MODIFIED events can be
        # compared against it.
        new_cluster.set_generation(cluster_cr["metadata"]["generation"])
        new_cluster.create_or_update()

        # Register the cluster only after it has been launched.
        ray_clusters[key] = new_cluster
        return

    if event_type == "MODIFIED":
        tracked = ray_clusters[key]
        # metadata.generation is the signal used to detect a spec change.
        latest_generation = cluster_cr["metadata"]["generation"]
        # Act only when the generation moved past the one we recorded.
        if latest_generation > tracked.get_generation():
            tracked.set_generation(latest_generation)
            tracked.set_config(config)
            tracked.create_or_update()
        return

    if event_type == "DELETED":
        # Clean up first; the registry entry is dropped afterwards.
        ray_clusters[key].clean_up()
        del ray_clusters[key]
예제 #4
0
파일: operator.py 프로젝트: supatomic/ray
def cluster_action(event_type: str, cluster_cr: Dict[str, Any],
                   cluster_name: str, cluster_namespace: str) -> None:
    """React to an event on a RayCluster custom resource.

    "ADDED" launches and registers a new cluster, "MODIFIED" updates a
    registered cluster when its spec changed or a Ray restart is required,
    and "DELETED" cleans the cluster up and unregisters it. Any other event
    type is ignored.

    Args:
        event_type: Kind of event ("ADDED", "MODIFIED" or "DELETED").
        cluster_cr: The RayCluster custom resource the event refers to.
        cluster_name: The name of the Ray cluster.
        cluster_namespace: The K8s namespace of the cluster.
    """

    def publish_status(phase):
        # Queue a status update for this cluster.
        cluster_status_q.put((cluster_name, cluster_namespace, phase))

    config = operator_utils.cr_to_config(cluster_cr)
    key = (cluster_name, cluster_namespace)
    log_prefix = ",".join(key)

    if event_type == "ADDED":
        operator_utils.check_redis_password_not_specified(config, key)

        publish_status(STATUS_UPDATING)

        new_cluster = RayCluster(config)
        # Remember the resource generation so later spec changes can be
        # detected.
        new_cluster.set_generation(cluster_cr["metadata"]["generation"])

        logger.info(f"{log_prefix}: Launching cluster.")
        new_cluster.create_or_update()

        # Register the cluster only after it has been launched.
        ray_clusters[key] = new_cluster

        publish_status(STATUS_RUNNING)
        return

    if event_type == "MODIFIED":
        tracked = ray_clusters[key]
        # metadata.generation is the signal used to detect a spec change.
        latest_generation = cluster_cr["metadata"]["generation"]
        # The retry counter is read from the resource's status; it is
        # treated as 0 when the status or the field is missing.
        retries = cluster_cr.get("status", {}).get(AUTOSCALER_RETRIES_FIELD,
                                                   0)

        # The spec changed if the generation moved past the recorded one.
        spec_changed = latest_generation > tracked.get_generation()
        # A restart is needed if the retry counter moved past the recorded
        # one, i.e. the monitor has failed since we last looked.
        needs_restart = retries > tracked.get_num_retries()

        if needs_restart:
            logger.error(f"{log_prefix}: Failed, restarting cluster.")
            tracked.set_num_retries(retries)
        if spec_changed:
            logger.info(f"{log_prefix}: Updating cluster.")
            tracked.set_generation(latest_generation)

        # Nothing to do unless the spec changed or we are recovering from
        # an autoscaler failure.
        if not spec_changed and not needs_restart:
            return

        publish_status(STATUS_UPDATING)
        tracked.set_config(config)
        # Ray processes are restarted only when there has been a failure.
        tracked.create_or_update(restart_ray=needs_restart)
        publish_status(STATUS_RUNNING)
        return

    if event_type == "DELETED":
        # Clean up first; the registry entry is dropped afterwards.
        ray_clusters[key].clean_up()
        del ray_clusters[key]