def test_operator_redis_password(self):
    """Check that a custom Redis password in Ray start commands is rejected.

    A cluster config whose head ``ray start`` command carries
    ``--redis-password`` is expected to make
    ``check_redis_password_not_specified`` raise ``ValueError`` with the
    operator's error message. The same config without that flag is expected
    to pass the check without raising.
    """
    stop = "ray stop"
    base_start = ("ulimit -n 65536; ray start --head --no-monitor"
                  " --dashboard-host 0.0.0.0")
    expected_error = ("name,namespace:The Ray Kubernetes Operator does"
                      " not support setting a custom Redis password in"
                      " Ray start commands.")

    # Case 1: the start command sets a Redis password -> must be rejected.
    config_with_password = {
        "head_start_ray_commands": [
            stop,
            base_start + " --redis-password 1234567",
        ]
    }
    with pytest.raises(ValueError, match=expected_error):
        check_redis_password_not_specified(config_with_password, "name",
                                           "namespace")

    # Case 2: no Redis password flag -> the check must not raise.
    config_without_password = {"head_start_ray_commands": [stop, base_start]}
    check_redis_password_not_specified(config_without_password, "name",
                                       "namespace")
def _create_or_update_cluster(cluster_cr_body,
                              name,
                              namespace,
                              memo,
                              restart_ray=False):
    """Bring the Ray cluster described by a RayCluster resource up to date.

    The custom resource is converted to a Ray autoscaling config, validated,
    and applied to the ``RayCluster`` object kept in ``memo`` (one is created
    and stored there on first use). Status updates are pushed onto
    ``cluster_status_q`` before and after the operation.

    Args:
        cluster_cr_body: The body of the K8s RayCluster resource describing
            a Ray cluster.
        name: The name of the Ray cluster.
        namespace: The K8s namespace in which the Ray cluster runs.
        memo: kopf memo state for this Ray cluster.
        restart_ray: Passed through to ``RayCluster.create_or_update``; Ray
            processes are only restarted if this is true.
    """
    # Translate the custom resource into a Ray autoscaling config.
    config = operator_utils.cr_to_config(cluster_cr_body)

    # A custom Redis password in the Ray start commands is not supported by
    # the K8s operator, so reject such configs up front.
    operator_utils.check_redis_password_not_specified(config, name, namespace)

    # Reuse the RayCluster object remembered for this resource, creating and
    # remembering a new one if none exists yet.
    cluster = memo.get("ray_cluster")
    if cluster is None:
        cluster = RayCluster(config)
        memo.ray_cluster = cluster

    # Report that a "create-or-update" is in progress.
    cluster_status_q.put((name, namespace, STATUS_UPDATING))

    # Hand the (possibly changed) autoscaling config to the cluster object
    # for use by the Ray autoscaler.
    cluster.set_config(config)

    # Launch the Ray cluster by running the initialization commands on the
    # pod. The cluster is only restarted when restart_ray is set.
    cluster.create_or_update(restart_ray=restart_ray)

    # Report that the head is up and the monitor is running.
    cluster_status_q.put((name, namespace, STATUS_RUNNING))
def cluster_action(event_type: str, cluster_cr: Dict[str, Any],
                   cluster_name: str, cluster_namespace: str) -> None:
    """React to an ADDED, MODIFIED or DELETED event for a RayCluster resource.

    Args:
        event_type: The kind of event; "ADDED", "MODIFIED" and "DELETED" are
            handled, any other value is ignored after the config conversion.
        cluster_cr: The RayCluster custom resource the event refers to.
        cluster_name: Name of the cluster. It is overwritten with the
            ``cluster_name`` entry of the converted config.
        cluster_namespace: The K8s namespace of the cluster.
    """
    config = operator_utils.cr_to_config(cluster_cr)
    # The name recorded in the converted config takes precedence over the
    # one passed in by the caller.
    cluster_name = config["cluster_name"]
    key = (cluster_name, cluster_namespace)

    if event_type == "ADDED":
        operator_utils.check_redis_password_not_specified(config, key)
        cluster_status_q.put((cluster_name, cluster_namespace, "Running"))
        new_cluster = RayCluster(config)
        # Remember metadata.generation so later spec changes can be detected.
        new_cluster.set_generation(cluster_cr["metadata"]["generation"])
        new_cluster.create_or_update()
        ray_clusters[key] = new_cluster
        return

    if event_type == "MODIFIED":
        tracked = ray_clusters[key]
        observed_generation = cluster_cr["metadata"]["generation"]
        # A larger metadata.generation than the one recorded signals a spec
        # change; anything else requires no action.
        spec_changed = observed_generation > tracked.get_generation()
        if not spec_changed:
            return
        tracked.set_generation(observed_generation)
        tracked.set_config(config)
        tracked.create_or_update()
        return

    if event_type == "DELETED":
        # Clean up first; the entry is only dropped once clean-up returned.
        ray_clusters[key].clean_up()
        del ray_clusters[key]
def cluster_action(event_type: str, cluster_cr: Dict[str, Any],
                   cluster_name: str, cluster_namespace: str) -> None:
    """React to an ADDED, MODIFIED or DELETED event for a RayCluster resource.

    On "ADDED" a new ``RayCluster`` is created, launched and registered in
    ``ray_clusters``. On "MODIFIED" the registered cluster is updated when
    the resource's spec changed, and additionally has its Ray processes
    restarted when the retry counter in the resource's status increased.
    On "DELETED" the registered cluster is cleaned up and unregistered.
    Status transitions are pushed onto ``cluster_status_q``.

    Args:
        event_type: The kind of event; other values than the three above are
            ignored after the config conversion.
        cluster_cr: The RayCluster custom resource the event refers to.
        cluster_name: The name of the Ray cluster.
        cluster_namespace: The K8s namespace of the cluster.
    """
    config = operator_utils.cr_to_config(cluster_cr)
    key = (cluster_name, cluster_namespace)
    log_prefix = ",".join(key)

    if event_type == "ADDED":
        operator_utils.check_redis_password_not_specified(config, key)
        cluster_status_q.put(
            (cluster_name, cluster_namespace, STATUS_UPDATING))
        new_cluster = RayCluster(config)
        # Remember metadata.generation so later spec changes can be detected.
        new_cluster.set_generation(cluster_cr["metadata"]["generation"])
        logger.info(f"{log_prefix}: Launching cluster.")
        new_cluster.create_or_update()
        ray_clusters[key] = new_cluster
        cluster_status_q.put((cluster_name, cluster_namespace, STATUS_RUNNING))
        return

    if event_type == "DELETED":
        # Clean up first; the entry is only dropped once clean-up returned.
        ray_clusters[key].clean_up()
        del ray_clusters[key]
        return

    if event_type != "MODIFIED":
        return

    tracked = ray_clusters[key]
    observed_generation = cluster_cr["metadata"]["generation"]
    # The retry counter lives in the resource's status; a missing status or
    # missing counter is treated as zero retries.
    observed_retries = cluster_cr.get("status", {}).get(
        AUTOSCALER_RETRIES_FIELD, 0)

    # A larger metadata.generation than the one recorded means the spec of
    # the custom resource was changed.
    spec_changed = observed_generation > tracked.get_generation()
    # A larger retry counter than the one recorded means the monitor has
    # failed and Ray processes need to be restarted.
    restart_needed = observed_retries > tracked.get_num_retries()

    if not (spec_changed or restart_needed):
        # Neither a spec change nor a failure recovery: nothing to do.
        return

    if restart_needed:
        logger.error(f"{log_prefix}: Failed, restarting cluster.")
        tracked.set_num_retries(observed_retries)
    if spec_changed:
        logger.info(f"{log_prefix}: Updating cluster.")
        tracked.set_generation(observed_generation)

    cluster_status_q.put((cluster_name, cluster_namespace, STATUS_UPDATING))
    tracked.set_config(config)
    # Ray itself is only restarted when recovering from a failure.
    tracked.create_or_update(restart_ray=restart_needed)
    cluster_status_q.put((cluster_name, cluster_namespace, STATUS_RUNNING))