def status_handling_loop():
    """Apply queued RayCluster status updates forever.

    Each item taken from ``cluster_status_q`` is unpacked into
    ``(cluster_name, cluster_namespace, phase)`` and forwarded to
    ``operator_utils.set_status``. A failure for one item is logged and
    the loop moves on to the next item; the function never returns on
    its own.
    """
    while True:
        cluster_name, cluster_namespace, phase = cluster_status_q.get()
        try:
            operator_utils.set_status(cluster_name, cluster_namespace, phase)
        except Exception:
            # str.join takes ONE iterable; passing the two names as separate
            # positional arguments raised TypeError inside this handler,
            # masking the original error and terminating the loop.
            log_prefix = ",".join([cluster_name, cluster_namespace])
            logger.exception(f"{log_prefix}: Error setting RayCluster status.")
def handle_event(event_type, cluster_cr, cluster_name):
    """Run ``cluster_action`` for an event and flag the cluster on failure.

    Any ``Exception`` raised by ``cluster_action`` is reported through
    ``logger.exception`` and the status "Error" is then recorded via
    ``operator_utils.set_status``. The exception is not re-raised.
    """
    # TODO: This only detects errors in the parent process and thus doesn't
    # catch cluster-specific autoscaling failures. Fix that (perhaps at
    # the same time that we eliminate subprocesses).
    try:
        cluster_action(event_type, cluster_cr, cluster_name)
    except Exception:
        failure_message = f"Error while updating RayCluster {cluster_name}."
        logger.exception(failure_message)
        operator_utils.set_status(cluster_cr, cluster_name, "Error")
def status_handling_loop(queue: mp.Queue):
    """Drain status updates from ``queue`` until a ``None`` item arrives.

    Every other item is unpacked as ``(name, namespace, phase)`` and passed
    to ``operator_utils.set_status``. If that call raises, the error is
    logged and processing continues with the next item.
    """
    # TODO: Status will not be set if Operator restarts after `queue.put`
    # but before `set_status`.
    while True:
        message = queue.get()
        if message is None:
            # A None item is the signal to stop.
            return
        name, namespace, phase = message
        try:
            operator_utils.set_status(name, namespace, phase)
        except Exception:
            prefix = ",".join((name, namespace))
            logger.exception(f"{prefix}: Error setting RayCluster status.")
def cluster_action(event_type, cluster_cr, cluster_name) -> None:
    """Apply one RayCluster custom-resource event to the tracked clusters.

    The ``cluster_name`` argument is replaced by the ``"cluster_name"``
    entry of the config derived from ``cluster_cr``.

    * ``"ADDED"``: records status "Running", registers a new ``RayCluster``
      in ``ray_clusters``, calls ``create_or_update`` on it and remembers
      ``metadata.generation`` in ``last_generation``.
    * ``"MODIFIED"``: re-applies the config only when ``metadata.generation``
      is greater than the remembered value.
    * ``"DELETED"``: calls ``clean_up`` and drops both bookkeeping entries.

    Any other event type is a no-op.
    """
    config = operator_utils.cr_to_config(cluster_cr)
    cluster_name = config["cluster_name"]

    if event_type == "ADDED":
        operator_utils.set_status(cluster_cr, cluster_name, "Running")
        # Register the cluster before starting it, so it stays tracked even
        # if create_or_update raises.
        created = RayCluster(config)
        ray_clusters[cluster_name] = created
        created.create_or_update()
        last_generation[cluster_name] = cluster_cr["metadata"]["generation"]
        return

    if event_type == "MODIFIED":
        # Check metadata.generation to determine if there's a spec change.
        observed_generation = cluster_cr["metadata"]["generation"]
        if observed_generation > last_generation[cluster_name]:
            tracked = ray_clusters[cluster_name]
            tracked.set_config(config)
            tracked.create_or_update()
            last_generation[cluster_name] = observed_generation
        return

    if event_type == "DELETED":
        # Entries are removed only after clean_up has returned.
        doomed = ray_clusters[cluster_name]
        doomed.clean_up()
        del ray_clusters[cluster_name]
        del last_generation[cluster_name]
def cluster_action(event_type: str, cluster_cr: Dict[str, Any],
                   cluster_name: str, cluster_namespace: str) -> None:
    """Apply one RayCluster custom-resource event to the tracked clusters.

    Clusters are tracked in ``ray_clusters`` under the key
    ``(cluster_name, cluster_namespace)``, where ``cluster_name`` is taken
    from the config derived from ``cluster_cr`` rather than from the
    argument of the same name.

    Args:
        event_type: "ADDED", "MODIFIED" or "DELETED"; anything else is a
            no-op.
        cluster_cr: The custom resource; ``metadata.generation`` is read
            from it for "ADDED" and "MODIFIED".
        cluster_name: Overwritten by the config's "cluster_name" entry.
        cluster_namespace: Second component of the tracking key.
    """
    config = operator_utils.cr_to_config(cluster_cr)
    cluster_name = config["cluster_name"]
    key = (cluster_name, cluster_namespace)

    if event_type == "ADDED":
        operator_utils.set_status(cluster_cr, cluster_name,
                                  cluster_namespace, "Running")
        fresh = RayCluster(config)
        # Track changes to the custom resource's spec field:
        fresh.set_generation(cluster_cr["metadata"]["generation"])
        fresh.create_or_update()
        # The cluster is registered only after create_or_update returned.
        ray_clusters[key] = fresh
        return

    if event_type == "MODIFIED":
        tracked = ray_clusters[key]
        # Check metadata.generation to determine if there's a spec change.
        observed_generation = cluster_cr["metadata"]["generation"]
        # Only update if there's been a change to the spec.
        if observed_generation > tracked.get_generation():
            tracked.set_generation(observed_generation)
            tracked.set_config(config)
            tracked.create_or_update()
        return

    if event_type == "DELETED":
        doomed = ray_clusters[key]
        doomed.clean_up()
        del ray_clusters[key]
def status_handling_loop():
    """Apply queued RayCluster status updates forever.

    Each item taken from ``cluster_status_q`` is unpacked into
    ``(cluster_name, cluster_namespace, status)`` and forwarded to
    ``operator_utils.set_status``. The function never returns on its own.
    """
    while True:
        cluster_name, cluster_namespace, status = cluster_status_q.get()
        try:
            operator_utils.set_status(cluster_name, cluster_namespace, status)
        except Exception:
            # Without this guard one failing update would end the loop and
            # every later status change would be dropped silently.
            log_prefix = ",".join([cluster_name, cluster_namespace])
            logger.exception(f"{log_prefix}: Error setting RayCluster status.")