def _configure_services(namespace, provider_config):
    """Create or update every service listed in the provider config.

    For each entry under ``provider_config["services"]`` the entry's
    ``metadata.namespace`` is defaulted to ``namespace``. The service is then
    looked up by name: it is created when absent and patched when a service
    of that name already exists and differs from the configured one.

    Args:
        namespace: Namespace every configured service must belong to.
        provider_config: Provider section of the cluster config. When it has
            no ``"services"`` key only a log line is emitted. Entries are
            modified in place when their namespace is defaulted.

    Raises:
        InvalidNamespaceError: If a service declares a namespace other than
            ``namespace``.
    """
    service_field = "services"
    if service_field not in provider_config:
        logger.info(log_prefix + not_provided_msg(service_field))
        return

    for service in provider_config[service_field]:
        metadata = service["metadata"]
        if "namespace" not in metadata:
            metadata["namespace"] = namespace
        elif metadata["namespace"] != namespace:
            raise InvalidNamespaceError(service_field, namespace)

        name = metadata["name"]
        field_selector = "metadata.name={}".format(name)
        # Kept under its own name so the list of configured services being
        # iterated is never rebound inside the loop.
        existing_services = core_api().list_namespaced_service(
            namespace, field_selector=field_selector).items

        if not existing_services:
            logger.info(log_prefix + not_found_msg("service", name))
            core_api().create_namespaced_service(namespace, service)
            logger.info(log_prefix + created_msg("service", name))
            continue

        assert len(existing_services) == 1
        if service == existing_services[0]:
            logger.info(log_prefix + using_existing_msg("service", name))
            # Move on to the next service; returning here would leave every
            # later entry in the config unconfigured.
            continue

        logger.info(log_prefix + updating_existing_msg("service", name))
        core_api().patch_namespaced_service(name, namespace, service)
def _configure_namespace(provider_config):
    """Make sure the namespace named in the provider config exists.

    The namespace is looked up by name. If the lookup itself fails with an
    ``ApiException`` the check is skipped (a warning is logged) and the name
    is returned as-is. Otherwise the namespace is created when no match is
    found.

    Args:
        provider_config: Provider section of the cluster config; must
            contain a ``"namespace"`` key.

    Returns:
        The namespace name taken from ``provider_config``.

    Raises:
        ValueError: If ``provider_config`` has no ``"namespace"`` key.
    """
    namespace_field = "namespace"
    if namespace_field not in provider_config:
        raise ValueError("Must specify namespace in Kubernetes config.")

    namespace = provider_config[namespace_field]
    selector = "metadata.name={}".format(namespace)
    try:
        listing = core_api().list_namespace(field_selector=selector)
        matches = listing.items
    except ApiException:
        # Listing failed; carry on without verifying that the namespace
        # exists.
        logger.warning(log_prefix +
                       not_checking_msg(namespace_field, namespace))
        return namespace

    if len(matches) == 0:
        logger.info(log_prefix + not_found_msg(namespace_field, namespace))
        new_namespace = client.V1Namespace(
            metadata=client.V1ObjectMeta(name=namespace))
        core_api().create_namespace(new_namespace)
        logger.info(log_prefix + created_msg(namespace_field, namespace))
    else:
        assert len(matches) == 1
        logger.info(log_prefix +
                    using_existing_msg(namespace_field, namespace))

    return namespace
def terminate_node(self, node_id):
    """Delete the pod named ``node_id`` and its same-named service/ingress.

    All three objects are addressed by the name ``node_id`` inside
    ``self.namespace``. A pod that is already gone (HTTP 404) is logged and
    tolerated so that the service and ingress are still cleaned up. Service
    and ingress deletion is best effort: any ``ApiException`` from those two
    calls is ignored.

    Args:
        node_id: Name of the pod to delete.

    Raises:
        ApiException: If deleting the pod fails with a status other than 404.
    """
    logger.info(log_prefix + "calling delete_namespaced_pod")
    try:
        core_api().delete_namespaced_pod(node_id, self.namespace)
    except ApiException as e:
        if e.status != 404:
            raise
        logger.warning(log_prefix + f"Tried to delete pod {node_id},"
                       " but the pod was not found (404).")

    # Best effort: presumably not every node has a service, so a failed
    # delete here is deliberately not treated as an error.
    try:
        core_api().delete_namespaced_service(node_id, self.namespace)
    except ApiException:
        pass

    # Best effort, same reasoning as for the service above.
    try:
        extensions_beta_api().delete_namespaced_ingress(
            node_id,
            self.namespace,
        )
    except ApiException:
        pass
def create_node(self, node_config, tags, count):
    """Create ``count`` pods and, when configured, a service/ingress per pod.

    The pod spec is ``node_config["pod"]`` when that key exists, otherwise
    ``node_config`` itself. One service is created per new pod when
    ``node_config`` has a ``"service"`` entry, and one ingress per new
    service when it has an ``"ingress"`` entry. Each service is named after
    its pod and each ingress after its service.

    Args:
        node_config: Node configuration dict. It is deep-copied, so the
            caller's dict is left untouched.
        tags: Labels to put on the pods. Updated in place with the cluster
            name and a fresh ``"ray-node-uuid"`` value.
        count: Number of pods to create.
    """
    import copy  # local import: keeps this block self-contained

    # Deep copy: the specs are edited below (namespace, labels, names,
    # selector). With a shallow copy those edits would land in the caller's
    # nested dicts, so labels from one call would leak into the next.
    conf = copy.deepcopy(node_config)
    pod_spec = conf.get("pod", conf)
    service_spec = conf.get("service")
    ingress_spec = conf.get("ingress")

    # A single uuid is generated per call and shared by all ``count`` pods.
    node_uuid = str(uuid4())
    tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
    tags["ray-node-uuid"] = node_uuid

    pod_metadata = pod_spec["metadata"]
    pod_metadata["namespace"] = self.namespace
    if "labels" in pod_metadata:
        pod_metadata["labels"].update(tags)
    else:
        pod_metadata["labels"] = tags

    logger.info(log_prefix + "calling create_namespaced_pod "
                "(count={}).".format(count))
    new_nodes = []
    for _ in range(count):
        pod = core_api().create_namespaced_pod(self.namespace, pod_spec)
        new_nodes.append(pod)

    new_svcs = []
    if service_spec is not None:
        logger.info(log_prefix + "calling create_namespaced_service "
                    "(count={}).".format(count))
        for new_node in new_nodes:
            # The service takes the name of the pod it belongs to.
            metadata = service_spec.get("metadata", {})
            metadata["name"] = new_node.metadata.name
            service_spec["metadata"] = metadata
            # NOTE(review): the selector uses the per-call uuid, so with
            # count > 1 it presumably matches every pod created by this
            # call rather than a single pod -- confirm this is intended.
            service_spec["spec"]["selector"] = {"ray-node-uuid": node_uuid}
            svc = core_api().create_namespaced_service(
                self.namespace, service_spec)
            new_svcs.append(svc)

    if ingress_spec is not None:
        logger.info(log_prefix + "calling create_namespaced_ingress "
                    "(count={}).".format(count))
        for new_svc in new_svcs:
            # The ingress takes the name of the service it fronts.
            metadata = ingress_spec.get("metadata", {})
            metadata["name"] = new_svc.metadata.name
            ingress_spec["metadata"] = metadata
            ingress_spec = _add_service_name_to_service_port(
                ingress_spec, new_svc.metadata.name)
            extensions_beta_api().create_namespaced_ingress(
                self.namespace, ingress_spec)
def get_ray_head_pod_ip(config: Dict[str, Any]) -> str:
    """Return the pod IP of the cluster's head pod.

    Pods in ``RAY_CLUSTER_NAMESPACE`` are filtered by the labels
    ``component=ray-head`` and ``ray-cluster-name=<cluster name>``; exactly
    one pod is expected to match (enforced with an ``assert``).

    Args:
        config: Cluster config; its ``"cluster_name"`` entry is read.

    Returns:
        ``status.pod_ip`` of the single matching pod.
    """
    cluster_name = config["cluster_name"]
    head_selector = f"component=ray-head,ray-cluster-name={cluster_name}"
    response = core_api().list_namespaced_pod(
        namespace=RAY_CLUSTER_NAMESPACE, label_selector=head_selector)
    matching_pods = response.items
    assert (len(matching_pods)) == 1
    # Exactly one element, so the last entry is the head pod.
    return matching_pods[-1].status.pod_ip
def terminate_node(self, node_id):
    """Delete the pod named ``node_id`` plus its same-named service/ingress.

    A 404 from the pod deletion is logged as a warning and otherwise
    ignored; any other ``ApiException`` from that call is re-raised. The
    service and ingress deletions that follow ignore every ``ApiException``.

    Args:
        node_id: Name of the pod (and of its service and ingress).

    Raises:
        ApiException: If deleting the pod fails with a status other than 404.
    """
    logger.info(log_prefix + "calling delete_namespaced_pod")
    try:
        core_api().delete_namespaced_pod(node_id, self.namespace)
    except ApiException as err:
        if err.status != 404:
            raise
        logger.warning(log_prefix + f"Tried to delete pod {node_id},"
                       " but the pod was not found (404).")

    # Companion objects are removed on a best-effort basis. The lambdas
    # defer each API lookup until its turn, inside the ``try``.
    companion_deleters = (
        lambda: core_api().delete_namespaced_service(
            node_id, self.namespace),
        lambda: extensions_beta_api().delete_namespaced_ingress(
            node_id,
            self.namespace,
        ),
    )
    for delete_companion in companion_deleters:
        try:
            delete_companion()
        except ApiException:
            pass
def prepare_ray_cluster_config() -> str:
    """Write the cluster config held in the Ray ConfigMap to a local file.

    Reads the ConfigMap ``RAY_CONFIG_MAP`` from ``RAY_CLUSTER_NAMESPACE``.
    Each value is parsed as YAML, its ``provider.namespace`` is overwritten
    with ``RAY_CLUSTER_NAMESPACE``, and the result is dumped to
    ``RAY_CONFIG_DIR/<key>``.

    Returns:
        Path of the file written. The ConfigMap is expected to hold a single
        key/value pair; if it holds several, each is written and the path of
        the last one is returned.

    Raises:
        ValueError: If the ConfigMap carries no data entries.
    """
    config_map = core_api().read_namespaced_config_map(
        name=RAY_CONFIG_MAP, namespace=RAY_CLUSTER_NAMESPACE)

    entries = config_map.data
    if not entries:
        # Fail with a clear message instead of an UnboundLocalError (empty
        # data) or AttributeError (data is None) further down.
        raise ValueError(
            f"ConfigMap {RAY_CONFIG_MAP} in namespace "
            f"{RAY_CLUSTER_NAMESPACE} contains no cluster config.")

    # config_map.data consists of a single key:value pair
    for config_file_name, config_string in entries.items():
        config = yaml.safe_load(config_string)
        config["provider"]["namespace"] = RAY_CLUSTER_NAMESPACE
        cluster_config_path = os.path.join(RAY_CONFIG_DIR, config_file_name)
        with open(cluster_config_path, "w") as file:
            yaml.dump(config, file)

    return cluster_config_path
def _configure_autoscaler_service_account(namespace, provider_config):
    """Create the autoscaler service account unless it already exists.

    The account definition is read from
    ``provider_config["autoscaler_service_account"]``. Its
    ``metadata.namespace`` is defaulted to ``namespace`` (modifying the
    config entry in place), and the account is looked up by name before
    being created.

    Args:
        namespace: Namespace the service account must belong to.
        provider_config: Provider section of the cluster config. When the
            account key is missing only a log line is emitted.

    Raises:
        InvalidNamespaceError: If the account declares a namespace other
            than ``namespace``.
    """
    account_field = "autoscaler_service_account"
    if account_field not in provider_config:
        logger.info(log_prefix + not_provided_msg(account_field))
        return

    account = provider_config[account_field]
    metadata = account["metadata"]
    # Fill in the namespace when absent; otherwise it has to match.
    declared_namespace = metadata.setdefault("namespace", namespace)
    if declared_namespace != namespace:
        raise InvalidNamespaceError(account_field, namespace)

    name = metadata["name"]
    selector = "metadata.name={}".format(name)
    existing = core_api().list_namespaced_service_account(
        namespace, field_selector=selector).items

    if len(existing) == 0:
        logger.info(log_prefix + not_found_msg(account_field, name))
        core_api().create_namespaced_service_account(namespace, account)
        logger.info(log_prefix + created_msg(account_field, name))
        return

    assert len(existing) == 1
    logger.info(log_prefix + using_existing_msg(account_field, name))
def non_terminated_nodes(self, tag_filters):
    """List the names of this cluster's pods that have not terminated.

    Args:
        tag_filters: Label filters to apply. Updated in place with this
            provider's cluster name before being turned into a label
            selector.

    Returns:
        A list with ``metadata.name`` of every pod returned by the query.
    """
    # The goal is to keep pods in the 'Pending' or 'Running' phase. Field
    # selectors have no OR operator, so every other phase is excluded
    # instead.
    excluded_phases = (
        "status.phase!=Failed",
        "status.phase!=Unknown",
        "status.phase!=Succeeded",
        "status.phase!=Terminating",
    )
    phase_selector = ",".join(excluded_phases)

    tag_filters[TAG_RAY_CLUSTER_NAME] = self.cluster_name
    labels = to_label_selector(tag_filters)

    response = core_api().list_namespaced_pod(
        self.namespace,
        field_selector=phase_selector,
        label_selector=labels)

    names = []
    for pod in response.items:
        names.append(pod.metadata.name)
    return names
def _set_node_tags(self, node_id, tags):
    """Merge ``tags`` into the labels of pod ``node_id`` and patch the pod.

    Args:
        node_id: Name of the pod in ``self.namespace``.
        tags: Mapping of labels to add or overwrite.
    """
    namespace = self.namespace
    current_pod = core_api().read_namespaced_pod(node_id, namespace)
    labels = current_pod.metadata.labels
    labels.update(tags)
    core_api().patch_namespaced_pod(node_id, namespace, current_pod)
def internal_ip(self, node_id):
    """Return ``status.pod_ip`` of the pod named ``node_id``."""
    return core_api().read_namespaced_pod(
        node_id, self.namespace).status.pod_ip
def node_tags(self, node_id):
    """Return ``metadata.labels`` of the pod named ``node_id``."""
    return core_api().read_namespaced_pod(
        node_id, self.namespace).metadata.labels
def is_terminated(self, node_id):
    """Return True when pod ``node_id`` is neither Running nor Pending."""
    phase = core_api().read_namespaced_pod(
        node_id, self.namespace).status.phase
    still_alive = phase in ["Running", "Pending"]
    return not still_alive
def is_running(self, node_id):
    """Return True when the phase of pod ``node_id`` is ``"Running"``."""
    phase = core_api().read_namespaced_pod(
        node_id, self.namespace).status.phase
    return phase == "Running"