def test_crud_namespace(self):
    # given
    s = Staroid(
        access_token=os.environ["STAROID_ACCESS_TOKEN"],
        account=os.environ["STAROID_ACCOUNT"],
    )
    c = s.cluster().create("staroid-python it-test-namespace")

    # when a namespace is created
    ns_api = s.namespace(c)
    ns = ns_api.create("instance1", "GITHUB/staroids/namespace:master")

    # then the namespace becomes RUNNING
    wait_for_phase(ns_api, ns, "RUNNING")
    self.assertEqual("RUNNING", ns_api.get_by_id(ns.id()).phase())

    # start a shell
    ns_api.shell_start("instance1")
    resources = ns_api.get_all_resources("instance1")
    self.assertTrue(len(resources["services"]) > 0)

    # start a tunnel
    ns_api.start_tunnel("instance1", ["57683:localhost:57683"])

    # stop the tunnel
    ns_api.stop_tunnel("instance1")

    # stop the shell
    ns_api.shell_stop("instance1")

    # pause
    ns = ns_api.stop("instance1")
    wait_for_phase(ns_api, ns, "PAUSED")
    self.assertEqual("PAUSED", ns_api.get_by_id(ns.id()).phase())

    # resume
    ns = ns_api.start("instance1")
    wait_for_phase(ns_api, ns, "RUNNING")
    self.assertEqual("RUNNING", ns_api.get_by_id(ns.id()).phase())

    # when the namespace is deleted
    ns = ns_api.delete("instance1")

    # then the namespace becomes REMOVED
    wait_for_phase(ns_api, ns, "REMOVED")
    self.assertEqual("REMOVED", ns_api.get_by_id(ns.id()).phase())

    # clean up the test cluster
    s.cluster().delete("staroid-python it-test-namespace")
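The test relies on a `wait_for_phase` helper that is not defined in this excerpt. A minimal sketch of what it could look like, assuming the same polling pattern as `Ods.__wait_for_ns_phase` further below (the signature and the timeout default are assumptions, not the actual helper):

import time

def wait_for_phase(ns_api, ns, phase, timeout=600):
    # Hypothetical helper: poll the namespace by id until it reaches
    # the target phase or the timeout expires.
    start_time = time.time()
    while ns.phase() != phase:
        if time.time() - start_time > timeout:
            raise Exception("Timeout waiting for phase {}".format(phase))
        time.sleep(3)
        ns = ns_api.get_by_id(ns.id())
    return ns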
# Imports this excerpt needs. NodeProvider, TAG_RAY_CLUSTER_NAME,
# to_label_selector, find_free_port, _try_import_requests, logger,
# log_prefix and StaroidCommandRunner are provided by the surrounding
# Ray autoscaler module and are not imported here.
import os
import time
from uuid import uuid4

from kubernetes import client, config
from kubernetes.client.rest import ApiException
from staroid import Staroid


class StaroidNodeProvider(NodeProvider):
    def __init__(self, provider_config, cluster_name):
        NodeProvider.__init__(self, provider_config, cluster_name)
        self.__cached = {}
        self.__star = Staroid(
            access_token=provider_config["access_token"],
            account=provider_config["account"],
        )
        self.__ske = self._get_config_or_env(provider_config, "ske", "STAROID_SKE")
        self.__ske_region = self._get_config_or_env(
            provider_config, "ske_region", "STAROID_SKE_REGION"
        )
        self._requests_lib = _try_import_requests()

    def _get_config_or_env(self, config, config_key, env_name):
        value = None
        # check env first, so config can override env later
        if env_name in os.environ:
            value = os.environ[env_name]
        if config_key in config and config[config_key] is not None:
            value = config[config_key]
        return value

    def _connect_kubeapi_incluster(self, instance_name):
        if not os.path.isdir("/var/run/secrets/kubernetes.io/serviceaccount"):
            return None

        kube_conf = config.load_incluster_config()
        kube_client = client.ApiClient(kube_conf)
        with open(
            "/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r"
        ) as file:
            namespace = file.read().replace("\n", "")

        self.__cached[instance_name] = {"kube_client": kube_client, "api_server": None}
        self.namespace = namespace
        return kube_client

    def _connect_kubeapi(self, instance_name):
        if instance_name in self.__cached:
            return self.__cached[instance_name]["kube_client"]

        # try the in-cluster configuration first
        kube_client = self._connect_kubeapi_incluster(instance_name)
        if kube_client is not None:
            return kube_client

        # check whether the SKE exists
        cluster_api = self.__star.cluster()
        ske = cluster_api.get(self.__ske)
        if ske is None:  # the SKE does not exist
            return None

        # check whether the ray cluster instance exists
        ns_api = self.__star.namespace(ske)
        ns = ns_api.get(instance_name)
        if ns is None:  # the instance does not exist
            return None

        # check that the staroid namespace is not PAUSED (stopped)
        # or INACTIVE (terminated)
        if ns.status() != "ACTIVE":
            return None

        # wait for the staroid namespace to be started
        start_time = time.time()
        timeout = 300
        started = False
        while time.time() - start_time < timeout:
            if ns.phase() == "RUNNING":
                started = True
                break
            time.sleep(3)
            ns = ns_api.get(instance_name)

        if not started:
            logger.info(log_prefix + "failed to start namespace")
            return None

        # start a shell service to create a secure tunnel
        ns_api.shell_start(instance_name)

        local_port = find_free_port()
        # fixed port number for kube api access through
        # the shell service in staroid
        remote_port = 57683

        # start a secure tunnel
        ns_api.start_tunnel(
            instance_name, ["{}:localhost:{}".format(local_port, remote_port)]
        )

        # wait for the tunnel to be established by checking /version
        local_kube_api_addr = "http://localhost:{}".format(local_port)
        start_time = time.time()
        established = False
        while time.time() - start_time < timeout:
            try:
                r = self._requests_lib.get(
                    "{}/version".format(local_kube_api_addr), timeout=(3, 5)
                )
                if r.status_code == 200:
                    established = True
                    break
            except self._requests_lib.exceptions.ConnectionError:
                pass
            time.sleep(3)

        if established:
            kube_conf = client.Configuration()
            kube_conf.host = local_kube_api_addr
            kube_client = client.ApiClient(kube_conf)
            self.__cached[instance_name] = {
                "kube_client": kube_client,
                "api_server": local_kube_api_addr,
            }
            self.namespace = ns.namespace()
            return kube_client
        else:
            self.__cached[instance_name] = None
            return None

    def non_terminated_nodes(self, tag_filters):
        instance_name = self.cluster_name
        kube_client = self._connect_kubeapi(instance_name)
        if kube_client is None:
            return []

        core_api = client.CoreV1Api(kube_client)
        # Match pods that are in the 'Pending' or 'Running' phase.
        # Unfortunately there is no OR operator in field selectors, so we
        # have to match on NOT any of the other phases.
        field_selector = ",".join(
            [
                "status.phase!=Failed",
                "status.phase!=Unknown",
                "status.phase!=Succeeded",
                "status.phase!=Terminating",
            ]
        )

        tag_filters[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        label_selector = to_label_selector(tag_filters)
        pod_list = core_api.list_namespaced_pod(
            self.namespace,
            field_selector=field_selector,
            label_selector=label_selector,
        )
        return [pod.metadata.name for pod in pod_list.items]

    def is_running(self, node_id):
        kube_client = self.__cached[self.cluster_name]["kube_client"]
        core_api = client.CoreV1Api(kube_client)
        pod = core_api.read_namespaced_pod(node_id, self.namespace)
        return pod.status.phase == "Running"

    def is_terminated(self, node_id):
        kube_client = self.__cached[self.cluster_name]["kube_client"]
        core_api = client.CoreV1Api(kube_client)
        pod = core_api.read_namespaced_pod(node_id, self.namespace)
        return pod.status.phase not in ["Running", "Pending"]

    def node_tags(self, node_id):
        kube_client = self.__cached[self.cluster_name]["kube_client"]
        core_api = client.CoreV1Api(kube_client)
        pod = core_api.read_namespaced_pod(node_id, self.namespace)
        return pod.metadata.labels

    def external_ip(self, node_id):
        raise NotImplementedError("Must use internal IPs with Kubernetes.")

    def internal_ip(self, node_id):
        kube_client = self.__cached[self.cluster_name]["kube_client"]
        core_api = client.CoreV1Api(kube_client)
        pod = core_api.read_namespaced_pod(node_id, self.namespace)
        return pod.status.pod_ip

    def get_node_id(self, ip_address, use_internal_ip=True) -> str:
        if not use_internal_ip:
            raise ValueError("Must use internal IPs with Staroid.")
        return super().get_node_id(ip_address, use_internal_ip=use_internal_ip)

    def set_node_tags(self, node_id, tags):
        kube_client = self.__cached[self.cluster_name]["kube_client"]
        core_api = client.CoreV1Api(kube_client)

        max_retry = 10
        for i in range(max_retry):
            try:
                pod = core_api.read_namespaced_pod(node_id, self.namespace)
                pod.metadata.labels.update(tags)
                core_api.patch_namespaced_pod(node_id, self.namespace, pod)
                # the patch was applied; stop retrying
                break
            except ApiException as e:
                if e.status == 409 and max_retry - 1 > i:
                    # conflict: the pod was modified before the patch
                    # could be applied. retry
                    time.sleep(0.2)
                    continue
                raise e

    def create_node(self, node_config, tags, count):
        instance_name = self.cluster_name
        incluster = self._connect_kubeapi(instance_name)
        if incluster is None:
            # get or create the SKE
            cluster_api = self.__star.cluster()
            ske = cluster_api.create(self.__ske, self.__ske_region)
            if ske is None:
                raise Exception(
                    "Failed to create an SKE '{}' in '{}' region".format(
                        self.__ske, self.__ske_region
                    )
                )

            # create a namespace
            ns_api = self.__star.namespace(ske)
            ns = ns_api.create(
                instance_name,
                self.provider_config["project"],
                # Configure the 'start-head' param to 'false'.
                # The head node will be created using the Kubernetes api.
                params=[{"group": "Misc", "name": "start-head", "value": "false"}],
            )
            if ns is None:
                raise Exception(
                    "Failed to create a cluster '{}' in SKE '{}'".format(
                        instance_name, self.__ske
                    )
                )

            # 'ray down' will change the staroid namespace status to "PAUSE";
            # in this case we need to start the namespace again.
            if ns.status() == "PAUSE":
                ns = ns_api.start(instance_name)

        # kube client
        kube_client = self._connect_kubeapi(instance_name)
        core_api = client.CoreV1Api(kube_client)
        apps_api = client.AppsV1Api(kube_client)

        # retrieve the container image
        image = None
        if self.provider_config["image_from_project"]:
            ray_images = apps_api.read_namespaced_deployment(
                name="ray-images", namespace=self.namespace
            )
            py_ver = self.provider_config["python_version"].replace(".", "-")
            containers = ray_images.spec.template.spec.containers
            for c in containers:
                if py_ver in c.image:
                    image = c.image
                    break
            logger.info(log_prefix + "use image {}".format(image))

        # create the head node
        conf = node_config.copy()
        pod_spec = conf.get("pod", conf)
        service_spec = conf.get("service")
        node_uuid = str(uuid4())
        tags[TAG_RAY_CLUSTER_NAME] = self.cluster_name
        tags["ray-node-uuid"] = node_uuid

        pod_spec["metadata"]["namespace"] = self.namespace
        if "labels" in pod_spec["metadata"]:
            pod_spec["metadata"]["labels"].update(tags)
        else:
            pod_spec["metadata"]["labels"] = tags
        if "generateName" not in pod_spec["metadata"]:
            pod_spec["metadata"]["generateName"] = (
                "ray-" + pod_spec["metadata"]["labels"]["ray-node-type"] + "-"
            )
        if "component" not in pod_spec["metadata"]["labels"]:
            pod_spec["metadata"]["labels"]["component"] = (
                "ray-" + pod_spec["metadata"]["labels"]["ray-node-type"]
            )

        if image is not None:
            containers = pod_spec["spec"]["containers"]
            for c in containers:
                if c["name"] == "ray-node":
                    c["image"] = image
                    node_type = pod_spec["metadata"]["labels"]["ray-node-type"]
                    if node_type == "head":
                        if "STAROID_ACCESS_TOKEN" in os.environ:
                            c["env"].append(
                                {
                                    "name": "STAROID_ACCESS_TOKEN",
                                    "value": os.environ["STAROID_ACCESS_TOKEN"],
                                }
                            )
                        if "STAROID_ACCOUNT" in os.environ:
                            c["env"].append(
                                {
                                    "name": "STAROID_ACCOUNT",
                                    "value": os.environ["STAROID_ACCOUNT"],
                                }
                            )
                        if "STAROID_SKE" in os.environ:
                            c["env"].append(
                                {
                                    "name": "STAROID_SKE",
                                    "value": os.environ["STAROID_SKE"],
                                }
                            )

        logger.info(
            log_prefix + "calling create_namespaced_pod (count={}).".format(count)
        )
        new_nodes = []
        for _ in range(count):
            pod = core_api.create_namespaced_pod(self.namespace, pod_spec)
            new_nodes.append(pod)

        new_svcs = []
        if service_spec is not None:
            logger.info(
                log_prefix + "calling create_namespaced_service "
                "(count={}).".format(count)
            )
            for new_node in new_nodes:
                metadata = service_spec.get("metadata", {})
                metadata["name"] = new_node.metadata.name
                service_spec["metadata"] = metadata
                service_spec["spec"]["selector"] = {"ray-node-uuid": node_uuid}
                svc = core_api.create_namespaced_service(self.namespace, service_spec)
                new_svcs.append(svc)

    def terminate_node(self, node_id):
        logger.info(log_prefix + "calling delete_namespaced_pod")
        kube_client = self.__cached[self.cluster_name]["kube_client"]
        core_api = client.CoreV1Api(kube_client)
        core_api.delete_namespaced_pod(node_id, self.namespace)
        try:
            core_api.delete_namespaced_service(node_id, self.namespace)
        except ApiException:
            pass

        if node_id.startswith("ray-head"):
            # Stop the namespace on staroid after removing the ray-head node.
            instance_name = self.cluster_name
            cluster_api = self.__star.cluster()
            ske = cluster_api.get(self.__ske)
            ns_api = self.__star.namespace(ske)
            ns_api.get(instance_name)

            del self.__cached[instance_name]
            ns_api.stop_tunnel(instance_name)
            ns_api.stop(instance_name)

    def terminate_nodes(self, node_ids):
        for node_id in node_ids:
            self.terminate_node(node_id)

    def get_command_runner(
        self,
        log_prefix,
        node_id,
        auth_config,
        cluster_name,
        process_runner,
        use_internal_ip,
        docker_config=None,
    ):
        instance_name = self.cluster_name

        # initialize the connection
        self._connect_kubeapi(instance_name)

        command_runner = StaroidCommandRunner(
            log_prefix,
            self.namespace,
            node_id,
            auth_config,
            process_runner,
            self.__cached[cluster_name]["api_server"],
        )
        return command_runner

    @staticmethod
    def bootstrap_config(cluster_config):
        """Bootstraps the cluster config by adding env defaults if needed."""
        return cluster_config
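For reference, a minimal sketch of the provider_config dictionary that StaroidNodeProvider consumes. Only keys actually read by __init__ and create_node above are listed; every value is a placeholder (the project string reuses the commit URL format from the test above, and "ske" / "ske_region" may instead come from the STAROID_SKE / STAROID_SKE_REGION environment variables):

# All values below are placeholders, not working credentials.
provider_config = {
    "access_token": "<staroid-access-token>",
    "account": "<staroid-account>",
    "ske": "<ske-name>",        # falls back to the STAROID_SKE env var
    "ske_region": "<region>",   # falls back to STAROID_SKE_REGION
    "project": "GITHUB/staroids/namespace:master",  # passed to ns_api.create()
    "image_from_project": True,  # pick the ray image from the project deployment
    "python_version": "3.7",     # matched against the candidate image names
}
provider = StaroidNodeProvider(provider_config, cluster_name="ray-example")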
# Imports this excerpt needs; Staroid comes from the staroid client library.
import os
import time
from pathlib import Path

from staroid import Staroid


class Ods:
    def __init__(self, staroid=None, ske=None, cache_dir=None):
        self.__ske = None

        if staroid is None:
            self._staroid = Staroid()
        else:
            self._staroid = staroid

        if cache_dir is None:
            self.__cache_dir = "{}/.ods".format(str(Path.home()))
        else:
            self.__cache_dir = cache_dir

        # configure from env var
        if "STAROID_SKE" in os.environ:
            self.__ske = os.environ["STAROID_SKE"]

        # configure from args, which take precedence over env vars
        if ske is not None:
            self.__ske = ske

    def create_or_get_cache_dir(self, module=""):
        "create (if not exists) or return the cache dir path for a module"
        cache_dir = "{}/{}".format(self.__cache_dir, module)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        return cache_dir

    def download_chisel_if_not_exists(self):
        return self._staroid.get_chisel_path()

    def _start_instance_on_staroid(self, instance_name, commit_url):
        cluster = self._staroid.cluster().get(self.__ske)
        if cluster is None:
            raise Exception("Can't get the SKE cluster")

        ns_api = self._staroid.namespace(cluster)
        ns = ns_api.create(instance_name, commit_url)
        if ns is None:
            raise Exception("Can't create the instance")

        # if the instance is stopped, restart it
        if ns.status() == "PAUSE":
            ns_api.start(instance_name)

        # wait for the phase to become RUNNING
        return self.__wait_for_ns_phase(ns_api, ns, "RUNNING", 600)

    def _start_tunnel(self, instance_name, tunnels):
        cluster = self._staroid.cluster().get(self.__ske)
        if cluster is None:
            raise Exception("Can't get the SKE cluster")

        ns_api = self._staroid.namespace(cluster)
        ns = ns_api.get(instance_name)
        ns_api.shell_start(instance_name)
        ns_api.start_tunnel(instance_name, tunnels)

    def _stop_tunnel(self, instance_name):
        cluster = self._staroid.cluster().get(self.__ske)
        if cluster is None:
            raise Exception("Can't get the SKE cluster")

        ns_api = self._staroid.namespace(cluster)
        ns_api.stop_tunnel(instance_name)
        ns_api.shell_stop(instance_name)

    def _stop_instance_on_staroid(self, instance_name):
        cluster = self._staroid.cluster().get(self.__ske)
        if cluster is None:
            raise Exception("Can't get the SKE cluster")

        ns_api = self._staroid.namespace(cluster)
        ns = ns_api.stop(instance_name)
        ns = self.__wait_for_ns_phase(ns_api, ns, "PAUSED", 600)
        return ns

    def _delete_instance_on_staroid(self, instance_name):
        cluster = self._staroid.cluster().get(self.__ske)
        if cluster is None:
            raise Exception("Can't get the SKE cluster")

        ns_api = self._staroid.namespace(cluster)
        ns = ns_api.delete(instance_name)
        ns = self.__wait_for_ns_phase(ns_api, ns, "REMOVED", 600)
        return ns

    def __wait_for_ns_phase(self, ns_api, ns, phase, timeout):
        start_time = time.time()
        sleep_time = 1
        max_sleep_time = 7
        while ns.phase() != phase:
            if time.time() - start_time > timeout:
                raise Exception("Timeout")

            # sleep with a linear backoff, capped at max_sleep_time
            time.sleep(sleep_time)
            if sleep_time < max_sleep_time:
                sleep_time += 1

            # check the phase again
            ns = ns_api.get_by_id(ns.id())
        return ns
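A short usage sketch of the Ods class above; the SKE and module names are placeholders, and the class otherwise assumes the STAROID_SKE environment variable is set:

ods = Ods(ske="<ske-name>")  # or rely on the STAROID_SKE env var
cache_dir = ods.create_or_get_cache_dir("mymodule")  # e.g. ~/.ods/mymodule
chisel_path = ods.download_chisel_if_not_exists()  # fetch chisel via staroid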