class GkeCluster(KubernetesCluster):
    """Kubernetes cluster that is created and managed on GKE via ``gcloud``.

    The cluster is created together with its ``default-pool`` node pool
    (``AUXILIARY_POOL_NAME``); additional node pools are registered and
    deployed through :meth:`deploy_node_pool`.  An ``ApiCallRateLimiter`` is
    started in ``__init__`` and stopped in :meth:`destroy`.
    """

    AUXILIARY_POOL_NAME = 'default-pool'  # This is default pool that is deployed with the cluster
    POOL_LABEL_NAME = 'cloud.google.com/gke-nodepool'

    pools: Dict[str, GkeNodePool]

    def __init__(self,
                 gke_cluster_version,
                 gke_k8s_release_channel,
                 gce_image_type,
                 gce_image_size,
                 gce_network,
                 services,
                 gce_instance_type='n1-standard-4',
                 user_prefix=None,
                 params=None,
                 gce_datacenter=None,
                 cluster_uuid=None,
                 n_nodes=1
                 ):
        """Store the GKE/GCE settings and start the API call rate limiter.

        Args:
            gke_cluster_version: value passed to ``--cluster-version``.
            gke_k8s_release_channel: release channel name; surrounding
                whitespace is stripped.  When empty, ``deploy`` disables
                autoupgrade and autorepair instead of passing
                ``--release-channel``.
            gce_image_type: value passed to ``--disk-type``.
            gce_image_size: value passed to ``--disk-size``.
            gce_network: value passed to ``--network``.
            services: sequence whose first element provides ``.project``
                (GCE project) and ``.key`` (user bound to cluster-admin).
            gce_instance_type: value passed to ``--machine-type``.
            user_prefix: forwarded to the base class.
            params: forwarded to the base class.
            gce_datacenter: sequence whose first element is used as the zone.
                NOTE(review): the default is ``None`` but the value is
                indexed unconditionally, so callers must always pass it.
            cluster_uuid: forwarded to the base class.
            n_nodes: number of nodes in the default pool.
        """
        super().__init__(
            params=params,
            cluster_uuid=cluster_uuid,
            user_prefix=user_prefix
        )
        self.gke_cluster_version = gke_cluster_version
        self.gke_k8s_release_channel = gke_k8s_release_channel.strip()
        self.gce_image_type = gce_image_type
        self.gce_image_size = gce_image_size
        self.gce_network = gce_network
        self.gce_services = services
        self.gce_instance_type = gce_instance_type
        self.n_nodes = n_nodes
        self.gce_project = services[0].project
        self.gce_user = services[0].key
        self.gce_zone = gce_datacenter[0]
        self.gke_cluster_created = False

        self.api_call_rate_limiter = ApiCallRateLimiter(
            rate_limit=GKE_API_CALL_RATE_LIMIT,
            queue_size=GKE_API_CALL_QUEUE_SIZE,
            urllib_retry=GKE_URLLIB_RETRY,
            urllib_backoff_factor=GKE_URLLIB_BACKOFF_FACTOR,
        )
        self.api_call_rate_limiter.start()

    def __str__(self):
        """Return a one-line description: class name, cluster name, zone and version."""
        return f"{type(self).__name__} {self.name} | Zone: {self.gce_zone} | Version: {self.gke_cluster_version}"

    def deploy(self):
        """Create the GKE cluster, patch the kubectl config and set up RBAC.

        The default pool that GKE creates with the cluster is registered via
        ``deploy_node_pool`` with ``is_deployed=True`` so it is tracked in
        ``self.pools`` without being created a second time.
        """
        LOGGER.info("Create GKE cluster `%s' with %d node(s) in %s",
                    self.short_cluster_name, self.n_nodes, self.AUXILIARY_POOL_NAME)
        tags = ",".join(f"{key}={value}" for key, value in self.tags.items())
        with self.gcloud as gcloud:
            # NOTE: only static K8S release channel supports disabling of autoupgrade
            gcloud.run(f"container --project {self.gce_project} clusters create {self.short_cluster_name}"
                       f" --no-enable-basic-auth"
                       f" --zone {self.gce_zone}"
                       f" --cluster-version {self.gke_cluster_version}"
                       f"{' --release-channel ' + self.gke_k8s_release_channel if self.gke_k8s_release_channel else ''}"
                       f" --network {self.gce_network}"
                       f" --num-nodes {self.n_nodes}"
                       f" --machine-type {self.gce_instance_type}"
                       f" --image-type UBUNTU"
                       f" --disk-type {self.gce_image_type}"
                       f" --disk-size {self.gce_image_size}"
                       f" --enable-stackdriver-kubernetes"
                       f"{'' if self.gke_k8s_release_channel else ' --no-enable-autoupgrade'}"
                       f"{'' if self.gke_k8s_release_channel else ' --no-enable-autorepair'}"
                       f" --metadata {tags}")
            self.patch_kubectl_config()
            self.deploy_node_pool(GkeNodePool(
                name=self.AUXILIARY_POOL_NAME,
                num_nodes=self.n_nodes,
                disk_size=self.gce_image_size,
                disk_type=self.gce_image_type,
                k8s_cluster=self,
                instance_type=self.gce_instance_type,
                is_deployed=True
            ))

        LOGGER.info("Setup RBAC for GKE cluster `%s'", self.name)
        self.kubectl("create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin "
                     f"--user {self.gce_user}")

    @cached_property
    def gcloud(self) -> GcloudContextManager:
        """Return the ``gcloud`` object of the tester's localhost (cached per instance)."""
        return cluster.Setup.tester_obj().localhost.gcloud

    def deploy_node_pool(self, pool: GkeNodePool, wait_till_ready=True) -> None:
        """Register ``pool`` and deploy it unless it is already deployed.

        Args:
            pool: node pool to register in this cluster.
            wait_till_ready: when true, deploy the pool and wait for it while
                the API call rate limiter is paused, then wait for the API to
                become stable; otherwise only trigger the deployment.
        """
        self._add_pool(pool)
        if pool.is_deployed:
            return
        LOGGER.info("Create %s pool with %d node(s) in GKE cluster `%s'", pool.name, pool.num_nodes, self.name)
        if wait_till_ready:
            with self.api_call_rate_limiter.pause:
                pool.deploy_and_wait_till_ready()
                self.api_call_rate_limiter.wait_till_api_become_stable(self)
        else:
            pool.deploy()

    def wait_all_node_pools_to_be_ready(self):
        """Wait for all node pools with the rate limiter paused, then for a stable API."""
        with self.api_call_rate_limiter.pause:
            super().wait_all_node_pools_to_be_ready()
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def resize_node_pool(self, name: str, num_nodes: int) -> None:
        """Resize the registered pool ``name`` to ``num_nodes`` nodes.

        Raises:
            KeyError: if no pool called ``name`` is registered in ``self.pools``.
        """
        with self.api_call_rate_limiter.pause:
            self.pools[name].resize(num_nodes)
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def get_instance_group_name_for_pool(self, pool_name: str, default=None) -> str:
        """Return the name of the first instance group backing ``pool_name``.

        The name is the last path component of the first entry of
        ``instanceGroupUrls`` in the ``gcloud ... node-pools describe`` output.

        Args:
            pool_name: name of the node pool to describe.
            default: value returned instead of raising when the lookup fails;
                ``None`` means "raise".

        Raises:
            RuntimeError: if the lookup fails and ``default`` is ``None``; the
                original error is attached as ``__cause__``.
        """
        try:
            # safe_load: the describe output is plain YAML, and calling
            # yaml.load() without an explicit Loader is rejected by PyYAML 6.
            group_link = yaml.safe_load(
                self.gcloud.run(
                    f'container node-pools describe {pool_name} '
                    f'--zone {self.gce_zone} --project {self.gce_project} '
                    f'--cluster {self.short_cluster_name}')
            ).get('instanceGroupUrls')[0]
            return group_link.split('/')[-1]
        except Exception as exc:
            # Deliberately broad: any failure (command, parsing, missing key)
            # is either replaced by the caller's default or reported uniformly.
            if default is not None:
                return default
            raise RuntimeError(f"Can't get instance group name due to the: {exc}") from exc

    def delete_instance_that_belong_to_instance_group(self, group_name: str, instance_name: str):
        """Delete ``instance_name`` from the managed instance group ``group_name``."""
        self.gcloud.run(f'compute instance-groups managed delete-instances {group_name} '
                        f'--zone={self.gce_zone} --instances={instance_name}')

    def create_token_update_thread(self):
        """Return a ``GcloudTokenUpdateThread`` bound to this cluster's kubectl token path."""
        return GcloudTokenUpdateThread(self.gcloud, self.kubectl_token_path)

    def create_kubectl_config(self):
        """Fetch the cluster credentials with ``gcloud container clusters get-credentials``."""
        self.gcloud.run(f"container clusters get-credentials {self.short_cluster_name} --zone {self.gce_zone}")

    def destroy(self):
        """Stop the API call rate limiter and the token update thread."""
        self.api_call_rate_limiter.stop()
        self.stop_token_update_thread()

    def deploy_scylla_manager(self, pool_name: str = None) -> None:
        """Deploy the MinIO S3 backend and then delegate to the base implementation."""
        self.deploy_minio_s3_backend()
        super().deploy_scylla_manager(pool_name=pool_name)
class GkeCluster(KubernetesCluster):
    """GKE-backed Kubernetes cluster driven through the ``gcloud`` wrapper.

    Creating the cluster also creates its ``default-pool`` node pool
    (``AUXILIARY_POOL_NAME``).  The API call rate limiter is started by the
    constructor and stopped by :meth:`destroy`.
    """

    AUXILIARY_POOL_NAME = 'default-pool'  # This is default pool that is deployed with the cluster
    POOL_LABEL_NAME = 'cloud.google.com/gke-nodepool'
    IS_NODE_TUNING_SUPPORTED = True

    pools: Dict[str, GkeNodePool]

    # pylint: disable=too-many-arguments
    def __init__(
        self,
        gke_cluster_version,
        gke_k8s_release_channel,
        gce_image_type,
        gce_image_size,
        gce_network,
        services,
        gce_instance_type='n1-standard-2',
        user_prefix=None,
        params=None,
        gce_datacenter=None,
        cluster_uuid=None,
        n_nodes=2,
    ):
        """Remember the GKE/GCE settings and start the API call rate limiter.

        ``services[0]`` supplies the GCE project (``.project``) and the user
        that is later bound to cluster-admin (``.key``); ``gce_datacenter[0]``
        is used as the zone.  ``params``, ``cluster_uuid`` and ``user_prefix``
        are forwarded to the base class.
        """
        super().__init__(params=params, cluster_uuid=cluster_uuid, user_prefix=user_prefix)

        self.gke_cluster_version = gke_cluster_version
        self.gke_k8s_release_channel = gke_k8s_release_channel.strip()
        self.gce_image_type = gce_image_type
        self.gce_image_size = gce_image_size
        self.gce_network = gce_network
        self.gce_services = services
        self.gce_instance_type = gce_instance_type
        self.n_nodes = n_nodes

        primary_service = services[0]
        self.gce_project = primary_service.project
        self.gce_user = primary_service.key
        self.gce_zone = gce_datacenter[0]
        self.gke_cluster_created = False

        rate_limiter = ApiCallRateLimiter(
            rate_limit=GKE_API_CALL_RATE_LIMIT,
            queue_size=GKE_API_CALL_QUEUE_SIZE,
            urllib_retry=GKE_URLLIB_RETRY,
            urllib_backoff_factor=GKE_URLLIB_BACKOFF_FACTOR,
        )
        self.api_call_rate_limiter = rate_limiter
        rate_limiter.start()

    @cached_property
    def allowed_labels_on_scylla_node(self) -> list:
        """Return the ``(label, value)`` pairs of pods allowed on Scylla nodes.

        The performance tuning pod labels are included only when performance
        tuning is enabled; with tuning disabled those pods must not exist.
        """
        labels = [
            ('name', 'cpu-policy'),
            ('app', 'local-volume-provisioner'),
            ('name', 'raid-local-disks'),
            ('k8s-app', 'fluentbit-gke'),
            ('k8s-app', 'gke-metrics-agent'),
            ('component', 'kube-proxy'),
            ('k8s-app', 'gcp-compute-persistent-disk-csi-driver'),
            ('scylla/cluster', self.k8s_scylla_cluster_name),
        ]
        if not self.is_performance_tuning_enabled:
            return labels
        labels.extend(self.perf_pods_labels)
        return labels

    def __str__(self):
        """Describe the cluster as ``<class> <name> | Zone: <zone> | Version: <version>``."""
        parts = (
            f"{type(self).__name__} {self.name}",
            f"Zone: {self.gce_zone}",
            f"Version: {self.gke_cluster_version}",
        )
        return " | ".join(parts)

    def deploy(self):
        """Create the cluster, patch the kubectl config and grant cluster-admin.

        The default pool created by GKE is registered with ``is_deployed=True``
        so that it is tracked without being deployed again.
        """
        LOGGER.info(
            "Create GKE cluster `%s' with %d node(s) in %s",
            self.short_cluster_name, self.n_nodes, self.AUXILIARY_POOL_NAME)
        tags = ",".join(f"{tag}={tag_value}" for tag, tag_value in self.tags.items())

        with self.gcloud as gcloud:
            # NOTE: only static K8S release channel supports disabling of autoupgrade,
            #       so the "--no-enable-*" flags are used only without a release channel.
            release_channel = self.gke_k8s_release_channel
            if release_channel:
                release_channel_opt = ' --release-channel ' + release_channel
                no_auto_maintenance_opts = ''
            else:
                release_channel_opt = ''
                no_auto_maintenance_opts = ' --no-enable-autoupgrade' + ' --no-enable-autorepair'

            create_cluster_cmd = (
                f"container --project {self.gce_project} clusters create {self.short_cluster_name}"
                " --no-enable-basic-auth"
                f" --zone {self.gce_zone}"
                f" --cluster-version {self.gke_cluster_version}"
                f"{release_channel_opt}"
                f" --network {self.gce_network}"
                f" --num-nodes {self.n_nodes}"
                f" --machine-type {self.gce_instance_type}"
                " --image-type UBUNTU"
                f" --disk-type {self.gce_image_type}"
                f" --disk-size {self.gce_image_size}"
                " --enable-stackdriver-kubernetes"
                f"{no_auto_maintenance_opts}"
                f" --metadata {tags}"
            )
            gcloud.run(create_cluster_cmd)
            self.patch_kubectl_config()

            default_pool = GkeNodePool(
                name=self.AUXILIARY_POOL_NAME,
                num_nodes=self.n_nodes,
                disk_size=self.gce_image_size,
                disk_type=self.gce_image_type,
                k8s_cluster=self,
                instance_type=self.gce_instance_type,
                is_deployed=True,
            )
            self.deploy_node_pool(default_pool)

        LOGGER.info("Setup RBAC for GKE cluster `%s'", self.name)
        rbac_cmd = (
            "create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin "
            f"--user {self.gce_user}"
        )
        self.kubectl(rbac_cmd)

    @cached_property
    def gcloud(self) -> GcloudContextManager:  # pylint: disable=no-self-use
        """Return the ``gcloud`` object of the tester's localhost (cached per instance)."""
        tester = self.test_config.tester_obj()
        return tester.localhost.gcloud

    def deploy_node_pool(self, pool: GkeNodePool, wait_till_ready=True) -> None:
        """Register ``pool`` and, unless it is already deployed, deploy it.

        With ``wait_till_ready`` the deployment is awaited while the rate
        limiter is paused and is followed by a wait for a stable API;
        otherwise the deployment is only triggered.
        """
        self._add_pool(pool)
        if pool.is_deployed:
            return

        LOGGER.info(
            "Create %s pool with %d node(s) in GKE cluster `%s'",
            pool.name, pool.num_nodes, self.name)
        if not wait_till_ready:
            pool.deploy()
            return

        limiter = self.api_call_rate_limiter
        with limiter.pause:
            pool.deploy_and_wait_till_ready()
            limiter.wait_till_api_become_stable(self)

    def wait_all_node_pools_to_be_ready(self):
        """Run the base-class wait with the rate limiter paused, then wait for a stable API."""
        limiter = self.api_call_rate_limiter
        with limiter.pause:
            super().wait_all_node_pools_to_be_ready()
            limiter.wait_till_api_become_stable(self)

    def resize_node_pool(self, name: str, num_nodes: int) -> None:
        """Resize the registered pool ``name`` to ``num_nodes`` and wait for a stable API."""
        limiter = self.api_call_rate_limiter
        with limiter.pause:
            target_pool = self.pools[name]
            target_pool.resize(num_nodes)
            limiter.wait_till_api_become_stable(self)

    def get_instance_group_name_for_pool(self, pool_name: str, default=None) -> str:
        """Return the instance group name of ``pool_name``.

        It is the last path component of the first ``instanceGroupUrls`` entry
        reported by ``gcloud ... node-pools describe``.  On any failure the
        non-``None`` ``default`` is returned, otherwise ``RuntimeError`` is
        raised with the original error chained as its cause.
        """
        try:
            describe_cmd = (
                f'container node-pools describe {pool_name} '
                f'--zone {self.gce_zone} --project {self.gce_project} '
                f'--cluster {self.short_cluster_name}'
            )
            pool_description = yaml.safe_load(self.gcloud.run(describe_cmd))
            first_group_url = pool_description.get('instanceGroupUrls')[0]
            return first_group_url.split('/')[-1]
        except Exception as exc:
            if default is None:
                raise RuntimeError(
                    f"Can't get instance group name due to the: {exc}") from exc
            return default

    def delete_instance_that_belong_to_instance_group(self, group_name: str, instance_name: str):
        """Remove ``instance_name`` from the managed instance group ``group_name``."""
        delete_cmd = (
            f'compute instance-groups managed delete-instances {group_name} '
            f'--zone={self.gce_zone} --instances={instance_name}'
        )
        self.gcloud.run(delete_cmd)

    def create_token_update_thread(self):
        """Build a ``GcloudTokenUpdateThread`` for this cluster's kubectl token path."""
        return GcloudTokenUpdateThread(self.gcloud, self.kubectl_token_path)

    def create_kubectl_config(self):
        """Fetch cluster credentials through ``gcloud container clusters get-credentials``."""
        get_credentials_cmd = (
            f"container clusters get-credentials {self.short_cluster_name} --zone {self.gce_zone}"
        )
        self.gcloud.run(get_credentials_cmd)

    def destroy(self):
        """Stop the API call rate limiter, then the token update thread."""
        self.api_call_rate_limiter.stop()
        self.stop_token_update_thread()

    def deploy_scylla_manager(self, pool_name: str = None) -> None:
        """Deploy the MinIO S3 backend before the base-class Scylla Manager deployment."""
        self.deploy_minio_s3_backend()
        super().deploy_scylla_manager(pool_name=pool_name)

    # NOTE: blocked by https://github.com/scylladb/scylla-operator/issues/760
    def upgrade_kubernetes_platform(self) -> str:
        """Upgrade the control plane and the auxiliary/Scylla node pools by one minor version.

        Returns:
            The target version string in the ``1.<minor + 1>`` form.
        """
        # NOTE: 'self.gke_cluster_version' can be like 1.21.3-gke.N or 1.21
        current_minor = int(self.gke_cluster_version.split('.')[1])
        upgrade_version = f"1.{current_minor + 1}"

        with self.gcloud as gcloud:
            upgrade_cmd = f"container clusters upgrade {self.short_cluster_name}"
            common_opts = f"--quiet --project {self.gce_project} --zone {self.gce_zone}"

            # Upgrade control plane (API, scheduler, manager and so on ...)
            LOGGER.info("Upgrading K8S control plane to the '%s' version", upgrade_version)
            gcloud.run(f"{upgrade_cmd} --master {common_opts} --cluster-version {upgrade_version}")

            # Upgrade scylla-related node pools
            for node_pool in (self.AUXILIARY_POOL_NAME, self.SCYLLA_POOL_NAME):
                LOGGER.info("Upgrading '%s' node pool to the '%s' version", node_pool, upgrade_version)
                # NOTE: one node upgrade takes about 10 minutes
                gcloud.run(f"{upgrade_cmd} {common_opts} --node-pool={node_pool}")
        return upgrade_version
class GkeCluster(KubernetesCluster, cluster.BaseCluster):
    """GKE cluster that is created on the first ``add_nodes`` call.

    Authentication of ``kubectl`` is done through a token cache file that is
    kept up to date by a ``GcloudTokenUpdateThread``; see
    :meth:`patch_kube_config`.
    """

    def __init__(self,
                 gke_cluster_version,
                 gce_image_type,
                 gce_image_size,
                 gce_network,
                 services,
                 credentials,
                 gce_n_local_ssd=0,
                 gce_instance_type="n1-highmem-8",
                 n_nodes=3,
                 user_prefix=None,
                 params=None,
                 gce_datacenter=None):
        """Store the GKE/GCE settings, start the rate limiter and init the base classes.

        Args:
            gke_cluster_version: value passed to ``--cluster-version``.
            gce_image_type: value passed to ``--disk-type``.
            gce_image_size: value passed to ``--disk-size``.
            gce_network: value passed to ``--network``.
            services: sequence whose first element provides ``.project``
                (GCE project) and ``.key`` (user bound to cluster-admin).
            credentials: stored as ``self.credentials``.
            gce_n_local_ssd: number of local SSDs; converted to ``int``,
                any falsy value becomes ``0``.
            gce_instance_type: value passed to ``--machine-type``.
            n_nodes: forwarded to the base class.
            user_prefix: prefix used for the cluster and node name prefixes.
            params: forwarded to the base class.
            gce_datacenter: sequence forwarded as ``region_names``; its first
                element is used as the zone.
                NOTE(review): the default is ``None`` but the value is
                indexed unconditionally, so callers must always pass it.
        """
        cluster_prefix = cluster.prepend_user_prefix(user_prefix, "k8s-gke")
        node_prefix = cluster.prepend_user_prefix(user_prefix, "node")
        # NOTE(review): attributes are assigned before super().__init__();
        # presumably the base initializer already needs them - confirm before reordering.
        self._gcloud_token_thread = None
        self.gke_cluster_version = gke_cluster_version
        self.gce_image_type = gce_image_type
        self.gce_image_size = gce_image_size
        self.gce_network = gce_network
        self.gce_services = services
        self.credentials = credentials
        self.gce_instance_type = gce_instance_type
        self.gce_n_local_ssd = int(gce_n_local_ssd) if gce_n_local_ssd else 0

        self.gce_project = services[0].project
        self.gce_user = services[0].key
        self.gce_zone = gce_datacenter[0]
        self.gke_cluster_created = False

        self.api_call_rate_limiter = ApiCallRateLimiter(
            rate_limit=GKE_API_CALL_RATE_LIMIT,
            queue_size=GKE_API_CALL_QUEUE_SIZE,
            urllib_retry=GKE_URLLIB_RETRY
        )
        self.api_call_rate_limiter.start()

        super().__init__(cluster_prefix=cluster_prefix,
                         node_prefix=node_prefix,
                         n_nodes=n_nodes,
                         params=params,
                         region_names=gce_datacenter,
                         node_type="scylla-db")

    @cached_property
    def gke_cluster_name(self):
        """Return the cluster name shortened with ``shorten_cluster_name(self.name, 40)``."""
        return shorten_cluster_name(self.name, 40)

    def __str__(self):
        """Return a one-line description: class name, cluster name, zone and version."""
        return f"{type(self).__name__} {self.name} | Zone: {self.gce_zone} | Version: {self.gke_cluster_version}"

    def add_nodes(self, count, ec2_user_data='', dc_idx=0, rack=0, enable_auto_bootstrap=False):
        """Create the GKE cluster with ``count`` nodes on the first call.

        Only ``count`` is used; the other parameters are accepted but ignored.

        Raises:
            NotImplementedError: on any call after the cluster was created.
        """
        if not self.gke_cluster_created:
            self.setup_gke_cluster(num_nodes=count)
            self.gke_cluster_created = True
        else:
            raise NotImplementedError

    @property
    def gcloud(self) -> GcloudContextManager:
        """Return the ``gcloud`` object of the tester's localhost."""
        return cluster.Setup.tester_obj().localhost.gcloud

    def setup_gke_cluster(self, num_nodes: int) -> None:
        """Create the cluster and its operator pool, then prepare it for use.

        Steps: create the cluster with ``num_nodes`` nodes in the default
        pool, create a one-node ``operator-pool``, fetch credentials, start
        the token update thread, patch the kube config, bind the GCE user to
        cluster-admin and install the RAID and CPU policy DaemonSets and the
        local volume provisioner.
        """
        LOGGER.info("Create GKE cluster `%s' with %d node(s) in default-pool and 1 node in operator-pool",
                    self.gke_cluster_name, num_nodes)
        tags = ",".join(f"{key}={value}" for key, value in self.tags.items())
        with self.gcloud as gcloud:
            gcloud.run(f"container --project {self.gce_project} clusters create {self.gke_cluster_name}"
                       f" --zone {self.gce_zone}"
                       f" --cluster-version {self.gke_cluster_version}"
                       " --username admin"
                       f" --network {self.gce_network}"
                       f" --num-nodes {num_nodes}"
                       f" --machine-type {self.gce_instance_type}"
                       " --image-type UBUNTU"
                       f" --disk-type {self.gce_image_type}"
                       f" --disk-size {self.gce_image_size}"
                       f" --local-ssd-count {self.gce_n_local_ssd}"
                       " --node-taints role=scylla-clusters:NoSchedule"
                       " --enable-stackdriver-kubernetes"
                       " --no-enable-autoupgrade"
                       " --no-enable-autorepair"
                       f" --metadata {tags}")
            gcloud.run(f"container --project {self.gce_project} node-pools create operator-pool"
                       f" --zone {self.gce_zone}"
                       f" --cluster {self.gke_cluster_name}"
                       " --num-nodes 1"
                       " --machine-type n1-standard-4"
                       " --image-type UBUNTU"
                       " --disk-type pd-ssd"
                       " --disk-size 20"
                       " --no-enable-autoupgrade"
                       " --no-enable-autorepair")

            LOGGER.info("Get credentials for GKE cluster `%s'", self.name)
            gcloud.run(f"container clusters get-credentials {self.gke_cluster_name} --zone {self.gce_zone}")
        self.start_gcloud_token_update_thread()
        self.patch_kube_config()

        LOGGER.info("Setup RBAC for GKE cluster `%s'", self.name)
        self.kubectl("create clusterrolebinding cluster-admin-binding"
                     " --clusterrole cluster-admin"
                     f" --user {self.gce_user}")

        LOGGER.info("Install RAID DaemonSet to GKE cluster `%s'", self.name)
        self.apply_file(RAID_DAEMONSET, envsubst=False)

        LOGGER.info("Install CPU policy DaemonSet to GKE cluster `%s'", self.name)
        self.apply_file(CPU_POLICY_DAEMONSET, envsubst=False)

        LOGGER.info("Install local volume provisioner to GKE cluster `%s'", self.name)
        self.helm("install local-provisioner provisioner")

    def add_gke_pool(self, name: str, num_nodes: int, instance_type: str) -> None:
        """Create node pool ``name`` and wait until all cluster nodes are Ready.

        The pool is always tainted with ``role=sct-loaders:NoSchedule``.

        Args:
            name: name of the node pool to create.
            num_nodes: number of nodes in the pool.
            instance_type: value passed to ``--machine-type``.
        """
        # Log the pool that is actually being created instead of a fixed name.
        LOGGER.info("Create %s pool with %d node(s) in GKE cluster `%s'", name, num_nodes, self.name)
        with self.api_call_rate_limiter.pause:
            self.gcloud.run(f"container --project {self.gce_project} node-pools create {name}"
                            f" --zone {self.gce_zone}"
                            f" --cluster {self.gke_cluster_name}"
                            f" --num-nodes {num_nodes}"
                            f" --machine-type {instance_type}"
                            " --image-type UBUNTU"
                            " --node-taints role=sct-loaders:NoSchedule"
                            " --no-enable-autoupgrade"
                            " --no-enable-autorepair")
            self.kubectl('wait --timeout=15m --all --for=condition=Ready node')

    def get_kubectl_config_for_user(self, config, username):
        """Return the auth-provider config of ``username`` from a parsed kube config.

        Args:
            config: parsed kube config mapping with a ``users`` list.
            username: value of the ``name`` key of the wanted user entry.

        Returns:
            The ``user -> auth-provider -> config`` mapping of the first
            matching entry, or ``None`` when no entry matches.
        """
        for user in config["users"]:
            if user["name"] == username:
                return user["user"]["auth-provider"]["config"]
        return None

    @cached_property
    def gcloud_token_path(self):
        """Return the path of the token cache file, ``<logdir>/gcloud.output``."""
        return os.path.join(self.logdir, 'gcloud.output')

    def start_gcloud_token_update_thread(self):
        """Start the token update thread and wait for the token file to appear."""
        self._gcloud_token_thread = GcloudTokenUpdateThread(self.gcloud, self.gcloud_token_path)
        self._gcloud_token_thread.start()
        # Wait till GcloudTokenUpdateThread get tokens and dump them to gcloud_token_path
        wait_for(os.path.exists, timeout=30, step=5, text="Wait for gcloud token", throw_exc=True,
                 path=self.gcloud_token_path)

    def patch_kube_config(self) -> None:
        """Make kubectl read the auth token from the cache file instead of running gcloud.

        Raises:
            RuntimeError: if the kube config has no entry for this cluster's user.
        """
        # It assumes that config is already created by gcloud
        # It patches kube config so that instead of running gcloud each time
        # we will get its output from the cache file located at gcloud_token_path
        # To keep this cache file updated we run GcloudTokenUpdateThread thread
        kube_config_path = os.path.expanduser(os.environ.get('KUBECONFIG', '~/.kube/config'))
        user_name = f"gke_{self.gce_project}_{self.gce_zone}_{self.gke_cluster_name}"
        LOGGER.debug("Patch %s to use dockerized gcloud for auth against GKE cluster `%s'",
                     kube_config_path, self.name)

        with open(kube_config_path) as kube_config:
            data = yaml.safe_load(kube_config)
        user_config = self.get_kubectl_config_for_user(data, user_name)

        if user_config is None:
            # Report the file that was actually read; it may come from $KUBECONFIG.
            raise RuntimeError(f"Unable to find configuration for `{user_name}' in {kube_config_path}")

        user_config["cmd-args"] = self.gcloud_token_path
        user_config["cmd-path"] = "cat"

        with open(kube_config_path, "w") as kube_config:
            yaml.safe_dump(data, kube_config)

        self.log.debug(f'Patched kubectl config at {kube_config_path} '
                       f'with static gcloud config from {self.gcloud_token_path}')

    @cluster.wait_for_init_wrap
    def wait_for_init(self):
        """Log the cluster's nodes and pods, then wait for all default-namespace pods to be Ready."""
        LOGGER.info("--- List of nodes in GKE cluster `%s': ---\n%s\n", self.name, self.kubectl("get nodes").stdout)
        LOGGER.info("--- List of pods in GKE cluster `%s': ---\n%s\n", self.name, self.kubectl("get pods -A").stdout)

        LOGGER.info("Wait for readiness of all pods in default namespace...")
        # The command timeout is slightly longer than kubectl's own 15m wait timeout.
        self.kubectl("wait --timeout=15m --all --for=condition=Ready pod", timeout=15*60+10)

    def destroy(self):
        """Stop the API call rate limiter and, if it was started, the token update thread."""
        self.api_call_rate_limiter.stop()
        if self._gcloud_token_thread:
            self._gcloud_token_thread.stop()