Example #1
class GkeCluster(KubernetesCluster):
    AUXILIARY_POOL_NAME = 'default-pool'  # The default pool that is deployed together with the cluster
    POOL_LABEL_NAME = 'cloud.google.com/gke-nodepool'
    pools: Dict[str, GkeNodePool]

    def __init__(self,
                 gke_cluster_version,
                 gke_k8s_release_channel,
                 gce_image_type,
                 gce_image_size,
                 gce_network,
                 services,
                 gce_instance_type='n1-standard-4',
                 user_prefix=None,
                 params=None,
                 gce_datacenter=None,
                 cluster_uuid=None,
                 n_nodes=1
                 ):
        super().__init__(
            params=params,
            cluster_uuid=cluster_uuid,
            user_prefix=user_prefix
        )
        self.gke_cluster_version = gke_cluster_version
        self.gke_k8s_release_channel = gke_k8s_release_channel.strip()
        self.gce_image_type = gce_image_type
        self.gce_image_size = gce_image_size
        self.gce_network = gce_network
        self.gce_services = services
        self.gce_instance_type = gce_instance_type
        self.n_nodes = n_nodes
        self.gce_project = services[0].project
        self.gce_user = services[0].key
        self.gce_zone = gce_datacenter[0]
        self.gke_cluster_created = False

        self.api_call_rate_limiter = ApiCallRateLimiter(
            rate_limit=GKE_API_CALL_RATE_LIMIT,
            queue_size=GKE_API_CALL_QUEUE_SIZE,
            urllib_retry=GKE_URLLIB_RETRY,
            urllib_backoff_factor=GKE_URLLIB_BACKOFF_FACTOR,
        )
        self.api_call_rate_limiter.start()

    def __str__(self):
        return f"{type(self).__name__} {self.name} | Zone: {self.gce_zone} | Version: {self.gke_cluster_version}"

    def deploy(self):
        LOGGER.info("Create GKE cluster `%s' with %d node(s) in %s",
                    self.short_cluster_name, self.n_nodes, self.AUXILIARY_POOL_NAME)
        tags = ",".join(f"{key}={value}" for key, value in self.tags.items())
        with self.gcloud as gcloud:
            # NOTE: only the static K8S release channel supports disabling autoupgrade
            gcloud.run(f"container --project {self.gce_project} clusters create {self.short_cluster_name}"
                       f" --no-enable-basic-auth"
                       f" --zone {self.gce_zone}"
                       f" --cluster-version {self.gke_cluster_version}"
                       f"{' --release-channel ' + self.gke_k8s_release_channel if self.gke_k8s_release_channel else ''}"
                       f" --network {self.gce_network}"
                       f" --num-nodes {self.n_nodes}"
                       f" --machine-type {self.gce_instance_type}"
                       f" --image-type UBUNTU"
                       f" --disk-type {self.gce_image_type}"
                       f" --disk-size {self.gce_image_size}"
                       f" --enable-stackdriver-kubernetes"
                       f"{'' if self.gke_k8s_release_channel else ' --no-enable-autoupgrade'}"
                       f"{'' if self.gke_k8s_release_channel else ' --no-enable-autorepair'}"
                       f" --metadata {tags}")
            self.patch_kubectl_config()
            self.deploy_node_pool(GkeNodePool(
                name=self.AUXILIARY_POOL_NAME,
                num_nodes=self.n_nodes,
                disk_size=self.gce_image_size,
                disk_type=self.gce_image_type,
                k8s_cluster=self,
                instance_type=self.gce_instance_type,
                is_deployed=True
            ))

        LOGGER.info("Setup RBAC for GKE cluster `%s'", self.name)
        self.kubectl("create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin "
                     f"--user {self.gce_user}")

    @cached_property
    def gcloud(self) -> GcloudContextManager:
        return cluster.Setup.tester_obj().localhost.gcloud

    def deploy_node_pool(self, pool: GkeNodePool, wait_till_ready=True) -> None:
        self._add_pool(pool)
        if pool.is_deployed:
            return
        LOGGER.info("Create %s pool with %d node(s) in GKE cluster `%s'", pool.name, pool.num_nodes, self.name)
        if wait_till_ready:
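            # Pause the background rate-limited API calls for the whole deployment,
            # then wait for the GKE API to settle before resuming them.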
            with self.api_call_rate_limiter.pause:
                pool.deploy_and_wait_till_ready()
                self.api_call_rate_limiter.wait_till_api_become_stable(self)
        else:
            pool.deploy()

    def wait_all_node_pools_to_be_ready(self):
        with self.api_call_rate_limiter.pause:
            super().wait_all_node_pools_to_be_ready()
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def resize_node_pool(self, name: str, num_nodes: int) -> None:
        with self.api_call_rate_limiter.pause:
            self.pools[name].resize(num_nodes)
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def get_instance_group_name_for_pool(self, pool_name: str, default=None) -> str:
        try:
            group_link = yaml.safe_load(
                self.gcloud.run(
                    f'container node-pools describe {pool_name} '
                    f'--zone {self.gce_zone} --project {self.gce_project} '
                    f'--cluster {self.short_cluster_name}')
            ).get('instanceGroupUrls')[0]
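            # The URL ends with .../instanceGroupManagers/<group-name>; keep only the last path segment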
            return group_link.split('/')[-1]
        except Exception as exc:
            if default is not None:
                return default
            raise RuntimeError(f"Can't get instance group name due to the: {exc}")

    def delete_instance_that_belong_to_instance_group(self, group_name: str, instance_name: str):
        self.gcloud.run(f'compute instance-groups managed delete-instances {group_name} '
                        f'--zone={self.gce_zone} --instances={instance_name}')

    def create_token_update_thread(self):
        return GcloudTokenUpdateThread(self.gcloud, self.kubectl_token_path)

    def create_kubectl_config(self):
        self.gcloud.run(f"container clusters get-credentials {self.short_cluster_name} --zone {self.gce_zone}")

    def destroy(self):
        self.api_call_rate_limiter.stop()
        self.stop_token_update_thread()

    def deploy_scylla_manager(self, pool_name: str = None) -> None:
        self.deploy_minio_s3_backend()
        super().deploy_scylla_manager(pool_name=pool_name)
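A minimal usage sketch for the example above. Every value below is a hypothetical placeholder (version string, network, zone, and the stand-in service objects); in SCT they come from the test configuration and the real GCE service objects:

from types import SimpleNamespace

# Hypothetical stand-ins: SCT passes real GCE service objects here.
services = [SimpleNamespace(project="my-gce-project", key="sct-user@example.com")]

gke = GkeCluster(
    gke_cluster_version="1.19.9-gke.1400",  # placeholder version
    gke_k8s_release_channel="",             # empty -> static channel, so autoupgrade/autorepair get disabled
    gce_image_type="pd-ssd",
    gce_image_size=100,
    gce_network="default",
    services=services,
    gce_datacenter=["us-east1-b"],
    n_nodes=1,
)
gke.deploy()
gke.destroy()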
Example #2
class GkeCluster(KubernetesCluster):
    AUXILIARY_POOL_NAME = 'default-pool'  # The default pool that is deployed together with the cluster
    POOL_LABEL_NAME = 'cloud.google.com/gke-nodepool'
    IS_NODE_TUNING_SUPPORTED = True
    pools: Dict[str, GkeNodePool]

    # pylint: disable=too-many-arguments
    def __init__(
        self,
        gke_cluster_version,
        gke_k8s_release_channel,
        gce_image_type,
        gce_image_size,
        gce_network,
        services,
        gce_instance_type='n1-standard-2',
        user_prefix=None,
        params=None,
        gce_datacenter=None,
        cluster_uuid=None,
        n_nodes=2,
    ):
        super().__init__(params=params,
                         cluster_uuid=cluster_uuid,
                         user_prefix=user_prefix)
        self.gke_cluster_version = gke_cluster_version
        self.gke_k8s_release_channel = gke_k8s_release_channel.strip()
        self.gce_image_type = gce_image_type
        self.gce_image_size = gce_image_size
        self.gce_network = gce_network
        self.gce_services = services
        self.gce_instance_type = gce_instance_type
        self.n_nodes = n_nodes
        self.gce_project = services[0].project
        self.gce_user = services[0].key
        self.gce_zone = gce_datacenter[0]
        self.gke_cluster_created = False
        self.api_call_rate_limiter = ApiCallRateLimiter(
            rate_limit=GKE_API_CALL_RATE_LIMIT,
            queue_size=GKE_API_CALL_QUEUE_SIZE,
            urllib_retry=GKE_URLLIB_RETRY,
            urllib_backoff_factor=GKE_URLLIB_BACKOFF_FACTOR,
        )
        self.api_call_rate_limiter.start()

    @cached_property
    def allowed_labels_on_scylla_node(self) -> list:
        allowed_labels_on_scylla_node = [
            ('name', 'cpu-policy'),
            ('app', 'local-volume-provisioner'),
            ('name', 'raid-local-disks'),
            ('k8s-app', 'fluentbit-gke'),
            ('k8s-app', 'gke-metrics-agent'),
            ('component', 'kube-proxy'),
            ('k8s-app', 'gcp-compute-persistent-disk-csi-driver'),
            ('scylla/cluster', self.k8s_scylla_cluster_name),
        ]
        if self.is_performance_tuning_enabled:
            # NOTE: add the performance-tuning-related pod labels only when tuning
            #       is expected to run; with tuning disabled such pods must not exist.
            allowed_labels_on_scylla_node.extend(self.perf_pods_labels)
        return allowed_labels_on_scylla_node

    def __str__(self):
        return f"{type(self).__name__} {self.name} | Zone: {self.gce_zone} | Version: {self.gke_cluster_version}"

    def deploy(self):
        LOGGER.info("Create GKE cluster `%s' with %d node(s) in %s",
                    self.short_cluster_name, self.n_nodes,
                    self.AUXILIARY_POOL_NAME)
        tags = ",".join(f"{key}={value}" for key, value in self.tags.items())
        with self.gcloud as gcloud:
            # NOTE: only the static K8S release channel supports disabling autoupgrade
            gcloud.run(
                f"container --project {self.gce_project} clusters create {self.short_cluster_name}"
                f" --no-enable-basic-auth"
                f" --zone {self.gce_zone}"
                f" --cluster-version {self.gke_cluster_version}"
                f"{' --release-channel ' + self.gke_k8s_release_channel if self.gke_k8s_release_channel else ''}"
                f" --network {self.gce_network}"
                f" --num-nodes {self.n_nodes}"
                f" --machine-type {self.gce_instance_type}"
                f" --image-type UBUNTU"
                f" --disk-type {self.gce_image_type}"
                f" --disk-size {self.gce_image_size}"
                f" --enable-stackdriver-kubernetes"
                f"{'' if self.gke_k8s_release_channel else ' --no-enable-autoupgrade'}"
                f"{'' if self.gke_k8s_release_channel else ' --no-enable-autorepair'}"
                f" --metadata {tags}")
            self.patch_kubectl_config()
            self.deploy_node_pool(
                GkeNodePool(name=self.AUXILIARY_POOL_NAME,
                            num_nodes=self.n_nodes,
                            disk_size=self.gce_image_size,
                            disk_type=self.gce_image_type,
                            k8s_cluster=self,
                            instance_type=self.gce_instance_type,
                            is_deployed=True))

        LOGGER.info("Setup RBAC for GKE cluster `%s'", self.name)
        self.kubectl(
            "create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin "
            f"--user {self.gce_user}")

    @cached_property
    def gcloud(self) -> GcloudContextManager:  # pylint: disable=no-self-use
        return self.test_config.tester_obj().localhost.gcloud

    def deploy_node_pool(self,
                         pool: GkeNodePool,
                         wait_till_ready=True) -> None:
        self._add_pool(pool)
        if pool.is_deployed:
            return
        LOGGER.info("Create %s pool with %d node(s) in GKE cluster `%s'",
                    pool.name, pool.num_nodes, self.name)
        if wait_till_ready:
            with self.api_call_rate_limiter.pause:
                pool.deploy_and_wait_till_ready()
                self.api_call_rate_limiter.wait_till_api_become_stable(self)
        else:
            pool.deploy()

    def wait_all_node_pools_to_be_ready(self):
        with self.api_call_rate_limiter.pause:
            super().wait_all_node_pools_to_be_ready()
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def resize_node_pool(self, name: str, num_nodes: int) -> None:
        with self.api_call_rate_limiter.pause:
            self.pools[name].resize(num_nodes)
            self.api_call_rate_limiter.wait_till_api_become_stable(self)

    def get_instance_group_name_for_pool(self,
                                         pool_name: str,
                                         default=None) -> str:
        try:
            group_link = yaml.safe_load(
                self.gcloud.run(
                    f'container node-pools describe {pool_name} '
                    f'--zone {self.gce_zone} --project {self.gce_project} '
                    f'--cluster {self.short_cluster_name}')).get(
                        'instanceGroupUrls')[0]
            return group_link.split('/')[-1]
        except Exception as exc:
            if default is not None:
                return default
            raise RuntimeError(
                f"Can't get instance group name: {exc}") from exc

    def delete_instance_that_belong_to_instance_group(self, group_name: str,
                                                      instance_name: str):
        self.gcloud.run(
            f'compute instance-groups managed delete-instances {group_name} '
            f'--zone={self.gce_zone} --instances={instance_name}')

    def create_token_update_thread(self):
        return GcloudTokenUpdateThread(self.gcloud, self.kubectl_token_path)

    def create_kubectl_config(self):
        self.gcloud.run(
            f"container clusters get-credentials {self.short_cluster_name} --zone {self.gce_zone}"
        )

    def destroy(self):
        self.api_call_rate_limiter.stop()
        self.stop_token_update_thread()

    def deploy_scylla_manager(self, pool_name: str = None) -> None:
        self.deploy_minio_s3_backend()
        super().deploy_scylla_manager(pool_name=pool_name)

    # NOTE: blocked by https://github.com/scylladb/scylla-operator/issues/760
    def upgrade_kubernetes_platform(self) -> str:
        # NOTE: 'self.gke_cluster_version' can be like 1.21.3-gke.N or 1.21
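        #       e.g. '1.21.3-gke.1400'.split('.')[1] == '21', so upgrade_version becomes '1.22'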
        upgrade_version = f"1.{int(self.gke_cluster_version.split('.')[1]) + 1}"

        with self.gcloud as gcloud:
            # Upgrade control plane (API, scheduler, manager and so on ...)
            LOGGER.info("Upgrading K8S control plane to the '%s' version",
                        upgrade_version)
            gcloud.run(
                f"container clusters upgrade {self.short_cluster_name} "
                f"--master --quiet --project {self.gce_project} --zone {self.gce_zone} "
                f"--cluster-version {upgrade_version}")

            # Upgrade scylla-related node pools
            for node_pool in (self.AUXILIARY_POOL_NAME, self.SCYLLA_POOL_NAME):
                LOGGER.info("Upgrading '%s' node pool to the '%s' version",
                            node_pool, upgrade_version)
                # NOTE: one node upgrade takes about 10 minutes
                gcloud.run(
                    f"container clusters upgrade {self.short_cluster_name} "
                    f"--quiet --project {self.gce_project} --zone {self.gce_zone} "
                    f"--node-pool={node_pool}")
        return upgrade_version
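All three examples share the same ApiCallRateLimiter idiom: a disruptive operation runs under `limiter.pause` and then blocks on `wait_till_api_become_stable()`. Below is a toy sketch of just that pause/resume contract, with assumed semantics — the names MiniRateLimiter and _unpaused are invented for illustration, and this is not the SCT implementation:

import threading
from contextlib import contextmanager


class MiniRateLimiter:
    """Toy stand-in for ApiCallRateLimiter: background API calls are assumed
    to run only while `_unpaused` is set (not the real SCT implementation)."""

    def __init__(self):
        self._unpaused = threading.Event()
        self._unpaused.set()

    @property
    @contextmanager
    def pause(self):
        # Block the rate-limited background calls for the duration of the `with` body.
        self._unpaused.clear()
        try:
            yield
        finally:
            self._unpaused.set()

    def wait_till_api_become_stable(self, k8s_cluster):
        # Placeholder: the real limiter presumably polls the cluster API until it
        # responds consistently; here we only model the call shape.
        pass


limiter = MiniRateLimiter()
with limiter.pause:  # same shape as deploy_node_pool()/resize_node_pool() above
    # ...perform the disruptive operation here...
    limiter.wait_till_api_become_stable(None)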
Example #3
class GkeCluster(KubernetesCluster, cluster.BaseCluster):

    def __init__(self,
                 gke_cluster_version,
                 gce_image_type,
                 gce_image_size,
                 gce_network,
                 services,
                 credentials,
                 gce_n_local_ssd=0,
                 gce_instance_type="n1-highmem-8",
                 n_nodes=3,
                 user_prefix=None,
                 params=None,
                 gce_datacenter=None):
        cluster_prefix = cluster.prepend_user_prefix(user_prefix, "k8s-gke")
        node_prefix = cluster.prepend_user_prefix(user_prefix, "node")
        self._gcloud_token_thread = None
        self.gke_cluster_version = gke_cluster_version
        self.gce_image_type = gce_image_type
        self.gce_image_size = gce_image_size
        self.gce_network = gce_network
        self.gce_services = services
        self.credentials = credentials
        self.gce_instance_type = gce_instance_type
        self.gce_n_local_ssd = int(gce_n_local_ssd) if gce_n_local_ssd else 0

        self.gce_project = services[0].project
        self.gce_user = services[0].key
        self.gce_zone = gce_datacenter[0]
        self.gke_cluster_created = False

        self.api_call_rate_limiter = ApiCallRateLimiter(
            rate_limit=GKE_API_CALL_RATE_LIMIT,
            queue_size=GKE_API_CALL_QUEUE_SIZE,
            urllib_retry=GKE_URLLIB_RETRY
        )
        self.api_call_rate_limiter.start()

        super().__init__(cluster_prefix=cluster_prefix,
                         node_prefix=node_prefix,
                         n_nodes=n_nodes,
                         params=params,
                         region_names=gce_datacenter,
                         node_type="scylla-db")

    @cached_property
    def gke_cluster_name(self):
        return shorten_cluster_name(self.name, 40)

    def __str__(self):
        return f"{type(self).__name__} {self.name} | Zone: {self.gce_zone} | Version: {self.gke_cluster_version}"

    def add_nodes(self, count, ec2_user_data='', dc_idx=0, rack=0, enable_auto_bootstrap=False):
        if not self.gke_cluster_created:
            self.setup_gke_cluster(num_nodes=count)
            self.gke_cluster_created = True
        else:
            raise NotImplementedError

    @property
    def gcloud(self) -> GcloudContextManager:
        return cluster.Setup.tester_obj().localhost.gcloud

    def setup_gke_cluster(self, num_nodes: int) -> None:
        LOGGER.info("Create GKE cluster `%s' with %d node(s) in default-pool and 1 node in operator-pool",
                    self.gke_cluster_name, num_nodes)
        tags = ",".join(f"{key}={value}" for key, value in self.tags.items())
        with self.gcloud as gcloud:
            gcloud.run(f"container --project {self.gce_project} clusters create {self.gke_cluster_name}"
                       f" --zone {self.gce_zone}"
                       f" --cluster-version {self.gke_cluster_version}"
                       f" --username admin"
                       f" --network {self.gce_network}"
                       f" --num-nodes {num_nodes}"
                       f" --machine-type {self.gce_instance_type}"
                       f" --image-type UBUNTU"
                       f" --disk-type {self.gce_image_type}"
                       f" --disk-size {self.gce_image_size}"
                       f" --local-ssd-count {self.gce_n_local_ssd}"
                       f" --node-taints role=scylla-clusters:NoSchedule"
                       f" --enable-stackdriver-kubernetes"
                       f" --no-enable-autoupgrade"
                       f" --no-enable-autorepair"
                       f" --metadata {tags}")
            gcloud.run(f"container --project {self.gce_project} node-pools create operator-pool"
                       f" --zone {self.gce_zone}"
                       f" --cluster {self.gke_cluster_name}"
                       f" --num-nodes 1"
                       f" --machine-type n1-standard-4"
                       f" --image-type UBUNTU"
                       f" --disk-type pd-ssd"
                       f" --disk-size 20"
                       f" --no-enable-autoupgrade"
                       f" --no-enable-autorepair")

            LOGGER.info("Get credentials for GKE cluster `%s'", self.name)
            gcloud.run(f"container clusters get-credentials {self.gke_cluster_name} --zone {self.gce_zone}")
        self.start_gcloud_token_update_thread()
        self.patch_kube_config()

        LOGGER.info("Setup RBAC for GKE cluster `%s'", self.name)
        self.kubectl(f"create clusterrolebinding cluster-admin-binding"
                     f" --clusterrole cluster-admin"
                     f" --user {self.gce_user}")

        LOGGER.info("Install RAID DaemonSet to GKE cluster `%s'", self.name)
        self.apply_file(RAID_DAEMONSET, envsubst=False)

        LOGGER.info("Install CPU policy DaemonSet to GKE cluster `%s'", self.name)
        self.apply_file(CPU_POLICY_DAEMONSET, envsubst=False)

        LOGGER.info("Install local volume provisioner to GKE cluster `%s'", self.name)
        self.helm(f"install local-provisioner provisioner")

    def add_gke_pool(self, name: str, num_nodes: int, instance_type: str) -> None:
        LOGGER.info("Create sct-loaders pool with %d node(s) in GKE cluster `%s'", num_nodes, self.name)
        with self.api_call_rate_limiter.pause:
            self.gcloud.run(f"container --project {self.gce_project} node-pools create {name}"
                            f" --zone {self.gce_zone}"
                            f" --cluster {self.gke_cluster_name}"
                            f" --num-nodes {num_nodes}"
                            f" --machine-type {instance_type}"
                            f" --image-type UBUNTU"
                            f" --node-taints role=sct-loaders:NoSchedule"
                            f" --no-enable-autoupgrade"
                            f" --no-enable-autorepair")
            self.kubectl('wait --timeout=15m --all --for=condition=Ready node')

    def get_kubectl_config_for_user(self, config, username):
        for user in config["users"]:
            if user["name"] == username:
                return user["user"]["auth-provider"]["config"]
        return None

    @cached_property
    def gcloud_token_path(self):
        return os.path.join(self.logdir, 'gcloud.output')

    def start_gcloud_token_update_thread(self):
        self._gcloud_token_thread = GcloudTokenUpdateThread(self.gcloud, self.gcloud_token_path)
        self._gcloud_token_thread.start()
        # Wait till GcloudTokenUpdateThread gets the tokens and dumps them to gcloud_token_path
        wait_for(os.path.exists, timeout=30, step=5, text="Wait for gcloud token", throw_exc=True,
                 path=self.gcloud_token_path)

    def patch_kube_config(self) -> None:
        # Assumes the kubeconfig has already been created by gcloud.
        # Patch it so that, instead of running gcloud on every request,
        # kubectl reads the cached gcloud output from the file at gcloud_token_path.
        # The GcloudTokenUpdateThread keeps that cache file up to date.
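        # After patching, the user entry looks roughly like this (illustrative):
        #   - name: gke_<project>_<zone>_<cluster>
        #     user:
        #       auth-provider:
        #         config:
        #           cmd-path: cat
        #           cmd-args: <logdir>/gcloud.output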
        kube_config_path = os.path.expanduser(os.environ.get('KUBECONFIG', '~/.kube/config'))
        user_name = f"gke_{self.gce_project}_{self.gce_zone}_{self.gke_cluster_name}"
        LOGGER.debug("Patch %s to use dockerized gcloud for auth against GKE cluster `%s'", kube_config_path, self.name)

        with open(kube_config_path) as kube_config:
            data = yaml.safe_load(kube_config)
        user_config = self.get_kubectl_config_for_user(data, user_name)

        if user_config is None:
            raise RuntimeError(f"Unable to find configuration for `{user_name}' in ~/.kube/config")

        user_config["cmd-args"] = self.gcloud_token_path
        user_config["cmd-path"] = "cat"

        with open(kube_config_path, "w") as kube_config:
            yaml.safe_dump(data, kube_config)

        self.log.debug(f'Patched kubectl config at {kube_config_path} '
                       f'with static gcloud config from {self.gcloud_token_path}')

    @cluster.wait_for_init_wrap
    def wait_for_init(self):
        LOGGER.info("--- List of nodes in GKE cluster `%s': ---\n%s\n", self.name, self.kubectl("get nodes").stdout)
        LOGGER.info("--- List of pods in GKE cluster `%s': ---\n%s\n", self.name, self.kubectl("get pods -A").stdout)

        LOGGER.info("Wait for readiness of all pods in default namespace...")
        self.kubectl("wait --timeout=15m --all --for=condition=Ready pod", timeout=15*60+10)

    def destroy(self):
        self.api_call_rate_limiter.stop()
        if self._gcloud_token_thread:
            self._gcloud_token_thread.stop()
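A corresponding usage sketch for this older variant, again with hypothetical placeholders (SCT supplies the real services, credentials, and params). Whether the base-class constructor already triggers add_nodes() via n_nodes depends on cluster.BaseCluster, so the explicit call below is illustrative only:

from types import SimpleNamespace

# Hypothetical stand-ins for the SCT-provided GCE service objects.
services = [SimpleNamespace(project="my-gce-project", key="sct-user@example.com")]

gke = GkeCluster(
    gke_cluster_version="1.17.13-gke.2600",  # placeholder version
    gce_image_type="pd-ssd",
    gce_image_size=100,
    gce_network="default",
    services=services,
    credentials=None,        # placeholder; SCT passes real credential objects
    gce_n_local_ssd=2,
    gce_datacenter=["us-east1-b"],
)
gke.add_nodes(count=3)       # first call creates the cluster; later calls raise NotImplementedError
gke.wait_for_init()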