def start(self):
    """
    Attach to an existing AKS cluster and write its admin kubeconfig locally.

    The cluster name defaults to ``self.cluster_name`` and the resource group
    defaults to the one reported by the instance metadata when they are not
    set in ``self.config``.

    :return: a two-element list: the overrides built by ``make_overrides`` and
             a dict holding the kubeconfig path and the cluster description.
    """
    credentials, subscription_id = self._get_credentials()

    # Cluster name: fall back on the name of the DSS cluster
    aks_cluster_name = self.config.get("cluster", None)
    if _is_none_or_blank(aks_cluster_name):
        aks_cluster_name = self.cluster_name
        logging.info("Using same cluster name as DSS: {}".format(aks_cluster_name))

    # Resource group: fall back on the one found in the instance metadata
    rg_name = self.config.get('resourceGroup', None)
    if _is_none_or_blank(rg_name):
        instance_metadata = get_instance_metadata()
        rg_name = instance_metadata["compute"]["resourceGroupName"]
        logging.info("Using same resource group as DSS: {}".format(rg_name))

    aks_client = ContainerServiceClient(credentials, subscription_id)

    # Fetch the admin kubeconfig and store it in the current working directory
    logging.info("Fetching kubeconfig for cluster %s in %s", aks_cluster_name, rg_name)
    admin_credentials = run_and_process_cloud_error(
        lambda: aks_client.managed_clusters.list_cluster_admin_credentials(rg_name, aks_cluster_name))
    kube_config_text = admin_credentials.kubeconfigs[0].value.decode('utf8')
    kube_config_path = os.path.join(os.getcwd(), 'kube_config')
    with open(kube_config_path, 'w') as kube_config_file:
        kube_config_file.write(kube_config_text)

    overrides = make_overrides(self.config, yaml.safe_load(kube_config_text), kube_config_path)

    # Fetch the description of the cluster itself
    cluster_description = run_and_process_cloud_error(
        lambda: aks_client.managed_clusters.get(rg_name, aks_cluster_name))

    attached_cluster_data = {
        'kube_config_path': kube_config_path,
        'cluster': cluster_description.as_dict()
    }
    return [overrides, attached_cluster_data]
def run(self, progress_callback):
    """
    Run a kubectl command against the cluster and render its output as HTML.

    The command is ``kubectl`` followed by the configured ``args``, plus an
    optional ``--namespace`` and ``-o`` output format. The kubeconfig path is
    read from the DSS cluster settings and passed through ``KUBECONFIG``.

    :param progress_callback: not used by this method.
    :return: an HTML fragment with the command output, preceded by the return
             code and error output when the command failed with a non-blank
             error stream.
    """
    cluster_data, dss_cluster_settings, dss_cluster_config = get_cluster_from_dss_cluster(
        self.config['clusterId'])

    raw_settings = dss_cluster_settings.get_raw()
    kube_config_path = raw_settings['containerSettings']['executionConfigsGenericOverrides']['kubeConfigPath']

    kubectl_env = os.environ.copy()
    kubectl_env['KUBECONFIG'] = kube_config_path

    cmd = ['kubectl'] + self.config.get('args', [])

    namespace = self.config.get("namespace", "")
    if not _is_none_or_blank(namespace):
        cmd = cmd + ["--namespace", namespace]

    output_format = self.config.get("format", "")
    if not _is_none_or_blank(output_format) and output_format != 'none':
        cmd = cmd + ["-o", output_format]

    logging.info("Run : %s" % json.dumps(cmd))
    try:
        out, err = run_with_timeout(cmd, env=kubectl_env, timeout=20)
        rv = 0
    except KubeCommandException as failure:
        # a failed command still carries its outputs and return code
        rv, out, err = failure.rv, failure.out, failure.err

    out_html = '<div class="alert alert-info"><div>Output</div><pre class="debug" style="max-width: 100%%; max-height: 100%%;">%s</pre></div>' % out
    err_html = '<div class="alert alert-danger"><div>Error</div><pre class="debug" style="max-width: 100%%; max-height: 100%%;">%s</pre></div>' % err

    failed_with_error = rv != 0 and not _is_none_or_blank(err)
    if not failed_with_error:
        return out_html
    return ('<div class="alert alert-danger">Failed with code %s</div>' % rv) + err_html + out_html
def __init__(self, project_id, zone, credentials=None):
    """
    Set up the 'container' and 'compute' API clients for a project and zone.

    :param project_id: project to use; when blank, the project from the
                       instance info is used.
    :param zone: zone to use; when blank, the zone from the instance info is
                 used.
    :param credentials: passed as-is to ``discovery.build``.
    """
    logging.info("Connect using project_id=%s zone=%s credentials=%s" % (project_id, zone, credentials))
    # NOTE(review): the instance info is fetched even when both values are given
    instance_info = get_instance_info()

    if not _is_none_or_blank(project_id):
        self.project_id = project_id
    else:
        fallback_project = instance_info["project"]
        logging.info("No project specified, using {} as default".format(fallback_project))
        self.project_id = fallback_project

    if not _is_none_or_blank(zone):
        self.zone = zone
    else:
        fallback_zone = instance_info["zone"]
        logging.info("No zone specified, using {} as default".format(fallback_zone))
        self.zone = fallback_zone

    self.service = discovery.build('container', 'v1', credentials=credentials, cache_discovery=False)
    self.compute = discovery.build('compute', 'v1', credentials=credentials, cache_discovery=False)
def get_credentials_from_connection_info(connection_info, connection_info_secret):
    """
    Build service principal credentials from the legacy connection info.

    :param connection_info: dict read for 'clientId', 'tenantId' and 'password'.
    :param connection_info_secret: not used by this function.
    :return: a ``ServicePrincipalCredentials`` object.
    :raises Exception: if the client id, password or tenant id is blank.
    """
    client_id = connection_info.get('clientId', None)
    tenant_id = connection_info.get('tenantId', None)
    password = connection_info.get('password', None)

    # all three fields are mandatory
    for mandatory_value in (client_id, password, tenant_id):
        if _is_none_or_blank(mandatory_value):
            raise Exception('Client, password and tenant must all be defined')

    return ServicePrincipalCredentials(client_id=client_id, secret=password, tenant=tenant_id)
def _get_credentials(self):
    """
    Resolve the Azure credentials and subscription id to use.

    When the legacy 'connectionInfo' field is non-blank in either the config
    or the plugin config, the legacy service principal path is used and the
    subscription id is read from the config's connection info. Otherwise the
    'connectionInfoV2' settings are used, defaulting to the 'default'
    identity type.

    :return: a ``(credentials, subscription_id)`` tuple.
    """
    connection_info = self.config.get("connectionInfo", None)
    connection_info_secret = self.plugin_config.get("connectionInfo", None)

    if not _is_none_or_blank(connection_info) or not _is_none_or_blank(connection_info_secret):
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("Using legacy authentication fields. Clear them to use the new ones.")
        credentials = get_credentials_from_connection_info(connection_info, connection_info_secret)
        subscription_id = connection_info.get('subscriptionId', None)
    else:
        connection_info_v2 = self.config.get("connectionInfoV2", {"identityType": "default"})
        # the second element returned by the helper is not needed here
        credentials, _ = get_credentials_from_connection_infoV2(connection_info_v2)
        subscription_id = get_subscription_id(connection_info_v2)

    return credentials, subscription_id
def create_admin_binding(user_name=None, kube_config_path=None):
    """
    Ensure the 'cluster-admin-binding' clusterrolebinding exists.

    :param user_name: user to bind to the cluster-admin role; when blank, the
                      value returned by ``get_account()`` is used.
    :param kube_config_path: when not blank, exported as ``KUBECONFIG`` for
                             the kubectl calls.
    """
    if _is_none_or_blank(user_name):
        user_name = get_account()

    kubectl_env = os.environ.copy()
    if not _is_none_or_blank(kube_config_path):
        kubectl_env['KUBECONFIG'] = kube_config_path

    # kubectl prints nothing when the binding is missing (--ignore-not-found)
    lookup_cmd = ["kubectl", "get", "clusterrolebinding", "cluster-admin-binding", "--ignore-not-found"]
    existing_binding = subprocess.check_output(lookup_cmd, env=kubectl_env)

    if _is_none_or_blank(existing_binding):
        create_cmd = ["kubectl", "create", "clusterrolebinding", "cluster-admin-binding",
                      "--clusterrole", "cluster-admin", "--user", user_name]
        subprocess.check_call(create_cmd, env=kubectl_env)
        return

    logging.info("Clusterrolebinding already exist")
def start(self):
    """
    Attach to an existing AKS cluster using the legacy connection info.

    Requires a subscription id in the connection info and a resource group in
    the config. The admin kubeconfig of the cluster is written to
    ``kube_config`` in the current working directory.

    :return: a two-element list: the overrides built by ``make_overrides`` and
             a dict holding the kubeconfig path and the cluster description.
    :raises Exception: if the subscription or the resource group is blank.
    """
    legacy_info = self.config.get("connectionInfo", {})
    legacy_info_secret = self.plugin_config.get("connectionInfo", {})

    subscription_id = legacy_info.get('subscriptionId', None)
    if _is_none_or_blank(subscription_id):
        raise Exception('Subscription must be defined')

    credentials = get_credentials_from_connection_info(legacy_info, legacy_info_secret)
    aks_client = ContainerServiceClient(credentials, subscription_id)

    rg_name = self.config.get('resourceGroup', None)
    if _is_none_or_blank(rg_name):
        raise Exception("A resource group to put the cluster in is required")

    # the DSS cluster name is only used when the 'cluster' key is absent
    aks_cluster_name = self.config.get('cluster', self.cluster_name)

    logging.info("Fetching kubeconfig for cluster %s in %s" % (aks_cluster_name, rg_name))
    admin_credentials = run_and_process_cloud_error(
        lambda: aks_client.managed_clusters.list_cluster_admin_credentials(rg_name, aks_cluster_name))
    kube_config_text = admin_credentials.kubeconfigs[0].value.decode('utf8')

    kube_config_path = os.path.join(os.getcwd(), 'kube_config')
    with open(kube_config_path, 'w') as kube_config_file:
        kube_config_file.write(kube_config_text)

    overrides = make_overrides(self.config, yaml.safe_load(kube_config_text), kube_config_path)

    cluster_description = run_and_process_cloud_error(
        lambda: aks_client.managed_clusters.get(rg_name, aks_cluster_name))

    attached_cluster_data = {
        'kube_config_path': kube_config_path,
        'cluster': cluster_description.as_dict()
    }
    return [overrides, attached_cluster_data]
def build(self):
    """
    Assemble the node pool definition and hand it over to the parent builder.

    Defaults: name 'node-pool', 3 initial nodes; autoscaling bounds default to
    the initial node count. A non-blank ``settings_valve`` is parsed as JSON
    and merged into the definition with ``_merge_objects``.

    :return: the result of ``create()`` when the parent is a ``NodePool``;
             nothing when the parent is a ``ClusterBuilder``.
    :raises Exception: if the parent is neither of those types.
    """
    node_config = {}
    node_pool = {'config': node_config}

    if self.name is None:
        node_pool['name'] = 'node-pool'
    else:
        node_pool['name'] = self.name

    if self.node_count is None:
        initial_count = 3
    else:
        initial_count = self.node_count
    node_pool['initialNodeCount'] = initial_count

    if self.machine_type is not None:
        node_config['machineType'] = self.machine_type
    if self.disk_type is not None:
        node_config['diskType'] = self.disk_type

    # Optional GPU accelerator
    if self.enable_gpu:
        logging.info("GPU option enabled.")
        accelerator = {
            'acceleratorCount': self.gpu_count,
            'acceleratorType': self.gpu_type
        }
        node_config['accelerators'] = [accelerator]

    if self.disk_size_gb is not None and self.disk_size_gb > 0:
        node_config['diskSizeGb'] = self.disk_size_gb

    node_config['oauthScopes'] = self.oauth_scopes
    if not _is_none_or_blank(self.service_account):
        node_config['serviceAccount'] = self.service_account

    node_pool["management"] = {"autoUpgrade": True, "autoRepair": True}

    if self.enable_autoscaling:
        min_nodes = initial_count if self.min_node_count is None else self.min_node_count
        max_nodes = initial_count if self.max_node_count is None else self.max_node_count
        node_pool['autoscaling'] = {
            "enabled": True,
            "minNodeCount": min_nodes,
            "maxNodeCount": max_nodes
        }

    node_config["labels"] = self.nodepool_labels

    # user-provided JSON merged on top of the generated definition
    if not _is_none_or_blank(self.settings_valve):
        node_pool = _merge_objects(node_pool, json.loads(self.settings_valve))

    parent = self.cluster_builder
    if isinstance(parent, ClusterBuilder):
        parent.with_node_pool(node_pool)
    elif isinstance(parent, NodePool):
        return parent.create(node_pool)
    else:
        raise Exception("Unreachable")
def get_subscription_id(connection_info):
    """
    Return the subscription id to use for a v2 connection info.

    The configured 'subscriptionId' is returned when it is not blank and the
    identity type is 'default' or 'service-principal'; in every other case the
    subscription id from the instance metadata is returned.
    """
    identity_type = connection_info.get('identityType', None)
    configured_subscription = connection_info.get('subscriptionId', None)

    if identity_type in ('default', 'service-principal'):
        if not _is_none_or_blank(configured_subscription):
            return configured_subscription

    return get_instance_metadata()["compute"]["subscriptionId"]
def get_kube_config(self, cluster_id=None):
    """
    Build a kubeconfig dict for this cluster from the result of ``get_info()``.

    :param cluster_id: suffix used to name the cluster, user and context
                       entries; defaults to ``self.name`` when blank.
    :return: a dict with a single cluster, user and context. The user is set
             up with the client certificate when 'legacyAbac' is enabled in
             the response, and with the gcloud auth provider otherwise.
    """
    response = self.get_info()
    if _is_none_or_blank(cluster_id):
        cluster_id = self.name

    # NOTE(review): this logs the whole response, 'masterAuth' included
    logging.info("Response=%s" % json.dumps(response, indent=2))

    legacy_auth = response.get("legacyAbac", {}).get("enabled", False)
    master_auth = response["masterAuth"]
    endpoint = response["endpoint"]

    user_name = "user-%s" % cluster_id
    cluster_name = "cluster-%s" % cluster_id
    context_name = "context-%s" % cluster_id

    if legacy_auth:
        user_details = {
            "client-certificate-data": master_auth["clientCertificate"],
            "client-key-data": master_auth["clientKey"]
        }
    else:
        gcloud_path = os.path.join(get_sdk_root(), "bin", "gcloud")
        user_details = {
            "auth-provider": {
                "name": "gcp",
                "config": {
                    "cmd-args": "config config-helper --format=json",
                    "cmd-path": gcloud_path,
                    "expiry-key": "{.credential.token_expiry}",
                    "token-key": "{.credential.access_token}"
                }
            }
        }
    user_entry = {"name": user_name, "user": user_details}

    cluster_entry = {
        "name": cluster_name,
        "cluster": {
            "certificate-authority-data": master_auth["clusterCaCertificate"],
            "server": "https://%s" % endpoint
        }
    }

    context_entry = {
        "name": context_name,
        "context": {
            "cluster": cluster_name,
            "user": user_name
        }
    }

    return {
        "apiVersion": "v1",
        "kind": "Config",
        "preferences": {},
        "clusters": [cluster_entry],
        "contexts": [context_entry],
        "users": [user_entry],
        "current-context": context_name
    }
def with_oauth_scopes(self, oauth_scopes):
    """
    Register OAuth scopes given as a comma-separated string or an iterable.

    Blank entries are skipped and the others are stripped before being passed
    to ``add_oauth_scope``. ``None`` registers nothing.

    :return: ``self``, to allow chaining.
    """
    if oauth_scopes is None:
        return self

    # a text value is a comma-separated list of scopes
    scopes = oauth_scopes.split(',') if isinstance(oauth_scopes, text_type) else oauth_scopes

    for scope in scopes:
        if not _is_none_or_blank(scope):
            self.add_oauth_scope(scope.strip())
    return self
def stop(self, data):
    """
    Delete the AKS cluster and wait until it can no longer be fetched.

    Uses the legacy connection info for authentication and the resource group
    from the config.

    :param data: not used by this method.
    :raises Exception: if the subscription or the resource group is blank.
    """
    connection_info = self.config.get("connectionInfo", {})
    connection_info_secret = self.plugin_config.get("connectionInfo", {})

    subscription_id = connection_info.get('subscriptionId', None)
    if _is_none_or_blank(subscription_id):
        raise Exception('Subscription must be defined')

    credentials = get_credentials_from_connection_info(connection_info, connection_info_secret)
    clusters_client = ContainerServiceClient(credentials, subscription_id)

    resource_group_name = self.config.get('resourceGroup', None)
    if _is_none_or_blank(resource_group_name):
        raise Exception("A resource group to put the cluster in is required")

    # the previous message said "Fetching kubeconfig", which is not what happens here
    logging.info("Deleting cluster %s in %s" % (self.cluster_name, resource_group_name))

    def do_delete():
        return clusters_client.managed_clusters.delete(resource_group_name, self.cluster_name)

    run_and_process_cloud_error(do_delete)

    # delete returns void, so we poll until the cluster is really gone
    gone = False
    while not gone:
        time.sleep(5)
        try:
            cluster = clusters_client.managed_clusters.get(resource_group_name, self.cluster_name)
            if cluster.provisioning_state.lower() != 'deleting':
                # NOTE(review): this only logs; polling goes on until get() raises,
                # so a cluster stuck in another state keeps this loop running
                logging.info("Cluster is not deleting anymore, must be deleted now (state = %s)" % cluster.provisioning_state)
        except Exception as e:
            # NOTE(review): any failure of get() is treated as "cluster is gone"
            logging.info("Could not get cluster, should be gone (%s)" % str(e))
            gone = True
def get_cluster_from_connection_info(config, plugin_config):
    """
    Return a ContainerServiceClient after authenticating using the connection info.

    When the legacy 'connectionInfo' field is non-blank in either ``config``
    or ``plugin_config``, the legacy service principal path is used and the
    subscription id is read from the config's connection info. Otherwise the
    'connectionInfoV2' settings are used, defaulting to the 'default'
    identity type.

    :param config: dict read for 'connectionInfo' and 'connectionInfoV2'.
    :param plugin_config: dict read for 'connectionInfo'.
    :return: a ``ContainerServiceClient``.
    """
    connection_info = config.get("connectionInfo", None)
    connection_info_secret = plugin_config.get("connectionInfo", None)

    if not _is_none_or_blank(connection_info) or not _is_none_or_blank(connection_info_secret):
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("Using legacy authentication fields. Clear them to use the new ones.")
        credentials = get_credentials_from_connection_info(connection_info, connection_info_secret)
        subscription_id = connection_info.get('subscriptionId', None)
    else:
        connection_info_v2 = config.get("connectionInfoV2", {"identityType": "default"})
        # the second element returned by the helper is not needed here
        credentials, _ = get_credentials_from_connection_infoV2(connection_info_v2)
        subscription_id = get_subscription_id(connection_info_v2)

    clusters_client = ContainerServiceClient(credentials, subscription_id)
    return clusters_client
def make_html(command_outputs):
    """
    Render the results of a sequence of commands as an HTML fragment.

    :param command_outputs: iterable of indexable items laid out as
                            ``(command, return code, output, error output)``.
    :return: the HTML blocks joined with newlines, as text. For each command:
             the command line (JSON-encoded), the return code and the output,
             plus the error output when the return code is not 0 and the
             error output is not blank.
    """
    divs = []
    for command_output in command_outputs:
        cmd_html = '<div>Run: %s</div>' % json.dumps(command_output[0])
        rv_html = '<div>Returned %s</div>' % command_output[1]
        out_html = '<div class="alert alert-info"><div>Output</div><pre class="debug" style="max-width: 100%%; max-height: 100%%;">%s</pre></div>' % command_output[2]
        err_html = '<div class="alert alert-danger"><div>Error</div><pre class="debug" style="max-width: 100%%; max-height: 100%%;">%s</pre></div>' % command_output[3]
        divs.append(cmd_html)
        divs.append(rv_html)
        divs.append(out_html)
        if command_output[1] != 0 and not _is_none_or_blank(command_output[3]):
            divs.append(err_html)

    html = '\n'.join(divs)
    # Python 3 text has no decode(): only decode when the joined value is bytes
    # (which is the case for a Python 2 str)
    if isinstance(html, bytes):
        html = html.decode('utf8')
    return html
def run(self, progress_callback):
    """
    Resize a node pool of the AKS cluster to the configured number of nodes.

    The node pool is selected by its name ('nodePoolId' in the config); when
    no id is given and the cluster has exactly one pool, that pool is used.

    :param progress_callback: not used by this method.
    :return: an HTML ``<pre>`` block with the updated cluster as JSON.
    :raises Exception: if the cluster has no data or definition, if the node
                       pool cannot be found, or if 0 nodes are requested.
    """
    cluster_data, clusters, dss_cluster_settings, dss_cluster_config = get_cluster_from_dss_cluster(
        self.config['clusterId'])

    # retrieve the actual name in the cluster's data
    if cluster_data is None:
        raise Exception("No cluster data (not started?)")
    cluster_def = cluster_data.get("cluster", None)
    if cluster_def is None:
        raise Exception("No cluster definition (starting failed?)")

    cluster_name = cluster_def["name"]
    resource_group_name = dss_cluster_config['config']['resourceGroup']

    # get the object for the cluster, AKS side
    cluster = clusters.managed_clusters.get(resource_group_name, cluster_name)

    requested_pool_id = self.config.get('nodePoolId', None)
    selected_pool = None
    for candidate in cluster.agent_pool_profiles:
        if candidate.name == requested_pool_id:
            selected_pool = candidate
        elif _is_none_or_blank(requested_pool_id) and len(cluster.agent_pool_profiles) == 1:
            # no id requested: the only pool of the cluster is the target
            selected_pool = candidate
    if selected_pool is None:
        raise Exception("Unable to find node pool '%s'" % (requested_pool_id))

    # see aks_scale() in azure-cli code
    cluster.service_principal_profile = None
    cluster.aad_profile = None

    desired_count = self.config['numNodes']
    logging.info("Resize to %s" % desired_count)
    if desired_count == 0:
        raise Exception("Can't delete node pool '%s'" % (requested_pool_id))
    selected_pool.count = desired_count

    logging.info("Waiting for cluster resize")

    def do_update():
        update_operation = clusters.managed_clusters.create_or_update(
            resource_group_name, cluster_name, cluster)
        return update_operation.result()

    updated_cluster = run_and_process_cloud_error(do_update)
    logging.info("Cluster updated")

    updated_json = json.dumps(updated_cluster.as_dict(), indent=2)
    return '<pre class="debug">%s</pre>' % updated_json
def create_installer_daemonset(kube_config_path=None):
    """
    Apply the NVIDIA driver installer daemonset manifest with kubectl.

    :param kube_config_path: when not blank, exported as ``KUBECONFIG`` for
                             the kubectl call.
    """
    kubectl_env = os.environ.copy()

    if not _is_none_or_blank(kube_config_path):
        logging.info("Setting kube_config path from KUBECONFIG env variable...")
        kubectl_env["KUBECONFIG"] = kube_config_path
        logging.info("Found KUBECONFIG={}".format(kubectl_env["KUBECONFIG"]))

    logging.info("Creating NVIDIA driver daemonset (only GPU-tainted nodes will be affected)")
    apply_cmd = ["kubectl", "apply", "-f", DAEMONSET_MANIFEST_URL]
    subprocess.check_call(apply_cmd, env=kubectl_env)
def get_cluster_from_connection_info(config, plugin_config):
    """
    Build a ContainerServiceClient from the legacy connection info.

    :param config: dict whose 'connectionInfo' holds the subscription id and
                   the service principal fields.
    :param plugin_config: dict whose 'connectionInfo' is passed along to the
                          credentials helper.
    :return: a ``ContainerServiceClient``.
    :raises Exception: if the subscription id is blank.
    """
    legacy_info = config.get("connectionInfo", {})
    legacy_info_secret = plugin_config.get("connectionInfo", {})

    subscription_id = legacy_info.get('subscriptionId', None)
    if _is_none_or_blank(subscription_id):
        raise Exception('Subscription must be defined')

    credentials = get_credentials_from_connection_info(legacy_info, legacy_info_secret)

    # credit this cluster to Dataiku
    # clusters_client.config.add_user_agent('pid-fd3813c7-273c-5eec-9221-77323f62a148')
    return ContainerServiceClient(credentials, subscription_id)
def with_service_account(self, service_account_type, custom_service_account_name):
    """
    Select the service account used by the cluster nodes.

    Requires the iam.serviceAccountUser IAM permission.

    :param service_account_type: "fromDSSHost" to reuse the service account
        returned by ``get_instance_service_account()``, or "custom" to use
        ``custom_service_account_name``. Any other value leaves the current
        setting untouched.
    :param custom_service_account_name: account used for the "custom" type;
        when blank, the service account is set to an empty string.
    :return: ``self``, to allow chaining.
    """
    if service_account_type == "fromDSSHost":
        logging.info("Custer nodes will inherit the DSS host Service Account")
        self.service_account = get_instance_service_account()
    elif service_account_type == "custom":
        if not _is_none_or_blank(custom_service_account_name):
            logging.info("Cluster nodes will have the custom Service Account: {}".format(custom_service_account_name))
            self.service_account = custom_service_account_name
        else:
            logging.info("Cluster nodes will have the default Compute Engine Service Account")
            self.service_account = ""
    return self
def start(self):
    """
    Create an EKS cluster with eksctl and return what DSS needs to use it.

    In regular mode the eksctl arguments are built from the networking and
    node pool settings; in 'advanced' mode the 'advancedYaml' setting is
    written to a file and passed to eksctl with ``-f``. The kubeconfig is
    written to ``kube_config`` in the current working directory.

    :return: a two-element list: the overrides built by ``make_overrides`` and
             a dict holding the kubeconfig path and the cluster description.
    :raises Exception: if the eksctl create command returns a non-zero code.
    """
    connection_info = self.config.get('connectionInfo', {})
    networking_settings = self.config["networkingSettings"]

    def region_args():
        # The region of the connection wins, then AWS_DEFAULT_REGION.
        # The previous test was "'AWS_DEFAULT_REGION' is os.environ", an
        # identity check that is always False, so the fallback never applied.
        if _has_not_blank_property(connection_info, 'region'):
            return ['--region', connection_info['region']]
        if 'AWS_DEFAULT_REGION' in os.environ:
            return ['--region', os.environ['AWS_DEFAULT_REGION']]
        return []

    args = ['create', 'cluster']
    args = args + ['-v', '4']

    if not self.config.get('advanced'):
        args = args + ['--name', self.cluster_id]
        args = args + region_args()
        args = args + ['--full-ecr-access']

        subnets = networking_settings.get('subnets', [])
        if networking_settings.get('privateNetworking', False):
            args = args + ['--node-private-networking']
            private_subnets = networking_settings.get('privateSubnets', [])
            if len(private_subnets) > 0:
                args = args + ['--vpc-private-subnets', ','.join(private_subnets)]
        if len(subnets) > 0:
            args = args + ['--vpc-public-subnets', ','.join(subnets)]

        security_groups = networking_settings.get('securityGroups', [])
        if len(security_groups) > 0:
            args = args + ['--node-security-groups', ','.join(security_groups)]

        node_pool = self.config.get('nodePool', {})
        if 'machineType' in node_pool:
            args = args + ['--node-type', node_pool['machineType']]
        if 'diskType' in node_pool:
            args = args + ['--node-volume-type', node_pool['diskType']]
        if 'diskSizeGb' in node_pool and node_pool['diskSizeGb'] > 0:
            args = args + ['--node-volume-size', str(node_pool['diskSizeGb'])]

        args = args + ['--nodes', str(node_pool.get('numNodes', 3))]
        if node_pool.get('numNodesAutoscaling', False):
            args = args + ['--asg-access']
            args = args + ['--nodes-min', str(node_pool.get('minNumNodes', 2))]
            args = args + ['--nodes-max', str(node_pool.get('maxNumNodes', 5))]

        k8s_version = self.config.get("k8sVersion", None)
        if not _is_none_or_blank(k8s_version):
            args = args + ['--version', k8s_version.strip()]
    else:
        # advanced mode: the user-provided YAML is the cluster definition
        yaml_dict = yaml.safe_load(self.config.get("advancedYaml"))
        yaml_loc = os.path.join(os.getcwd(), self.cluster_id + '_advanced.yaml')
        with open(yaml_loc, 'w') as outfile:
            yaml.dump(yaml_dict, outfile, default_flow_style=False)
        args = args + ['-f', yaml_loc]

    # we don't add the context to the main config file, to not end up with an oversized config,
    # and because 2 different clusters could be concurrently editing the config file
    kube_config_path = os.path.join(os.getcwd(), 'kube_config')
    args = args + ['--kubeconfig', kube_config_path]

    c = EksctlCommand(args, connection_info)
    if c.run_and_log() != 0:
        raise Exception("Failed to start cluster")

    # prepare the command that fetches the description of the new cluster
    args = ['get', 'cluster']
    args = args + ['--name', self.cluster_id]
    args = args + region_args()
    args = args + ['-o', 'json']

    if _has_not_blank_property(connection_info, 'accessKey') and _has_not_blank_property(connection_info, 'secretKey'):
        creds_in_env = {
            'AWS_ACCESS_KEY_ID': connection_info['accessKey'],
            'AWS_SECRET_ACCESS_KEY': connection_info['secretKey']
        }
        add_authenticator_env(kube_config_path, creds_in_env)

    if not self.config.get('advanced'):
        if node_pool.get('numNodesAutoscaling', False):
            logging.info("Nodegroup is autoscaling, ensuring autoscaler")
            add_autoscaler_if_needed(self.cluster_id, kube_config_path)
    elif self.config.get('clusterAutoScaling'):
        logging.info("Nodegroup is autoscaling, ensuring autoscaler")
        add_autoscaler_if_needed(self.cluster_id, kube_config_path)

    c = EksctlCommand(args, connection_info)
    cluster_info = json.loads(c.run_and_get_output())[0]

    with open(kube_config_path, "r") as f:
        kube_config = yaml.safe_load(f)

    # collect and prepare the overrides so that DSS can know where and how to use the cluster
    overrides = make_overrides(self.config, kube_config, kube_config_path)

    return [
        overrides,
        {
            'kube_config_path': kube_config_path,
            'cluster': cluster_info
        }
    ]
def stop(self, data):
    """
    Delete the AKS cluster described by ``data`` and detach its role assignments.

    Before deleting, the ACR and Vnet role assignments recorded in ``data``
    are removed when present; a missing assignment is logged and ignored.

    :param data: dict read for ``data["cluster"]`` ('id',
                 'node_resource_group', 'identity_profile') and for the
                 optional 'acr_attachment' and 'vnet_attachment' entries.
    """
    credentials, _, _ = self._get_credentials()

    # Do NOT use the conf but the actual values from the cluster here
    cluster_resource_id = data["cluster"]["id"]
    # the unpacking expects a resource id made of exactly 9 '/'-separated segments
    _, _, subscription_id, _, resource_group, _, _, _, cluster_name = cluster_resource_id.split("/")
    clusters_client = ContainerServiceClient(credentials, subscription_id)

    # Try to detach from ACR if required. It is not mandatory but if not done, it would pollute
    # the ACR with multiple invalid role attachments and consume attachment quotas
    node_resource_group = data["cluster"]["node_resource_group"]
    acr_attachment = data.get("acr_attachment", None)
    if not _is_none_or_blank(acr_attachment):
        logging.info("Cluster has an ACR attachment, check managed identity")
        cluster_identity_profile = data["cluster"]["identity_profile"]
        kubelet_mi_resource_id = cluster_identity_profile["kubeletidentity"].get("resource_id", None)
        if kubelet_mi_resource_id is not None:
            # only the resource group of the kubelet identity is needed
            _, _, _, _, mi_resource_group, _, _, _, _ = kubelet_mi_resource_id.split("/")
            if mi_resource_group == node_resource_group:
                logging.info("Cluster has an AKS managed kubelet identity, try to detach")
                authorization_client = AuthorizationManagementClient(
                    credentials, acr_attachment["subscription_id"])
                try:
                    authorization_client.role_assignments.delete_by_id(
                        acr_attachment["role_assignment"]["id"])
                except ResourceNotFoundError:
                    # logging.warn is a deprecated alias of logging.warning
                    logging.warning("It looks that the ACR role assignment doesnt exist. Ignore this step")

    # Detach Vnet like ACR
    vnet_attachment = data.get("vnet_attachment", None)
    if not _is_none_or_blank(vnet_attachment):
        logging.info("Cluster has an Vnet attachment, check managed identity")
        if "role_assignment" in vnet_attachment:
            logging.info("Cluster has an AKS managed kubelet identity, try to detach")
            authorization_client = AuthorizationManagementClient(
                credentials, vnet_attachment["subscription_id"])
            try:
                authorization_client.role_assignments.delete_by_id(
                    vnet_attachment["role_assignment"]["id"])
            except ResourceNotFoundError:
                logging.warning("It looks that the Vnet role assignment doesnt exist. Ignore this step")

    def do_delete():
        future = clusters_client.managed_clusters.begin_delete(resource_group, cluster_name)
        return future.result()

    run_and_process_cloud_error(do_delete)

    # delete returns void, so we poll until the cluster is really gone
    gone = False
    while not gone:
        time.sleep(5)
        try:
            cluster = clusters_client.managed_clusters.get(resource_group, cluster_name)
            if cluster.provisioning_state.lower() != 'deleting':
                # NOTE(review): this only logs; polling goes on until get() raises
                logging.info("Cluster is not deleting anymore, must be deleted now (state = %s)" % cluster.provisioning_state)
        # other exceptions should not be ignored
        except ResourceNotFoundError:
            logging.info("Cluster doesn't seem to exist anymore, considering it deleted")
            gone = True
def start(self): """ Build the create cluster request. """ credentials, subscription_id, managed_identity_id = self._get_credentials( ) # Fetch metadata about the instance metadata = get_instance_metadata() # Resource group resource_group = self.config.get('resourceGroup', None) dss_host_resource_group = metadata["compute"]["resourceGroupName"] if _is_none_or_blank(resource_group): resource_group = dss_host_resource_group logging.info( "Using same resource group as DSS: {}".format(resource_group)) # Location location = self.config.get('location', None) if _is_none_or_blank(location): location = metadata["compute"]["location"] logging.info("Using same location as DSS: {}".format(location)) # Consistency checks if _is_none_or_blank(resource_group): raise Exception( "A resource group to put the cluster in is required") if _is_none_or_blank(location): raise Exception("A location to put the cluster in is required") # AKS Client clusters_client = None # Credit the cluster to DATAIKU if os.environ.get("DISABLE_AZURE_USAGE_ATTRIBUTION", "0") == "1": logging.info("Azure usage attribution is disabled") clusters_client = ContainerServiceClient(credentials, subscription_id) else: policy = UserAgentPolicy() policy.add_user_agent('pid-fd3813c7-273c-5eec-9221-77323f62a148') clusters_client = ContainerServiceClient(credentials, subscription_id, user_agent_policy=policy) # check that the cluster doesn't exist yet, otherwise azure will try to update it # and will almost always fail try: existing = clusters_client.managed_clusters.get( resource_group, self.cluster_name) if existing is not None: raise Exception( "A cluster with name %s in resource group %s already exists" % (self.cluster_name, resource_group)) except CloudError as e: logging.info("Cluster doesn't seem to exist yet") except ResourceNotFoundError as e: logging.info("Cluster doesn't seem to exist yet") cluster_builder = ClusterBuilder(clusters_client) cluster_builder.with_name(self.cluster_name) 
cluster_builder.with_dns_prefix("{}-dns".format(self.cluster_name)) cluster_builder.with_resource_group(resource_group) cluster_builder.with_location(location) cluster_builder.add_tags(self.config.get("tags", None)) cluster_builder.with_linux_profile() # default is None cluster_builder.with_network_profile( service_cidr=self.config.get("serviceCIDR", None), dns_service_ip=self.config.get("dnsServiceIP", None), load_balancer_sku=self.config.get("loadBalancerSku", None), outbound_type=self.config.get("outboundType", None), network_plugin=self.config.get("networkPlugin"), docker_bridge_cidr=self.config.get("dockerBridgeCidr")) if self.config.get("useCustomNodeResourceGroup", False): cluster_builder.with_node_resource_group( self.config.get("nodeResourceGroup")) # Cluster identity connection_info = self.config.get("connectionInfo", None) cluster_idendity_legacy_use_distinct_sp = self.config.get( "useDistinctSPForCluster", False) cluster_idendity_legacy_sp = self.config.get("clusterServicePrincipal", None) cluster_identity_type = None cluster_identity = None if not _is_none_or_blank( connection_info) or cluster_idendity_legacy_use_distinct_sp: logging.warn( "Using legacy options to configure cluster identity. Clear them to use the new ones." 
) if not cluster_idendity_legacy_use_distinct_sp and not _is_none_or_blank( connection_info): cluster_sp = connection_info elif cluster_idendity_legacy_use_distinct_sp and not _is_none_or_blank( cluster_idendity_legacy_sp): cluster_sp = self.config.get("clusterServicePrincipal") else: raise Exception( "Legacy options are not complete enough to determine cluster identity settings" ) cluster_builder.with_cluster_sp_legacy( cluster_service_principal_connection_info=cluster_sp) else: cluster_identity = self.config.get( "clusterIdentity", {"identityType": "managed-identity"}) cluster_identity_type = cluster_identity.get( "identityType", "managed-identity") if cluster_identity_type == "managed-identity": if cluster_identity.get("inheritDSSIdentity", True): logging.info( "Need to inspect Managed Identity infos from Azure") if metadata is None: metadata = get_instance_metadata() vm_resource_group = metadata["compute"][ "resourceGroupName"] vm_name = metadata["compute"]["name"] compute_client = ComputeManagementClient( credentials, subscription_id) vm = compute_client.virtual_machines.get( vm_resource_group, vm_name) # No choice here but to use the first one if managed_identity_id is None: managed_identity_id = next( iter(vm.identity.user_assigned_identities.keys())) for managed_identity_resource_id, managed_identity_properties in vm.identity.user_assigned_identities.items( ): if managed_identity_id == managed_identity_resource_id or managed_identity_id == managed_identity_properties.client_id: break logging.info("Found managed identity id {}".format( managed_identity_resource_id)) cluster_builder.with_managed_identity( managed_identity_resource_id) cluster_builder.with_kubelet_identity( managed_identity_resource_id, managed_identity_properties.client_id, managed_identity_properties.principal_id) else: control_plane_mi = None if cluster_identity.get( "useAKSManagedIdentity", True ) else cluster_identity["controlPlaneUserAssignedIdentity"] 
cluster_builder.with_managed_identity(control_plane_mi) if control_plane_mi is None: logging.info( "Configure cluster with system managed identity.") else: logging.info( "Configure cluster with user assigned identity: {}" .format(control_plane_mi)) if not cluster_identity.get("useAKSManagedKubeletIdentity", True): kubelet_mi = cluster_identity[ "kubeletUserAssignedIdentity"] _, _, mi_subscription_id, _, mi_resource_group, _, _, _, mi_name = kubelet_mi.split( "/") msiclient = ManagedServiceIdentityClient( AzureIdentityCredentialAdapter(credentials), mi_subscription_id) mi = msiclient.user_assigned_identities.get( mi_resource_group, mi_name) cluster_builder.with_kubelet_identity( kubelet_mi, mi.client_id, mi.principal_id) logging.info( "Configure kubelet identity with user assigned identity resourceId=\"{}\", clientId=\"{}\", objectId=\"{}\"" .format(kubelet_mi, mi.client_id, mi.principal_id)) elif cluster_identity_type == "service-principal": cluster_builder.with_cluster_sp(cluster_identity["clientId"], cluster_identity["password"]) logging.info("Configure cluster with service principal") else: raise Exception( "Cluster identity type \"{}\" is unknown".format( cluster_identity_type)) # Fail fast for non existing ACRs to avoid drama in case of failure AFTER cluster is created acr_role_id = None authorization_client = None if cluster_identity_type is not None and cluster_identity is not None: if cluster_identity_type == "managed-identity" and cluster_identity.get( "useAKSManagedKubeletIdentity", True) and not cluster_identity.get("inheritDSSIdentity", True): acr_name = cluster_identity.get("attachToACRName", None) if not _is_none_or_blank(acr_name): # build acr scope acr_identifier_splitted = acr_name.split('/') acr_subscription_id = subscription_id acr_resource_group = resource_group if 9 == len(acr_identifier_splitted): _, _, acr_subscription_id, _, acr_resource_group, _, _, _, acr_name = acr_identifier_splitted elif 2 == len(acr_identifier_splitted): 
acr_resource_group, acr_name = acr_identifier_splitted authorization_client = AuthorizationManagementClient( credentials, acr_subscription_id) acr_scope = "/subscriptions/{acr_subscription_id}/resourceGroups/{acr_resource_group}/providers/Microsoft.ContainerRegistry/registries/{acr_name}".format( **locals()) try: acr_roles = list( authorization_client.role_definitions.list( acr_scope, "roleName eq 'AcrPull'")) except ResourceNotFoundError as e: raise Exception( "ACR {} not found. Check it exists and you are Owner of it." .format(acr_scope)) if 0 == len(acr_roles): raise Exception( "Could not find the AcrPull role on the ACR {}. Check you are Owner of it." .format(acr_scope)) else: acr_role_id = acr_roles[0].id logging.info("ACR pull role id: %s", acr_role_id) # Try to run a fake role assignment. Depending on the failure type we know if we are Owner or not try: fake_role_assignment = authorization_client.role_assignments.create( scope=acr_scope, role_assignment_name=str(uuid.uuid4()), parameters={ "properties": { "role_definition_id": acr_role_id, "principal_id": "00000000-0000-0000-0000-000000000000", }, }, ) except HttpResponseError as e: if e.reason == "Forbidden" and "AuthorizationFailed" in str( e.error): raise Exception( "Cannot create role assignments on ACR {}. Check that your are Owner of it or provide an existing Kubelet identity." .format(acr_scope)) elif e.reason == "Bad Request" and "PrincipalNotFound" in str( e.error): logging.info( "Fake role assignment on ACR looks ok. Identity should be allowed to assign roles in further steps." 
) else: raise (e) except Exception as e: raise (e) # Sanity check for node pools node_pool_vnets = set() for idx, node_pool_conf in enumerate(self.config.get("nodePools", [])): node_pool_builder = cluster_builder.get_node_pool_builder() nodepool_vnet = node_pool_conf.get("vnet", None) nodepool_subnet = node_pool_conf.get("subnet", None) vnet, _ = node_pool_builder.resolve_network( inherit_from_host=node_pool_conf.get( "useSameNetworkAsDSSHost"), cluster_vnet=nodepool_vnet, cluster_subnet=nodepool_subnet, connection_info=connection_info, credentials=credentials, resource_group=resource_group, dss_host_resource_group=dss_host_resource_group) node_pool_vnets.add(vnet) if 1 < len(node_pool_vnets): raise Exception( "Node pools must all share the same vnet. Current node pools configuration yields vnets {}." .format(",".join(node_pool_vnets))) elif 0 == len(node_pool_vnets): raise Exception( "You cannot deploy a cluster without any node pool.") # Check role assignments for vnet like on ACR for fail fast if not doable vnet_id = node_pool_vnets.pop() if not vnet_id.startswith("/"): vnet_name = vnet_id vnet_id = "/subscriptions/{subscription_id}/resourceGroups/{resource_group}/providers/Microsoft.Network/virtualNetworks/{vnet_name}".format( **locals()) vnet_role_id = None if cluster_identity_type is not None and cluster_identity is not None: if cluster_identity_type == "managed-identity" and cluster_identity.get( "useAKSManagedIdentity", True) and not cluster_identity.get("inheritDSSIdentity", True): authorization_client = AuthorizationManagementClient( credentials, subscription_id) try: vnet_roles = list( authorization_client.role_definitions.list( vnet_id, "roleName eq 'Contributor'")) except ResourceNotFoundError as e: raise Exception( "Vnet {} not found. Check it exists and you are Owner of it." .format(vnet_id)) if 0 == len(acr_roles): raise Exception( "Could not find the Contributor role on the vnet {}. Check you are Owner of it." 
.format(vnet_id)) else: vnet_role_id = vnet_roles[0].id logging.info("Vnet contributor role id: %s", acr_role_id) # Try to run a fake role assignment. Depending on the failure type we know if we are Owner or not try: fake_role_assignment = authorization_client.role_assignments.create( scope=vnet_id, role_assignment_name=str(uuid.uuid4()), parameters={ "properties": { "role_definition_id": vnet_role_id, "principal_id": "00000000-0000-0000-0000-000000000000", }, }, ) except HttpResponseError as e: if e.reason == "Forbidden" and "AuthorizationFailed" in str( e.error): raise Exception( "Cannot create role assignments on Vnet {}. Check that your are Owner of it or provide an existing Controle Plane identity." .format(vnet_id)) elif e.reason == "Bad Request" and "PrincipalNotFound" in str( e.error): logging.info( "Fake role assignment on Vnet looks ok. Identity should be allowed to assign roles in further steps." ) else: raise (e) except Exception as e: raise (e) # Access level if self.config.get("privateAccess"): cluster_builder.with_private_access( self.config.get("privateAccess")) cluster_builder.with_cluster_version( self.config.get("clusterVersion", None)) # Node pools for idx, node_pool_conf in enumerate(self.config.get("nodePools", [])): node_pool_builder = cluster_builder.get_node_pool_builder() node_pool_builder.with_idx(idx) node_pool_builder.with_vm_size(node_pool_conf.get("vmSize", None)) vnet = node_pool_conf.get("vnet", None) subnet = node_pool_conf.get("subnet", None) node_pool_builder.with_network( inherit_from_host=node_pool_conf.get( "useSameNetworkAsDSSHost"), cluster_vnet=vnet, cluster_subnet=subnet, connection_info=connection_info, credentials=credentials, resource_group=resource_group, dss_host_resource_group=dss_host_resource_group) node_pool_builder.with_availability_zones( use_availability_zones=node_pool_conf.get( "useAvailabilityZones", True)) node_pool_builder.with_node_count( enable_autoscaling=node_pool_conf.get("autoScaling", False), 
num_nodes=node_pool_conf.get("numNodes", None), min_num_nodes=node_pool_conf.get("minNumNodes", None), max_num_nodes=node_pool_conf.get("maxNumNodes", None)) node_pool_builder.with_mode( mode=node_pool_conf.get("mode", "Automatic"), system_pods_only=node_pool_conf.get("systemPodsOnly", True)) node_pool_builder.with_disk_size_gb( disk_size_gb=node_pool_conf.get("osDiskSizeGb", 0)) node_pool_builder.with_node_labels( node_pool_conf.get("labels", None)) node_pool_builder.with_node_taints( node_pool_conf.get("taints", None)) node_pool_builder.add_tags(self.config.get("tags", None)) node_pool_builder.add_tags(node_pool_conf.get("tags", None)) node_pool_builder.build() cluster_builder.with_node_pool( node_pool=node_pool_builder.agent_pool_profile) # Run creation logging.info("Start creation of cluster") def do_creation(): cluster_create_op = cluster_builder.build() return cluster_create_op.result() create_result = run_and_process_cloud_error(do_creation) logging.info("Cluster creation finished") # Attach to ACR acr_attachment = {} if cluster_identity_type is not None and cluster_identity is not None: if cluster_identity_type == "managed-identity" and cluster_identity.get( "useAKSManagedKubeletIdentity", True) and not cluster_identity.get("inheritDSSIdentity", True): kubelet_mi_object_id = create_result.identity_profile.get( "kubeletidentity").object_id logging.info("Kubelet Managed Identity object id: %s", kubelet_mi_object_id) if not _is_none_or_blank(acr_role_id): logging.info("Assign ACR pull role id %s to %s", acr_role_id, kubelet_mi_object_id) role_assignment = authorization_client.role_assignments.create( scope=acr_scope, role_assignment_name=str(uuid.uuid4()), parameters={ "properties": { "role_definition_id": acr_role_id, "principal_id": kubelet_mi_object_id, }, }, ) acr_attachment.update({ "name": acr_name, "resource_group": acr_resource_group, "subscription_id": acr_subscription_id, "resource_id": acr_scope, "role_assignment": role_assignment.as_dict(), }) # 
Attach to VNET to allow LoadBalancers creation vnet_attachment = {} if cluster_identity_type is not None and cluster_identity is not None: if cluster_identity_type == "managed-identity" and cluster_identity.get( "useAKSManagedIdentity", True) and not cluster_identity.get("inheritDSSIdentity", True): # And here we are blocked because we cant get the principal id of a System Assigned Managed Id easily control_plane_object_id = create_result.identity.principal_id logging.info("Controle Plane Managed Identity object id: %s", control_plane_object_id) if not _is_none_or_blank(vnet_role_id): logging.info("Assign Vnet contributolr role id %s to %s", vnet_role_id, control_plane_object_id) vnet_role_assignment = authorization_client.role_assignments.create( scope=vnet_id, role_assignment_name=str(uuid.uuid4()), parameters={ "properties": { "role_definition_id": vnet_role_id, "principal_id": control_plane_object_id, }, }, ) vnet_attachment.update({ "subscription_id": subscription_id, "resource_id": vnet_id, "role_assignment": vnet_role_assignment.as_dict(), }) logging.info("Fetching kubeconfig for cluster {} in {}...".format( self.cluster_name, resource_group)) def do_fetch(): return clusters_client.managed_clusters.list_cluster_admin_credentials( resource_group, self.cluster_name) get_credentials_result = run_and_process_cloud_error(do_fetch) kube_config_content = get_credentials_result.kubeconfigs[ 0].value.decode("utf8") logging.info("Writing kubeconfig file...") kube_config_path = os.path.join(os.getcwd(), "kube_config") with open(kube_config_path, 'w') as f: f.write(kube_config_content) overrides = make_overrides( self.config, yaml.safe_load(kube_config_content), kube_config_path, acr_name=None if _is_none_or_blank(acr_attachment) else acr_attachment["name"], ) return [ overrides, { "kube_config_path": kube_config_path, "cluster": create_result.as_dict(), "acr_attachment": acr_attachment, "vnet_attachment": vnet_attachment } ]
def start(self):
    """
    Create the managed cluster described by the configuration and return
    the information needed to connect to it.

    Steps, in order: build credentials and a ContainerServiceClient from the
    connection info, check that a resource group and a location are set,
    refuse to continue if a cluster with the same name already exists,
    configure a ClusterBuilder (network profile, service principal, version,
    node pools), run the creation, then fetch the admin kubeconfig and write
    it to a "kube_config" file in the current working directory.

    :returns: a two-element list: the overrides computed by make_overrides,
              and a dict holding "kube_config_path" and "cluster" (the
              creation result as a dict).
    :raises Exception: if the resource group or the location is missing, or
                       if a cluster with this name already exists.
    """
    config = self.config
    connection_info = config.get("connectionInfo", {})
    credentials = get_credentials_from_connection_info(
        connection_info, self.plugin_config.get("connectionInfo", {}))
    clusters_client = ContainerServiceClient(
        credentials, connection_info.get('subscriptionId', None))

    # Usage attribution is on unless explicitly disabled through the environment
    if os.environ.get("DISABLE_AZURE_USAGE_ATTRIBUTION", "0") == "1":
        logging.info("Azure usage attribution is disabled")
    else:
        clusters_client.config.add_user_agent(
            'pid-fd3813c7-273c-5eec-9221-77323f62a148')

    # TODO: auto-detect the resource group and the location when they are not configured
    resource_group = config.get('resourceGroup', None)
    if _is_none_or_blank(resource_group):
        raise Exception("A resource group to put the cluster in is required")
    location = config.get('location', None)
    if _is_none_or_blank(location):
        raise Exception("A location to put the cluster in is required")

    # Creating over an existing cluster would make Azure attempt an update,
    # which almost always fails, so bail out early in that case
    try:
        existing = clusters_client.managed_clusters.get(
            resource_group, self.cluster_name)
    except CloudError:
        logging.info("Cluster doesn't seem to exist yet")
    else:
        if existing is not None:
            raise Exception(
                "A cluster with name %s in resource group %s already exists"
                % (self.cluster_name, resource_group))

    cluster_builder = ClusterBuilder(clusters_client)
    cluster_builder.with_name(self.cluster_name)
    cluster_builder.with_dns_prefix("{}-dns".format(self.cluster_name))
    cluster_builder.with_resource_group(resource_group)
    cluster_builder.with_location(location)
    cluster_builder.with_linux_profile()  # default is None
    cluster_builder.with_network_profile(
        service_cidr=config.get("serviceCIDR", None),
        dns_service_ip=config.get("dnsServiceIP", None),
        load_balancer_sku=config.get("loadBalancerSku", None))

    # The cluster runs either with its own service principal or with the
    # one of the connection
    cluster_sp = (config.get("clusterServicePrincipal")
                  if config.get("useDistinctSPForCluster", False)
                  else connection_info)
    cluster_builder.with_cluster_sp(
        cluster_service_principal_connection_info=cluster_sp)

    cluster_builder.with_cluster_version(config.get("clusterVersion", None))

    for pool_idx, pool_conf in enumerate(config.get("nodePools", [])):
        pool_builder = cluster_builder.get_node_pool_builder()
        pool_builder.with_idx(pool_idx)
        pool_builder.with_vm_size(pool_conf.get("vmSize", None))
        pool_builder.with_network(
            inherit_from_host=pool_conf.get("useSameNetworkAsDSSHost"),
            cluster_vnet=pool_conf.get("vnet", None),
            cluster_subnet=pool_conf.get("subnet", None),
            connection_info=connection_info,
            credentials=credentials,
            resource_group=resource_group)
        pool_builder.with_node_count(
            enable_autoscaling=pool_conf.get("autoScaling", False),
            num_nodes=pool_conf.get("numNodes", None),
            min_num_nodes=pool_conf.get("minNumNodes", None),
            max_num_nodes=pool_conf.get("maxNumNodes", None))
        pool_builder.with_disk_size_gb(
            disk_size_gb=pool_conf.get("osDiskSizeGb", 0))
        pool_builder.build()
        cluster_builder.with_node_pool(
            node_pool=pool_builder.agent_pool_profile)

    def do_creation():
        # build() hands back an operation; result() yields its outcome
        return cluster_builder.build().result()

    create_result = run_and_process_cloud_error(do_creation)

    logging.info("Fetching kubeconfig for cluster {} in {}...".format(
        self.cluster_name, resource_group))

    def do_fetch():
        return clusters_client.managed_clusters.list_cluster_admin_credentials(
            resource_group, self.cluster_name)

    admin_credentials = run_and_process_cloud_error(do_fetch)
    # Only the first kubeconfig of the answer is used
    kube_config_content = admin_credentials.kubeconfigs[0].value.decode("utf8")

    logging.info("Writing kubeconfig file...")
    kube_config_path = os.path.join(os.getcwd(), "kube_config")
    with open(kube_config_path, 'w') as f:
        f.write(kube_config_content)

    overrides = make_overrides(
        config, yaml.safe_load(kube_config_content), kube_config_path)

    cluster_details = {
        "kube_config_path": kube_config_path,
        "cluster": create_result.as_dict()
    }
    return [overrides, cluster_details]
def build(self):
    """
    Assemble the cluster creation request from the builder settings, send
    it, and wrap the response in an Operation.

    The request body carries the name (auto-generated when blank), version,
    network, subnetwork, labels and node pools, plus optional sections:
    an IP allocation policy when the cluster is VPC-native, legacy ABAC
    when legacy auth is requested, client certificate issuance for
    versions >= 1.12 (or a version alias), and the HTTP load balancing
    addon state. When a settings valve is set, it is parsed as JSON and
    merged over the "cluster" section via _merge_objects.

    :returns: an Operation built from the creation response.
    :raises Exception: if executing the creation request raises an HttpError.
    """
    cluster_name = self.name
    if _is_none_or_blank(cluster_name):
        cluster_name = self._auto_name()
    cluster_version = self.version

    cluster_body = {
        "name": cluster_name,
        "initialClusterVersion": cluster_version,
        "description": "Created from plugin",
        "network": self.network,
        "subnetwork": self.subnetwork,
        "resourceLabels": self.labels,
        "nodePools": []
    }
    create_cluster_request_body = {
        "cluster": cluster_body,
        "parent": self.clusters.get_location()
    }

    if self.is_vpc_native:
        cluster_body["ipAllocationPolicy"] = {
            "createSubnetwork": False,
            "useIpAliases": True,
            "servicesIpv4CidrBlock": self.svc_ip_range,
            "clusterIpv4CidrBlock": self.pod_ip_range,
        }
    if self.legacy_auth:
        cluster_body["legacyAbac"] = {"enabled": True}

    # A blank version is handled like the "latest" and "-" aliases instead of
    # being parsed as a dotted number, which used to crash on None or "".
    # NOTE(review): assumes a blank version resolves to a release >= 1.12,
    # as the "-" alias was already assumed to -- confirm against the GKE API.
    if _is_none_or_blank(cluster_version) or cluster_version in ("latest", "-"):
        need_issue_certificate = True
    else:
        version_chunks = cluster_version.split('.')
        major_version = int(version_chunks[0])
        minor_version = int(version_chunks[1])
        need_issue_certificate = (major_version, minor_version) >= (1, 12)
    if need_issue_certificate:
        cluster_body["masterAuth"] = {
            "clientCertificateConfig": {
                "issueClientCertificate": True
            }
        }

    # The addon is flagged as disabled exactly when load balancing is not wanted
    cluster_body["addonsConfig"] = {
        "httpLoadBalancing": {
            "disabled": not self.http_load_balancing
        }
    }

    cluster_body["nodePools"].extend(self.node_pools)

    if not _is_none_or_blank(self.settings_valve):
        valve = json.loads(self.settings_valve)
        # The merged object replaces the "cluster" section as a whole
        create_cluster_request_body["cluster"] = _merge_objects(
            cluster_body, valve)

    logging.info("Requesting cluster %s" %
                 json.dumps(create_cluster_request_body, indent=2))

    location_params = self.clusters.get_location_params()
    request = self.clusters.get_clusters_api().create(
        body=create_cluster_request_body, **location_params)

    try:
        response = request.execute()
        return Operation(response, self.clusters.get_operations_api(),
                         self.clusters.get_location_params())
    except HttpError as e:
        raise Exception("Failed to create cluster : %s" % str(e))