def test_retrieve_cluster_template(self, mock_cluster_template_get_by_uuid):
    expected_context = 'context'
    expected_cluster_template_uuid = 'ClusterTemplate_uuid'

    cluster = objects.Cluster({})
    cluster.cluster_template_id = expected_cluster_template_uuid

    utils.retrieve_cluster_template(expected_context, cluster)

    mock_cluster_template_get_by_uuid.assert_called_once_with(
        expected_context, expected_cluster_template_uuid)
def test_retrieve_cluster_template(self, mock_cluster_template_get_by_uuid):
    expected_context = 'context'
    expected_cluster_template_uuid = 'ClusterTemplate_uuid'

    bay = objects.Bay({})
    bay.baymodel_id = expected_cluster_template_uuid

    utils.retrieve_cluster_template(expected_context, bay)

    mock_cluster_template_get_by_uuid.assert_called_once_with(
        expected_context, expected_cluster_template_uuid)
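# For reference, a minimal sketch of the helper these tests exercise,
# assuming the patched target is objects.ClusterTemplate.get_by_uuid and that
# the helper simply resolves the template UUID stored on the cluster/bay
# object; the real Magnum implementation may differ:
def retrieve_cluster_template(context, cluster):
    # Newer Cluster objects carry cluster_template_id; older Bay objects
    # used baymodel_id (hypothetical fallback, for illustration only).
    template_uuid = (getattr(cluster, 'cluster_template_id', None) or
                     getattr(cluster, 'baymodel_id', None))
    return objects.ClusterTemplate.get_by_uuid(context, template_uuid)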
@contextlib.contextmanager
def docker_for_cluster(context, cluster):
    cluster_template = conductor_utils.retrieve_cluster_template(
        context, cluster)

    ca_cert, magnum_key, magnum_cert = None, None, None
    client_kwargs = dict()
    if not cluster_template.tls_disabled:
        (ca_cert, magnum_key,
         magnum_cert) = cert_manager.create_client_files(cluster)
        client_kwargs['ca_cert'] = ca_cert.name
        client_kwargs['client_key'] = magnum_key.name
        client_kwargs['client_cert'] = magnum_cert.name

    yield DockerHTTPClient(
        cluster.api_address,
        CONF.docker.docker_remote_api_version,
        CONF.docker.default_timeout,
        **client_kwargs
    )

    if ca_cert:
        ca_cert.close()
    if magnum_key:
        magnum_key.close()
    if magnum_cert:
        magnum_cert.close()
@contextlib.contextmanager
def docker_for_cluster(context, cluster):
    cluster_template = conductor_utils.retrieve_cluster_template(
        context, cluster)

    ca_cert, magnum_key, magnum_cert = None, None, None
    client_kwargs = dict()
    if not cluster_template.tls_disabled:
        (ca_cert, magnum_key,
         magnum_cert) = cert_manager.create_client_files(cluster, context)
        client_kwargs['ca_cert'] = ca_cert.name
        client_kwargs['client_key'] = magnum_key.name
        client_kwargs['client_cert'] = magnum_cert.name

    yield DockerHTTPClient(
        cluster.api_address,
        CONF.docker.docker_remote_api_version,
        CONF.docker.default_timeout,
        **client_kwargs
    )

    if ca_cert:
        ca_cert.close()
    if magnum_key:
        magnum_key.close()
    if magnum_cert:
        magnum_cert.close()
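# A hedged usage sketch: because docker_for_cluster is a generator-based
# context manager, callers would typically consume it with a `with` block so
# the TLS client-certificate temp files are closed on exit. The helper name
# and the docker-py call below are illustrative, not from the source:
def ping_cluster_docker(context, cluster):
    with docker_for_cluster(context, cluster) as docker:
        # docker-py APIClient call, shown purely for illustration.
        return docker.containers(all=True)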
def __init__(self, openstack_client, context, cluster, cluster_driver):
    self.openstack_client = openstack_client
    self.context = context
    self.cluster = cluster
    self.cluster_template = conductor_utils.retrieve_cluster_template(
        self.context, cluster)
    self.template_def = cluster_driver.get_template_definition()
def rotate_ca_certificate(self, context, cluster):
    cluster_template = conductor_utils.retrieve_cluster_template(context,
                                                                 cluster)
    if cluster_template.cluster_distro not in ["fedora-coreos"]:
        raise exception.NotSupported("Rotating the CA certificate is "
                                     "not supported for cluster with "
                                     "cluster_distro: %s." %
                                     cluster_template.cluster_distro)

    osc = clients.OpenStackClients(context)
    rollback = True
    heat_params = {}

    csr_keys = x509.generate_csr_and_key(u"Kubernetes Service Account")

    heat_params['kube_service_account_key'] = \
        csr_keys["public_key"].replace("\n", "\\n")
    heat_params['kube_service_account_private_key'] = \
        csr_keys["private_key"].replace("\n", "\\n")

    fields = {
        'existing': True,
        'parameters': heat_params,
        'disable_rollback': not rollback
    }
    osc.heat().stacks.update(cluster.stack_id, **fields)
def cluster_update(self, context, cluster, node_count, rollback=False):
    LOG.debug('cluster_heat cluster_update')

    osc = clients.OpenStackClients(context)
    allow_update_status = (fields.ClusterStatus.CREATE_COMPLETE,
                           fields.ClusterStatus.UPDATE_COMPLETE,
                           fields.ClusterStatus.RESUME_COMPLETE,
                           fields.ClusterStatus.RESTORE_COMPLETE,
                           fields.ClusterStatus.ROLLBACK_COMPLETE,
                           fields.ClusterStatus.SNAPSHOT_COMPLETE,
                           fields.ClusterStatus.CHECK_COMPLETE,
                           fields.ClusterStatus.ADOPT_COMPLETE)
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Updating a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    # Updates will only be reflected in the default worker nodegroup.
    worker_ng = cluster.default_ng_worker
    if worker_ng.node_count == node_count:
        return

    # Back up the old node count so that we can restore it
    # in case of an exception.
    old_node_count = worker_ng.node_count

    manager = scale_manager.get_scale_manager(context, osc, cluster)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)
    # Update cluster
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)
        worker_ng.node_count = node_count
        worker_ng.save()
        cluster_driver.update_cluster(context, cluster, manager, rollback)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        # Restore the node_count
        worker_ng.node_count = old_node_count
        worker_ng.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.save()
    return cluster
def _extract_template_definition(self, context, cluster,
                                 scale_manager=None):
    cluster_template = conductor_utils.retrieve_cluster_template(context,
                                                                 cluster)
    definition = self.get_template_definition()
    return definition.extract_definition(context, cluster_template,
                                         cluster,
                                         scale_manager=scale_manager)
def cluster_upgrade(self, context, cluster, cluster_template,
                    max_batch_size, nodegroup, rollback=False):
    LOG.debug('cluster_conductor cluster_upgrade')

    # osc = clients.OpenStackClients(context)
    allow_update_status = (fields.ClusterStatus.CREATE_COMPLETE,
                           fields.ClusterStatus.UPDATE_COMPLETE,
                           fields.ClusterStatus.RESUME_COMPLETE,
                           fields.ClusterStatus.RESTORE_COMPLETE,
                           fields.ClusterStatus.ROLLBACK_COMPLETE,
                           fields.ClusterStatus.SNAPSHOT_COMPLETE,
                           fields.ClusterStatus.CHECK_COMPLETE,
                           fields.ClusterStatus.ADOPT_COMPLETE)
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE,
            cluster)
        operation = _('Upgrading a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)
    # Upgrade cluster
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING,
            cluster)
        cluster_driver.upgrade_cluster(context, cluster, cluster_template,
                                       max_batch_size, nodegroup, rollback)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        nodegroup.status = fields.ClusterStatus.UPDATE_FAILED
        nodegroup.status_reason = six.text_type(e)
        nodegroup.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE,
            cluster)
        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    nodegroup.save()
    cluster.save()
    return cluster
def _extract_template_definition(context, cluster, scale_manager=None):
    cluster_template = conductor_utils.retrieve_cluster_template(context,
                                                                 cluster)
    cluster_driver = Driver().get_driver(cluster_template.server_type,
                                         cluster_template.cluster_distro,
                                         cluster_template.coe)
    definition = cluster_driver.get_template_definition()
    return definition.extract_definition(context, cluster_template,
                                         cluster,
                                         scale_manager=scale_manager)
def __init__(self, openstack_client, bay):
    self.openstack_client = openstack_client
    self.context = self.openstack_client.context
    self.bay = bay
    self.attempts = 0
    self.cluster_template = conductor_utils.retrieve_cluster_template(
        self.context, bay)
    self.template_def = TDef.get_template_definition(
        self.cluster_template.server_type,
        self.cluster_template.cluster_distro,
        self.cluster_template.coe)
def _extract_template_definition(context, bay, scale_manager=None):
    cluster_template = conductor_utils.retrieve_cluster_template(context,
                                                                 bay)
    cluster_distro = cluster_template.cluster_distro
    cluster_coe = cluster_template.coe
    cluster_server_type = cluster_template.server_type
    definition = TDef.get_template_definition(cluster_server_type,
                                              cluster_distro,
                                              cluster_coe)
    return definition.extract_definition(context, cluster_template, bay,
                                         scale_manager=scale_manager)
def _install_addons(self, cluster, cluster_kubectl, context):
    """Install add-on services: Calico, kube-proxy, CoreDNS."""
    LOG.info("Starting to install add-ons for cluster %s", cluster.uuid)

    # Add an "initializing" tag for the new cluster.
    tag_template = self.jinja_env.get_template('addon_tag.yaml.j2')
    tag_body = tag_template.render(
        {'namespace': cluster.uuid, 'status': 'initializing'}
    )
    self.kubectl.apply(definition=tag_body)

    cluster_template = conductor_utils.retrieve_cluster_template(
        context, cluster
    )
    if cluster_template.network_driver == 'flannel':
        cluster_pod_ip_range = cluster.labels.get(
            'flannel_network_cidr', '10.100.0.0/16'
        )
    elif cluster_template.network_driver == 'calico':
        cluster_pod_ip_range = cluster.labels.get(
            'calico_ipv4pool', '192.168.0.0/16'
        )

    cluster_service_ip_range = cluster.labels.get(
        'service_cluster_ip_range', '10.97.0.0/16'
    )
    service_ip_net = netaddr.IPNetwork(cluster_service_ip_range)
    cluster_dns_service_ip = service_ip_net[10]

    params = {
        'apiserver_address': cluster.master_addresses[0],
        'cluster_id': cluster.uuid,
        'pod_ip_range': cluster_pod_ip_range,
        'cluster_dns_service_ip': cluster_dns_service_ip,
        "kube_version": cluster.labels.get("kube_tag", "v1.14.3"),
    }

    LOG.info(
        'Installing calico, kube-proxy, coredns for cluster %s',
        cluster.uuid
    )
    for t in ['calico_node_rbac.yaml.j2', 'calico.yaml.j2',
              'kube-proxy.yaml.j2', 'coredns.yaml.j2']:
        template = self.jinja_env.get_template(t)
        body = template.render(params)
        cluster_kubectl.apply(definition=body)

    # Add an "initialized" tag once all add-ons are applied.
    tag_template = self.jinja_env.get_template('addon_tag.yaml.j2')
    tag_body = tag_template.render(
        {'namespace': cluster.uuid, 'status': 'initialized'}
    )
    self.kubectl.apply(definition=tag_body)
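# For reference, the DNS Service IP above is derived by indexing into the
# service CIDR with netaddr (IPNetwork supports __getitem__). A standalone
# sketch of that arithmetic, with illustrative values:
import netaddr

service_ip_net = netaddr.IPNetwork('10.97.0.0/16')
cluster_dns_service_ip = service_ip_net[10]
print(cluster_dns_service_ip)  # 10.97.0.10 -- presumably rendered into the
                               # CoreDNS manifest as the cluster DNS address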
def cluster_update(self, context, cluster, rollback=False):
    LOG.debug('cluster_heat cluster_update')

    osc = clients.OpenStackClients(context)
    allow_update_status = (
        fields.ClusterStatus.CREATE_COMPLETE,
        fields.ClusterStatus.UPDATE_COMPLETE,
        fields.ClusterStatus.RESUME_COMPLETE,
        fields.ClusterStatus.RESTORE_COMPLETE,
        fields.ClusterStatus.ROLLBACK_COMPLETE,
        fields.ClusterStatus.SNAPSHOT_COMPLETE,
        fields.ClusterStatus.CHECK_COMPLETE,
        fields.ClusterStatus.ADOPT_COMPLETE
    )
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Updating a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    delta = cluster.obj_what_changed()
    if not delta:
        return cluster

    manager = scale_manager.get_scale_manager(context, osc, cluster)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)
    # Update cluster
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)
        cluster_driver.update_cluster(context, cluster, manager, rollback)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.save()
    return cluster
def delete_cluster(self, context, cluster):
    LOG.info("Starting to delete cluster %s", cluster.uuid)

    self.pre_delete_cluster(context, cluster)

    c_template = conductor_utils.retrieve_cluster_template(
        context, cluster
    )

    # NOTE: The fake fields are only needed to keep the YAML files valid;
    # they do not affect the deletion.
    params = {
        "namespace": cluster.uuid,
        "cloud_provider_tag": "fake",
        "kube_version": "fake",
    }
    _delete_manifest = functools.partial(self._delete_manifest, params)

    LOG.info("Deleting components for cluster %s", cluster.uuid)
    for tmpl in [
        "openstack-cloud-controller-manager.yaml.j2",
        "kube-scheduler.yaml.j2", "kube-controllermgr.yaml.j2",
        "kube-apiserver.yaml.j2", "etcd.yaml.j2", "secrets.yaml.j2",
        "namespace.yaml.j2"
    ]:
        _delete_manifest(tmpl)

    # Delete the floating IP if needed.
    if (self._master_lb_fip_enabled(cluster, c_template) and
            cluster.api_address):
        network_client = clients.OpenStackClients(context).neutron()
        ip = netutils.urlsplit(cluster.api_address).netloc.split(":")[0]
        fips = network_client.list_floatingips(floating_ip_address=ip)
        for fip in fips['floatingips']:
            LOG.info("Deleting floating ip %s for cluster %s",
                     fip["floating_ip_address"], cluster.uuid)
            network_client.delete_floatingip(fip['id'])

    # Delete the VIP port.
    LOG.info("Deleting ports for cluster %s", cluster.uuid)
    tag = {"magnum": cluster.uuid}
    tags = [jsonutils.dumps(tag)]
    neutron.delete_port_by_tags(context, tags)

    # Delete the Heat stack.
    if cluster.stack_id:
        LOG.info("Deleting Heat stack %s for cluster %s",
                 cluster.stack_id, cluster.uuid)
        self._delete_stack(
            context, clients.OpenStackClients(context), cluster
        )
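# The port cleanup above relies on Neutron tags attached when the VIP port
# was created: each tag is the JSON-serialized {"magnum": <cluster uuid>}
# mapping, so deletion only has to rebuild the same string. A sketch with an
# illustrative UUID (delete_port_by_tags is the Magnum helper used above):
from oslo_serialization import jsonutils

tag = {"magnum": "5d12f6fd-a196-4bf0-ae4c-1f639a523a52"}  # illustrative
tags = [jsonutils.dumps(tag)]
# tags == ['{"magnum": "5d12f6fd-a196-4bf0-ae4c-1f639a523a52"}'], which is
# presumably what the helper matches against the ports' tag lists.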
def cluster_delete(self, context, uuid):
    LOG.debug('cluster_conductor cluster_delete')

    osc = clients.OpenStackClients(context)
    cluster = objects.Cluster.get_by_uuid(context, uuid)

    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_PENDING,
            cluster)
        cluster_driver.delete_cluster(context, cluster)
        cluster.status = fields.ClusterStatus.DELETE_IN_PROGRESS
        cluster.status_reason = None
    except exc.HTTPNotFound:
        LOG.info('The cluster %s was not found during cluster'
                 ' deletion.', cluster.id)
        try:
            trust_manager.delete_trustee_and_trust(osc, context, cluster)
            cert_manager.delete_certificates_from_cluster(cluster,
                                                          context=context)
            # Delete all the cluster's nodegroups.
            for ng in cluster.nodegroups:
                ng.destroy()
            cluster.destroy()
        except exception.ClusterNotFound:
            LOG.info('The cluster %s has been deleted by others.', uuid)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_SUCCESS,
            cluster)
        return None
    except exc.HTTPConflict:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE,
            cluster)
        raise exception.OperationInProgress(cluster_name=cluster.name)
    except Exception as unexp:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE,
            cluster)
        cluster.status = fields.ClusterStatus.DELETE_FAILED
        cluster.status_reason = six.text_type(unexp)
        cluster.save()
        raise

    cluster.save()
    return None
def cluster_delete(self, context, uuid):
    LOG.debug('cluster_conductor cluster_delete')

    osc = clients.OpenStackClients(context)
    cluster = objects.Cluster.get_by_uuid(context, uuid)

    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_PENDING)
        cluster_driver.delete_cluster(context, cluster)
        cluster.status = fields.ClusterStatus.DELETE_IN_PROGRESS
        cluster.status_reason = None
    except exc.HTTPNotFound:
        LOG.info('The cluster %s was not found during cluster'
                 ' deletion.', cluster.id)
        try:
            trust_manager.delete_trustee_and_trust(osc, context, cluster)
            cert_manager.delete_certificates_from_cluster(cluster,
                                                          context=context)
            # Delete all the cluster's nodegroups.
            for ng in cluster.nodegroups:
                ng.destroy()
            cluster.destroy()
        except exception.ClusterNotFound:
            LOG.info('The cluster %s has been deleted by others.', uuid)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_SUCCESS)
        return None
    except exc.HTTPConflict:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise exception.OperationInProgress(cluster_name=cluster.name)
    except Exception as unexp:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        cluster.status = fields.ClusterStatus.DELETE_FAILED
        cluster.status_reason = six.text_type(unexp)
        cluster.save()
        raise

    cluster.save()
    return None
def cluster_delete(self, context, uuid):
    LOG.debug('cluster_heat cluster_delete')

    osc = clients.OpenStackClients(context)
    cluster = objects.Cluster.get_by_uuid(context, uuid)

    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_PENDING)
        cluster_driver.delete_stack(context, osc, cluster)
    except exc.HTTPNotFound:
        LOG.info(_LI('The stack %s was not found during cluster'
                     ' deletion.'), cluster.stack_id)
        try:
            trust_manager.delete_trustee_and_trust(osc, context, cluster)
            cert_manager.delete_certificates_from_cluster(cluster,
                                                          context=context)
            cluster.destroy()
        except exception.ClusterNotFound:
            LOG.info(_LI('The cluster %s has been deleted by others.'),
                     uuid)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_SUCCESS)
        return None
    except exc.HTTPConflict:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise exception.OperationInProgress(cluster_name=cluster.name)
    except Exception:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise

    cluster.status = fields.ClusterStatus.DELETE_IN_PROGRESS
    cluster.save()

    self._poll_and_check(osc, cluster, cluster_driver)

    return None
def cluster_create(self, context, cluster, create_timeout):
    LOG.debug('cluster_heat cluster_create')

    osc = clients.OpenStackClients(context)

    try:
        # Create trustee/trust and set them to cluster
        trust_manager.create_trustee_and_trust(osc, cluster)
        # Generate certificate and set the cert reference to cluster
        cert_manager.generate_certificates_to_cluster(cluster,
                                                      context=context)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_PENDING)
        # Get driver
        ct = conductor_utils.retrieve_cluster_template(context, cluster)
        cluster_driver = driver.Driver.get_driver(ct.server_type,
                                                  ct.cluster_distro,
                                                  ct.coe)
        # Create cluster
        created_stack = cluster_driver.create_stack(
            context, osc, cluster, create_timeout)
    except Exception as e:
        cluster.status = fields.ClusterStatus.CREATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.create()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_FAILURE)
        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.stack_id = created_stack['stack']['id']
    cluster.status = fields.ClusterStatus.CREATE_IN_PROGRESS
    cluster.create()

    self._poll_and_check(osc, cluster, cluster_driver)

    return cluster
def cluster_update(self, context, cluster, rollback=False):
    LOG.debug('cluster_heat cluster_update')

    osc = clients.OpenStackClients(context)
    stack = osc.heat().stacks.get(cluster.stack_id)
    allow_update_status = (
        fields.ClusterStatus.CREATE_COMPLETE,
        fields.ClusterStatus.UPDATE_COMPLETE,
        fields.ClusterStatus.RESUME_COMPLETE,
        fields.ClusterStatus.RESTORE_COMPLETE,
        fields.ClusterStatus.ROLLBACK_COMPLETE,
        fields.ClusterStatus.SNAPSHOT_COMPLETE,
        fields.ClusterStatus.CHECK_COMPLETE,
        fields.ClusterStatus.ADOPT_COMPLETE
    )
    if stack.stack_status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Updating a cluster when stack status is '
                      '"%s"') % stack.stack_status
        raise exception.NotSupported(operation=operation)

    delta = cluster.obj_what_changed()
    if not delta:
        return cluster

    manager = scale_manager.get_scale_manager(context, osc, cluster)

    conductor_utils.notify_about_cluster_operation(
        context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)
    # Update stack
    cluster_driver.update_stack(context, osc, cluster, manager, rollback)

    self._poll_and_check(osc, cluster, cluster_driver)

    return cluster
def cluster_resize(self, context, cluster,
                   node_count, nodes_to_remove, nodegroup):
    LOG.debug('cluster_conductor cluster_resize')

    osc = clients.OpenStackClients(context)

    # NOTE(flwang): One of the important use cases of the /resize API is
    # supporting the auto scaling action triggered by the Kubernetes
    # Cluster Autoscaler, so two cases may happen:
    # 1. The API could be triggered very often.
    # 2. Scaling up or down may fail, and we would like to offer the
    #    ability to recover the cluster so that it can be resized again
    #    after the last update failed.
    allow_update_status = (
        fields.ClusterStatus.CREATE_COMPLETE,
        fields.ClusterStatus.UPDATE_COMPLETE,
        fields.ClusterStatus.RESUME_COMPLETE,
        fields.ClusterStatus.RESTORE_COMPLETE,
        fields.ClusterStatus.ROLLBACK_COMPLETE,
        fields.ClusterStatus.SNAPSHOT_COMPLETE,
        fields.ClusterStatus.CHECK_COMPLETE,
        fields.ClusterStatus.ADOPT_COMPLETE,
        fields.ClusterStatus.UPDATE_FAILED,
        fields.ClusterStatus.UPDATE_IN_PROGRESS,
    )
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Resizing a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    resize_manager = scale_manager.get_scale_manager(context, osc,
                                                     cluster)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)

    # Back up the old node count so that we can restore it
    # in case of an exception.
    old_node_count = nodegroup.node_count

    # Resize cluster
    try:
        nodegroup.node_count = node_count
        nodegroup.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)
        cluster_driver.resize_cluster(context, cluster, resize_manager,
                                      node_count, nodes_to_remove,
                                      nodegroup)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        nodegroup.node_count = old_node_count
        nodegroup.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.save()
    return cluster
def create_cluster(self, context, cluster, cluster_create_timeout):
    LOG.info("Starting to create cluster %s", cluster.uuid)

    cluster_template = conductor_utils.retrieve_cluster_template(
        context, cluster
    )

    cluster_service_ip_range = cluster.labels.get(
        'service_cluster_ip_range', '10.97.0.0/16'
    )
    if cluster_template.network_driver == 'flannel':
        cluster_pod_ip_range = cluster.labels.get(
            'flannel_network_cidr', '10.100.0.0/16'
        )
    elif cluster_template.network_driver == 'calico':
        cluster_pod_ip_range = cluster.labels.get(
            'calico_ipv4pool', '192.168.0.0/16'
        )

    port_info = self._create_vip_port(context, cluster, cluster_template)
    # This address should be an internal IP that other services can
    # communicate with.
    self.apiserver_address = port_info["private_ip"]
    external_apiserver_address = port_info.get("public_ip",
                                               port_info["private_ip"])
    cluster.api_address = 'https://%s:6443' % external_apiserver_address

    # The master address is always the private VIP address.
    master_ng = cluster.default_ng_master
    setattr(master_ng, "node_addresses", [self.apiserver_address])
    master_ng.save()

    self.public_network_id = (
        cluster_template.external_network_id or "public")
    if not uuidutils.is_uuid_like(self.public_network_id):
        self.public_network_id = neutron.get_network_id(
            context, self.public_network_id
        )

    ca_cert = cert_manager.get_cluster_ca_certificate(
        cluster, context=context
    )
    ca_cert_encoded = base64.b64encode(ca_cert.get_certificate())
    ca_key_encoded = base64.b64encode(ca_cert.get_decrypted_private_key())

    cloud_provider_enabled = strutils.bool_from_string(
        cluster.labels.get("cloud_provider_enabled", "true")
    )

    ca_cert_encoded_str = ca_cert_encoded.decode('utf-8')
    ca_cert_encoded_str = ca_cert_encoded_str.replace("'", "")
    ca_key_encoded_str = ca_key_encoded.decode('utf-8')
    ca_key_encoded_str = ca_key_encoded_str.replace("'", "")

    params = {
        "namespace": cluster.uuid,
        "vip_port_ip": self.apiserver_address,
        "vip_external_ip": external_apiserver_address,
        "vip_port_id": port_info["port_id"],
        "service_ip_range": cluster_service_ip_range,
        "pod_ip_range": cluster_pod_ip_range,
        "ca_cert": ca_cert_encoded_str,
        "ca_key": ca_key_encoded_str,
        "subnet_id": cluster_template.fixed_subnet,
        "public_network_id": self.public_network_id,
        "cloud_provider_enabled": cloud_provider_enabled,
        "kube_version": cluster.labels.get("kube_tag", "v1.14.3"),
        "cloud_provider_tag": cluster.labels.get("cloud_provider_tag",
                                                 "v1.15.0")
    }

    # Keystone related info.
    osc = clients.OpenStackClients(context)
    params['trustee_user_id'] = cluster.trustee_user_id
    params['trustee_password'] = cluster.trustee_password
    if CONF.trust.cluster_user_trust:
        params['trust_id'] = cluster.trust_id
    else:
        params['trust_id'] = ""
    kwargs = {
        'service_type': 'identity',
        'interface': CONF.trust.trustee_keystone_interface,
        'version': 3
    }
    if CONF.trust.trustee_keystone_region_name:
        kwargs['region_name'] = CONF.trust.trustee_keystone_region_name
    params['auth_url'] = osc.url_for(**kwargs).rstrip('/')

    _apply_manifest = functools.partial(self._apply_manifest, params)

    LOG.info("Creating namespace for cluster %s", cluster.uuid)
    _apply_manifest('namespace.yaml.j2')

    # Create Secrets for the new cluster CA and the kube services; the CA
    # could be referenced by various cluster components.
    LOG.info("Creating Secrets for cluster %s", cluster.uuid)
    _apply_manifest('secrets.yaml.j2')

    # TODO: Wait for all the certificates to be ready.

    # etcd Service and StatefulSet
    LOG.info("Creating etcd service for cluster %s", cluster.uuid)
    _apply_manifest('etcd.yaml.j2')

    # apiserver Service and Deployment
    LOG.info("Creating kube-apiserver for cluster %s", cluster.uuid)
    _apply_manifest('kube-apiserver.yaml.j2')

    # Deploy kube-controller-manager
    LOG.info("Creating kube-controller-manager for cluster %s",
             cluster.uuid)
    _apply_manifest('kube-controllermgr.yaml.j2')

    # Deploy kube-scheduler
    LOG.info("Creating kube-scheduler for cluster %s", cluster.uuid)
    _apply_manifest('kube-scheduler.yaml.j2')

    kubeconfig_path = self._get_kubeconfig(
        context, cluster, ca_cert_encoded=ca_cert_encoded
    )
    LOG.info(
        "Kubeconfig created for cluster %s, path: %s",
        cluster.uuid, kubeconfig_path
    )
    cluster_kubectl = kubectl.KubeCtl(
        bin="/usr/bin/kubectl",
        global_flags="--kubeconfig %s" % kubeconfig_path
    )

    LOG.info(
        "Waiting for all the components to be up and running for "
        "cluster %s", cluster.uuid
    )
    self._wait_for_apiserver(cluster.uuid, cluster_kubectl)

    if cloud_provider_enabled:
        # Deploy openstack-cloud-controller-manager
        LOG.info("Creating openstack-cloud-controller-manager for "
                 "cluster %s", cluster.uuid)
        # Create RBAC for openstack-cloud-controller-manager in the
        # cluster.
        _apply_manifest(
            "openstack-cloud-controller-manager-in-cluster.yaml.j2",
            cluster_kubectl
        )
        _apply_manifest('openstack-cloud-controller-manager.yaml.j2')

    # Create the bootstrap token and the bootstrap RBAC in the new
    # cluster.
    LOG.info(
        "Creating bootstrap token and RBAC in the cluster %s",
        cluster.uuid
    )
    expiration = timeutils.utcnow() + datetime.timedelta(days=1)
    # For the bootstrap token format, refer to
    # https://kubernetes.io/docs/reference/access-authn-authz/bootstrap-tokens/
    token_id = self._generate_random_string(6)
    token_secret = self._generate_random_string(16)
    bootstrap_params = {
        "token_id": token_id,
        "token_secret": token_secret,
        "expiration": expiration.strftime('%Y-%m-%dT%H:%M:%SZ'),
    }
    bootstrap_template = self.jinja_env.get_template('bootstrap.yaml.j2')
    bootstrap_body = bootstrap_template.render(bootstrap_params)
    cluster_kubectl.apply(definition=bootstrap_body)

    self.bootstrap_token = "%s.%s" % (token_id, token_secret)

    # Grant privileges to the 'kubernetes' user so that the apiserver can
    # access the kubelet for operations like logs, exec, etc.
    # The user name here must be the same as the apiserver CN in
    # secrets.yaml.j2.
    cluster_kubectl.execute(
        "create clusterrolebinding kube-apiserver --clusterrole "
        "cluster-admin --user kubernetes"
    )

    # Start creating VMs and bootstrapping kubelet.
    LOG.info("Creating worker nodes for cluster %s", cluster.uuid)
    super(Driver, self).create_cluster(
        context, cluster, cluster_create_timeout
    )
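# A minimal sketch of the _generate_random_string helper used above, assuming
# it draws from the [a-z0-9] alphabet that Kubernetes bootstrap token IDs and
# secrets require; the actual Magnum helper may differ:
import random
import string

def _generate_random_string(length):
    # SystemRandom is backed by os.urandom, which matters for the secret.
    chars = string.ascii_lowercase + string.digits
    return ''.join(random.SystemRandom().choice(chars)
                   for _ in range(length))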
def update_cluster_status(self, context, cluster):
    """Updates the cluster status.

    This method should be finished within the periodic interval (10s).

    :param context: Admin context.
    :param cluster: Cluster object.
    """
    if cluster.status == fields.ClusterStatus.CREATE_IN_PROGRESS:
        if cluster.stack_id is None:
            return

        stack_ctx = mag_ctx.make_cluster_context(cluster)
        os_clients = clients.OpenStackClients(stack_ctx)
        stack = os_clients.heat().stacks.get(
            cluster.stack_id, resolve_outputs=False
        )

        if stack.stack_status == fields.ClusterStatus.CREATE_COMPLETE:
            stack_ctx = mag_ctx.make_cluster_context(cluster)
            kubeconfig_path = self._get_kubeconfig(stack_ctx, cluster)
            cluster_kubectl = kubectl.KubeCtl(
                bin="/usr/bin/kubectl",
                global_flags="--kubeconfig %s" % kubeconfig_path
            )

            ns = self.kubectl.get("namespace %s" % cluster.uuid)
            labels = ns['metadata'].get('labels', {})

            if not labels.get('magnum.k8s.io/status'):
                self._install_addons(cluster, cluster_kubectl, context)
                return

            if self._workers_ready(cluster, cluster_kubectl):
                LOG.info(
                    'Cluster %s is created successfully', cluster.uuid
                )

                # Update the worker addresses in the cluster from the
                # Heat stack output.
                stack = os_clients.heat().stacks.get(
                    cluster.stack_id, resolve_outputs=True
                )
                template_def = self.get_template_definition()
                c_template = conductor_utils.retrieve_cluster_template(
                    context, cluster
                )
                template_def.update_outputs(stack, c_template, cluster)

                cluster.status = fields.ClusterStatus.CREATE_COMPLETE
                cluster.save()
        elif stack.stack_status in (
            fields.ClusterStatus.CREATE_FAILED,
            fields.ClusterStatus.DELETE_FAILED,
            fields.ClusterStatus.UPDATE_FAILED,
            fields.ClusterStatus.ROLLBACK_COMPLETE,
            fields.ClusterStatus.ROLLBACK_FAILED
        ):
            self._sync_cluster_status(cluster, stack)
            LOG.error('Failed to create cluster %s', cluster.uuid)

    elif cluster.status == fields.ClusterStatus.DELETE_IN_PROGRESS:
        # Check if the namespace is deleted.
        ns_template = self.jinja_env.get_template('namespace.yaml.j2')
        ns_body = ns_template.render({"namespace": cluster.uuid})
        namespaces = self.kubectl.get('namespace')
        names = [n['metadata']['name'] for n in namespaces]

        if cluster.uuid not in names:
            LOG.debug(
                "Namespace has been deleted for cluster %s",
                cluster.uuid
            )
            stack_ctx = mag_ctx.make_cluster_context(cluster)
            os_client = clients.OpenStackClients(stack_ctx)

            try:
                trust_manager.delete_trustee_and_trust(
                    os_client, context, cluster
                )
                cert_manager.delete_certificates_from_cluster(
                    cluster, context=context
                )
                cert_manager.delete_client_files(cluster, context=context)
            except exception.ClusterNotFound:
                LOG.info(
                    'The cluster %s has been deleted by others.',
                    cluster.uuid
                )

            LOG.info('Cluster %s has been deleted.', cluster.uuid)

            cluster.status = fields.ClusterStatus.DELETE_COMPLETE
            cluster.save()
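# A hedged sketch of the _workers_ready check used above, assuming the
# kubectl.KubeCtl wrapper returns parsed objects from `kubectl get` (as the
# namespace lookups above suggest); the real helper and its readiness
# criterion may differ:
def _workers_ready(self, cluster, cluster_kubectl):
    nodes = cluster_kubectl.get('nodes')
    ready = 0
    for node in nodes:
        for cond in node['status'].get('conditions', []):
            if cond['type'] == 'Ready' and cond['status'] == 'True':
                ready += 1
    # Illustrative criterion: every requested worker reports Ready.
    return ready >= cluster.default_ng_worker.node_count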
def cluster_resize(self, context, cluster,
                   node_count, nodes_to_remove, nodegroup):
    LOG.debug('cluster_conductor cluster_resize')

    osc = clients.OpenStackClients(context)

    # NOTE(flwang): One of the important use cases of the /resize API is
    # supporting the auto scaling action triggered by the Kubernetes
    # Cluster Autoscaler, so two cases may happen:
    # 1. The API could be triggered very often.
    # 2. Scaling up or down may fail, and we would like to offer the
    #    ability to recover the cluster so that it can be resized again
    #    after the last update failed.
    allow_update_status = (
        fields.ClusterStatus.CREATE_COMPLETE,
        fields.ClusterStatus.UPDATE_COMPLETE,
        fields.ClusterStatus.RESUME_COMPLETE,
        fields.ClusterStatus.RESTORE_COMPLETE,
        fields.ClusterStatus.ROLLBACK_COMPLETE,
        fields.ClusterStatus.SNAPSHOT_COMPLETE,
        fields.ClusterStatus.CHECK_COMPLETE,
        fields.ClusterStatus.ADOPT_COMPLETE,
        fields.ClusterStatus.UPDATE_FAILED,
        fields.ClusterStatus.UPDATE_IN_PROGRESS,
    )
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE,
            cluster)
        operation = _('Resizing a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    resize_manager = scale_manager.get_scale_manager(context, osc,
                                                     cluster)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)

    # Back up the old node count so that we can restore it
    # in case of an exception.
    old_node_count = nodegroup.node_count

    # Resize cluster
    try:
        nodegroup.node_count = node_count
        nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        nodegroup.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING,
            cluster)
        cluster_driver.resize_cluster(context, cluster, resize_manager,
                                      node_count, nodes_to_remove,
                                      nodegroup)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        nodegroup.node_count = old_node_count
        nodegroup.status = fields.ClusterStatus.UPDATE_FAILED
        nodegroup.status_reason = six.text_type(e)
        nodegroup.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE,
            cluster)
        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.save()
    return cluster