def cluster_update(self, context, cluster, rollback=False):
    """Push pending cluster changes to the backing Heat stack.

    Refuses the update unless the stack is in a settled *_COMPLETE
    state; emits PENDING/FAILURE notifications around the attempt and
    starts the status polling loop after the stack update is issued.
    """
    LOG.debug('cluster_heat cluster_update')

    osc = clients.OpenStackClients(context)
    stack = osc.heat().stacks.get(cluster.stack_id)
    # Only stacks that have fully completed their previous operation
    # may be updated.
    allow_update_status = (fields.ClusterStatus.CREATE_COMPLETE,
                           fields.ClusterStatus.UPDATE_COMPLETE,
                           fields.ClusterStatus.RESUME_COMPLETE,
                           fields.ClusterStatus.RESTORE_COMPLETE,
                           fields.ClusterStatus.ROLLBACK_COMPLETE,
                           fields.ClusterStatus.SNAPSHOT_COMPLETE,
                           fields.ClusterStatus.CHECK_COMPLETE,
                           fields.ClusterStatus.ADOPT_COMPLETE)
    if stack.stack_status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Updating a cluster when stack status is '
                      '"%s"') % stack.stack_status
        raise exception.NotSupported(operation=operation)

    # No changed fields on the object -> nothing to push to Heat.
    delta = cluster.obj_what_changed()
    if not delta:
        return cluster

    manager = scale_manager.ScaleManager(context, osc, cluster)

    conductor_utils.notify_about_cluster_operation(
        context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)

    _update_stack(context, osc, cluster, manager, rollback)
    self._poll_and_check(osc, cluster)

    return cluster
def cluster_create(self, context, cluster, create_timeout):
    """Create a cluster: trust, certs, then a new Heat stack.

    On any failure the cluster record is persisted as CREATE_FAILED
    (with the error text as the status reason) before re-raising; an
    HTTPBadRequest from Heat is translated to InvalidParameterValue.
    """
    LOG.debug('cluster_heat cluster_create')

    osc = clients.OpenStackClients(context)

    try:
        # Create trustee/trust and set them to cluster
        trust_manager.create_trustee_and_trust(osc, cluster)
        # Generate certificate and set the cert reference to cluster
        cert_manager.generate_certificates_to_cluster(cluster,
                                                      context=context)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_PENDING)
        created_stack = _create_stack(context, osc, cluster,
                                      create_timeout)
    except Exception as e:
        cluster.status = fields.ClusterStatus.CREATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.create()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_FAILURE)

        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.stack_id = created_stack['stack']['id']
    cluster.status = fields.ClusterStatus.CREATE_IN_PROGRESS
    cluster.create()

    self._poll_and_check(osc, cluster)

    return cluster
def update_status(self): LOG.debug("Updating status for cluster %s", self.cluster.id) # get the driver for the cluster cdriver = driver.Driver.get_driver_for_cluster(self.ctx, self.cluster) # ask the driver to sync status cdriver.update_cluster_status(self.ctx, self.cluster) LOG.debug("Status for cluster %s updated to %s (%s)", self.cluster.id, self.cluster.status, self.cluster.status_reason) # status update notifications if self.cluster.status.endswith("_COMPLETE"): conductor_utils.notify_about_cluster_operation( self.ctx, self.status_to_event[self.cluster.status], taxonomy.OUTCOME_SUCCESS, self.cluster) if self.cluster.status.endswith("_FAILED"): conductor_utils.notify_about_cluster_operation( self.ctx, self.status_to_event[self.cluster.status], taxonomy.OUTCOME_FAILURE, self.cluster) # if we're done with it, delete it if self.cluster.status == objects.fields.ClusterStatus.DELETE_COMPLETE: # delete all the nodegroups that belong to this cluster for ng in objects.NodeGroup.list(self.ctx, self.cluster.uuid): ng.destroy() self.cluster.destroy() # end the "loop" raise loopingcall.LoopingCallDone()
def update_status(self): LOG.debug("Updating status for cluster %s", self.cluster.id) # get the driver for the cluster cdriver = driver.Driver.get_driver_for_cluster(self.ctx, self.cluster) # ask the driver to sync status cdriver.update_cluster_status(self.ctx, self.cluster) LOG.debug("Status for cluster %s updated to %s (%s)", self.cluster.id, self.cluster.status, self.cluster.status_reason) # status update notifications if self.cluster.status.endswith("_COMPLETE"): conductor_utils.notify_about_cluster_operation( self.ctx, self.status_to_event[self.cluster.status], taxonomy.OUTCOME_SUCCESS) if self.cluster.status.endswith("_FAILED"): conductor_utils.notify_about_cluster_operation( self.ctx, self.status_to_event[self.cluster.status], taxonomy.OUTCOME_FAILURE) # if we're done with it, delete it if self.cluster.status == objects.fields.ClusterStatus.DELETE_COMPLETE: # delete all the nodegroups that belong to this cluster for ng in objects.NodeGroup.list(self.ctx, self.cluster.uuid): ng.destroy() self.cluster.destroy() # end the "loop" raise loopingcall.LoopingCallDone()
def cluster_create(self, context, cluster, master_count, node_count,
                   create_timeout):
    """Create a cluster and its default master/worker nodegroups.

    The cluster and both nodegroups are persisted up-front in
    CREATE_IN_PROGRESS; the driver then creates the actual cluster and
    the resulting stack id is propagated to every nodegroup.  Failures
    are persisted as CREATE_FAILED before re-raising.
    """
    LOG.debug('cluster_heat cluster_create')

    osc = clients.OpenStackClients(context)
    cluster.status = fields.ClusterStatus.CREATE_IN_PROGRESS
    cluster.status_reason = None
    cluster.create()

    # Master nodegroup
    master_ng = conductor_utils._get_nodegroup_object(context, cluster,
                                                      master_count,
                                                      is_master=True)
    master_ng.create()
    # Minion nodegroup
    minion_ng = conductor_utils._get_nodegroup_object(context, cluster,
                                                      node_count,
                                                      is_master=False)
    minion_ng.create()

    try:
        # Create trustee/trust and set them to cluster
        trust_manager.create_trustee_and_trust(osc, cluster)
        # Generate certificate and set the cert reference to cluster
        cert_manager.generate_certificates_to_cluster(cluster,
                                                      context=context)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_PENDING,
            cluster)
        # Get driver
        cluster_driver = driver.Driver.get_driver_for_cluster(context,
                                                              cluster)
        # Create cluster
        cluster_driver.create_cluster(context, cluster, create_timeout)
        cluster.save()
        # Share the cluster's stack id with the nodegroups created above.
        for ng in cluster.nodegroups:
            ng.stack_id = cluster.stack_id
            ng.save()
    except Exception as e:
        cluster.status = fields.ClusterStatus.CREATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_FAILURE,
            cluster)

        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    return cluster
def cluster_create(self, context, cluster, master_count, node_count,
                   create_timeout):
    """Create a cluster and its default master/worker nodegroups.

    The cluster and both nodegroups are persisted up-front in
    CREATE_IN_PROGRESS, then the driver creates the actual cluster.
    Failures are persisted as CREATE_FAILED before re-raising; an
    HTTPBadRequest is translated to InvalidParameterValue.
    """
    LOG.debug('cluster_heat cluster_create')

    osc = clients.OpenStackClients(context)
    cluster.status = fields.ClusterStatus.CREATE_IN_PROGRESS
    cluster.status_reason = None
    cluster.create()

    # Master nodegroup
    master_ng = conductor_utils._get_nodegroup_object(
        context, cluster, master_count, is_master=True)
    master_ng.create()
    # Minion nodegroup
    minion_ng = conductor_utils._get_nodegroup_object(
        context, cluster, node_count, is_master=False)
    minion_ng.create()

    try:
        # Create trustee/trust and set them to cluster
        trust_manager.create_trustee_and_trust(osc, cluster)
        # Generate certificate and set the cert reference to cluster
        cert_manager.generate_certificates_to_cluster(cluster,
                                                      context=context)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_PENDING)
        # Get driver
        cluster_driver = driver.Driver.get_driver_for_cluster(context,
                                                              cluster)
        # Create cluster
        cluster_driver.create_cluster(context, cluster, create_timeout)
        cluster.save()
    except Exception as e:
        cluster.status = fields.ClusterStatus.CREATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_FAILURE)

        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    return cluster
def rotate_ca_certificate(self, context, cluster):
    """Regenerate the cluster CA certificates and rotate them in place.

    Only allowed while the cluster is in a settled *_COMPLETE state.
    On success the cluster is left in UPDATE_IN_PROGRESS; failures are
    persisted as UPDATE_FAILED before re-raising.
    """
    LOG.info('start rotate_ca_certificate for cluster: %s', cluster.uuid)

    # Only clusters that have fully completed their previous operation
    # may be updated.
    allow_update_status = (fields.ClusterStatus.CREATE_COMPLETE,
                           fields.ClusterStatus.UPDATE_COMPLETE,
                           fields.ClusterStatus.RESUME_COMPLETE,
                           fields.ClusterStatus.RESTORE_COMPLETE,
                           fields.ClusterStatus.ROLLBACK_COMPLETE,
                           fields.ClusterStatus.SNAPSHOT_COMPLETE,
                           fields.ClusterStatus.CHECK_COMPLETE,
                           fields.ClusterStatus.ADOPT_COMPLETE)
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE,
            cluster)
        operation = _('Updating a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    try:
        # re-generate the ca certs
        cert_manager.generate_certificates_to_cluster(cluster,
                                                      context=context)
        cluster_driver = driver.Driver.get_driver_for_cluster(context,
                                                              cluster)
        cluster_driver.rotate_ca_certificate(context, cluster)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE,
            cluster)

        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.save()
    return cluster
def cluster_update(self, context, cluster, rollback=False):
    """Push pending cluster changes to Heat via the cluster's driver.

    Refuses the update unless the backing stack is in a settled
    *_COMPLETE state; the driver matching the cluster template's
    (server_type, distro, coe) performs the stack update, after which
    status polling begins.
    """
    LOG.debug('cluster_heat cluster_update')

    osc = clients.OpenStackClients(context)
    stack = osc.heat().stacks.get(cluster.stack_id)
    # Only stacks that have fully completed their previous operation
    # may be updated.
    allow_update_status = (
        fields.ClusterStatus.CREATE_COMPLETE,
        fields.ClusterStatus.UPDATE_COMPLETE,
        fields.ClusterStatus.RESUME_COMPLETE,
        fields.ClusterStatus.RESTORE_COMPLETE,
        fields.ClusterStatus.ROLLBACK_COMPLETE,
        fields.ClusterStatus.SNAPSHOT_COMPLETE,
        fields.ClusterStatus.CHECK_COMPLETE,
        fields.ClusterStatus.ADOPT_COMPLETE
    )
    if stack.stack_status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Updating a cluster when stack status is '
                      '"%s"') % stack.stack_status
        raise exception.NotSupported(operation=operation)

    # No changed fields on the object -> nothing to push to Heat.
    delta = cluster.obj_what_changed()
    if not delta:
        return cluster

    manager = scale_manager.get_scale_manager(context, osc, cluster)

    conductor_utils.notify_about_cluster_operation(
        context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro, ct.coe)
    # Create cluster
    cluster_driver.update_stack(context, osc, cluster, manager, rollback)
    self._poll_and_check(osc, cluster, cluster_driver)

    return cluster
def cluster_create(self, context, cluster, create_timeout):
    """Create a cluster: trust, certs, then a driver-created stack.

    The driver is resolved from the cluster template's (server_type,
    distro, coe).  On failure the cluster record is persisted as
    CREATE_FAILED before re-raising; an HTTPBadRequest is translated
    to InvalidParameterValue.
    """
    LOG.debug('cluster_heat cluster_create')

    osc = clients.OpenStackClients(context)

    try:
        # Create trustee/trust and set them to cluster
        trust_manager.create_trustee_and_trust(osc, cluster)
        # Generate certificate and set the cert reference to cluster
        cert_manager.generate_certificates_to_cluster(cluster,
                                                      context=context)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_PENDING)
        # Get driver
        ct = conductor_utils.retrieve_cluster_template(context, cluster)
        cluster_driver = driver.Driver.get_driver(ct.server_type,
                                                  ct.cluster_distro,
                                                  ct.coe)
        # Create cluster
        created_stack = cluster_driver.create_stack(context, osc, cluster,
                                                    create_timeout)
    except Exception as e:
        cluster.status = fields.ClusterStatus.CREATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.create()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_FAILURE)

        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.stack_id = created_stack['stack']['id']
    cluster.status = fields.ClusterStatus.CREATE_IN_PROGRESS
    cluster.create()

    self._poll_and_check(osc, cluster, cluster_driver)

    return cluster
def cluster_update(self, context, cluster, node_count, rollback=False):
    """Scale the default worker nodegroup to *node_count*.

    Only allowed while the cluster is in a settled *_COMPLETE state.
    The new node count is saved before calling the driver so the driver
    sees it; on failure the old count is restored and the cluster is
    persisted as UPDATE_FAILED before re-raising.
    """
    LOG.debug('cluster_heat cluster_update')

    osc = clients.OpenStackClients(context)
    # Only clusters that have fully completed their previous operation
    # may be updated.
    allow_update_status = (fields.ClusterStatus.CREATE_COMPLETE,
                           fields.ClusterStatus.UPDATE_COMPLETE,
                           fields.ClusterStatus.RESUME_COMPLETE,
                           fields.ClusterStatus.RESTORE_COMPLETE,
                           fields.ClusterStatus.ROLLBACK_COMPLETE,
                           fields.ClusterStatus.SNAPSHOT_COMPLETE,
                           fields.ClusterStatus.CHECK_COMPLETE,
                           fields.ClusterStatus.ADOPT_COMPLETE)
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Updating a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    # Updates will be only reflected to the default worker
    # nodegroup.
    worker_ng = cluster.default_ng_worker
    if worker_ng.node_count == node_count:
        return

    # Backup the old node count so that we can restore it
    # in case of an exception.
    old_node_count = worker_ng.node_count

    manager = scale_manager.get_scale_manager(context, osc, cluster)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro, ct.coe)
    # Update cluster
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)
        worker_ng.node_count = node_count
        worker_ng.save()
        cluster_driver.update_cluster(context, cluster, manager, rollback)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        # Restore the node_count
        worker_ng.node_count = old_node_count
        worker_ng.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)

        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.save()
    return cluster
def cluster_delete(self, context, uuid):
    """Delete the cluster identified by *uuid* via its driver.

    If the driver reports the cluster missing (HTTPNotFound), the
    trust, certificates, nodegroups and cluster record are cleaned up
    immediately.  HTTPConflict maps to OperationInProgress; any other
    failure is persisted as DELETE_FAILED before re-raising.
    """
    LOG.debug('cluster_conductor cluster_delete')

    osc = clients.OpenStackClients(context)
    cluster = objects.Cluster.get_by_uuid(context, uuid)
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro, ct.coe)

    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_PENDING,
            cluster)
        cluster_driver.delete_cluster(context, cluster)
        cluster.status = fields.ClusterStatus.DELETE_IN_PROGRESS
        cluster.status_reason = None
    except exc.HTTPNotFound:
        # The backing stack is already gone (e.g. deleted outside of
        # Magnum); clean up the remaining Magnum-side resources.
        LOG.info('The cluster %s was not found during cluster'
                 ' deletion.', cluster.id)
        try:
            trust_manager.delete_trustee_and_trust(osc, context, cluster)
            cert_manager.delete_certificates_from_cluster(cluster,
                                                          context=context)
            # delete all cluster's nodegroups
            for ng in cluster.nodegroups:
                ng.destroy()
            cluster.destroy()
        except exception.ClusterNotFound:
            LOG.info('The cluster %s has been deleted by others.', uuid)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_SUCCESS,
            cluster)
        return None
    except exc.HTTPConflict:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE,
            cluster)
        raise exception.OperationInProgress(cluster_name=cluster.name)
    except Exception as unexp:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE,
            cluster)
        cluster.status = fields.ClusterStatus.DELETE_FAILED
        cluster.status_reason = six.text_type(unexp)
        cluster.save()
        raise

    cluster.save()
    return None
def cluster_upgrade(self, context, cluster, cluster_template, max_batch_size, nodegroup, rollback=False): LOG.debug('cluster_conductor cluster_upgrade') # osc = clients.OpenStackClients(context) allow_update_status = (fields.ClusterStatus.CREATE_COMPLETE, fields.ClusterStatus.UPDATE_COMPLETE, fields.ClusterStatus.RESUME_COMPLETE, fields.ClusterStatus.RESTORE_COMPLETE, fields.ClusterStatus.ROLLBACK_COMPLETE, fields.ClusterStatus.SNAPSHOT_COMPLETE, fields.ClusterStatus.CHECK_COMPLETE, fields.ClusterStatus.ADOPT_COMPLETE) if cluster.status not in allow_update_status: conductor_utils.notify_about_cluster_operation( context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE, cluster) operation = _('Upgrading a cluster when status is ' '"%s"') % cluster.status raise exception.NotSupported(operation=operation) # Get driver ct = conductor_utils.retrieve_cluster_template(context, cluster) cluster_driver = driver.Driver.get_driver(ct.server_type, ct.cluster_distro, ct.coe) # Upgrade cluster try: conductor_utils.notify_about_cluster_operation( context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING, cluster) cluster_driver.upgrade_cluster(context, cluster, cluster_template, max_batch_size, nodegroup, rollback) cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS cluster.status_reason = None except Exception as e: cluster.status = fields.ClusterStatus.UPDATE_FAILED cluster.status_reason = six.text_type(e) cluster.save() nodegroup.status = fields.ClusterStatus.UPDATE_FAILED nodegroup.status_reason = six.text_type(e) nodegroup.save() conductor_utils.notify_about_cluster_operation( context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE, cluster) if isinstance(e, exc.HTTPBadRequest): e = exception.InvalidParameterValue(message=six.text_type(e)) raise e raise nodegroup.save() cluster.save() return cluster
def cluster_delete(self, context, uuid):
    """Delete a cluster by deleting its backing Heat stack.

    Missing stacks (HTTPNotFound) are tolerated and trigger local
    cleanup of trust, certificates and the cluster record; HTTPConflict
    maps to OperationInProgress.
    """
    LOG.debug('cluster_heat cluster_delete')

    osc = clients.OpenStackClients(context)
    cluster = objects.Cluster.get_by_uuid(context, uuid)
    stack_id = cluster.stack_id
    # NOTE(sdake): This will execute a stack_delete operation.  This will
    # Ignore HTTPNotFound exceptions (stack wasn't present).  In the case
    # that Heat couldn't find the stack representing the cluster, likely a
    # user has deleted the stack outside the context of Magnum.  Therefore
    # the contents of the cluster are forever lost.
    #
    # If the exception is unhandled, the original exception will be raised.
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_PENDING)
        osc.heat().stacks.delete(stack_id)
    except exc.HTTPNotFound:
        LOG.info(
            _LI('The stack %s was not found during cluster'
                ' deletion.'), stack_id)
        try:
            trust_manager.delete_trustee_and_trust(osc, context, cluster)
            cert_manager.delete_certificates_from_cluster(cluster,
                                                          context=context)
            cluster.destroy()
        except exception.ClusterNotFound:
            LOG.info(_LI('The cluster %s has been deleted by others.'),
                     uuid)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_SUCCESS)
        return None
    except exc.HTTPConflict:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise exception.OperationInProgress(cluster_name=cluster.name)
    except Exception:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise

    cluster.status = fields.ClusterStatus.DELETE_IN_PROGRESS
    cluster.save()

    self._poll_and_check(osc, cluster)

    return None
def cluster_delete(self, context, uuid):
    """Delete a cluster by deleting its backing Heat stack.

    Missing stacks (HTTPNotFound) are tolerated and trigger local
    cleanup of trust, certificates and the cluster record; HTTPConflict
    maps to OperationInProgress.
    """
    LOG.debug('cluster_heat cluster_delete')

    osc = clients.OpenStackClients(context)
    cluster = objects.Cluster.get_by_uuid(context, uuid)
    stack_id = cluster.stack_id
    # NOTE(sdake): This will execute a stack_delete operation.  This will
    # Ignore HTTPNotFound exceptions (stack wasn't present).  In the case
    # that Heat couldn't find the stack representing the cluster, likely a
    # user has deleted the stack outside the context of Magnum.  Therefore
    # the contents of the cluster are forever lost.
    #
    # If the exception is unhandled, the original exception will be raised.
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_PENDING)
        osc.heat().stacks.delete(stack_id)
    except exc.HTTPNotFound:
        LOG.info(_LI('The stack %s was not found during cluster'
                     ' deletion.'), stack_id)
        try:
            trust_manager.delete_trustee_and_trust(osc, context, cluster)
            cert_manager.delete_certificates_from_cluster(cluster,
                                                          context=context)
            cluster.destroy()
        except exception.ClusterNotFound:
            LOG.info(_LI('The cluster %s has been deleted by others.'),
                     uuid)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_SUCCESS)
        return None
    except exc.HTTPConflict:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise exception.OperationInProgress(cluster_name=cluster.name)
    except Exception:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise

    cluster.status = fields.ClusterStatus.DELETE_IN_PROGRESS
    cluster.save()

    self._poll_and_check(osc, cluster)

    return None
def cluster_delete(self, context, uuid):
    """Delete the cluster identified by *uuid* via its driver.

    If the driver reports the cluster missing (HTTPNotFound), the
    trust, certificates, nodegroups and cluster record are cleaned up
    immediately.  HTTPConflict maps to OperationInProgress; any other
    failure is persisted as DELETE_FAILED before re-raising.
    """
    LOG.debug('cluster_conductor cluster_delete')

    osc = clients.OpenStackClients(context)
    cluster = objects.Cluster.get_by_uuid(context, uuid)
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro, ct.coe)

    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_PENDING)
        cluster_driver.delete_cluster(context, cluster)
        cluster.status = fields.ClusterStatus.DELETE_IN_PROGRESS
        cluster.status_reason = None
    except exc.HTTPNotFound:
        # The backing stack is already gone (e.g. deleted outside of
        # Magnum); clean up the remaining Magnum-side resources.
        LOG.info('The cluster %s was not found during cluster'
                 ' deletion.', cluster.id)
        try:
            trust_manager.delete_trustee_and_trust(osc, context, cluster)
            cert_manager.delete_certificates_from_cluster(cluster,
                                                          context=context)
            # delete all cluster's nodegroups
            for ng in cluster.nodegroups:
                ng.destroy()
            cluster.destroy()
        except exception.ClusterNotFound:
            LOG.info('The cluster %s has been deleted by others.', uuid)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_SUCCESS)
        return None
    except exc.HTTPConflict:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise exception.OperationInProgress(cluster_name=cluster.name)
    except Exception as unexp:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        cluster.status = fields.ClusterStatus.DELETE_FAILED
        cluster.status_reason = six.text_type(unexp)
        cluster.save()
        raise

    cluster.save()
    return None
def cluster_delete(self, context, uuid):
    """Delete a cluster by asking its driver to delete the Heat stack.

    Missing stacks (HTTPNotFound) are tolerated and trigger local
    cleanup of trust, certificates and the cluster record; HTTPConflict
    maps to OperationInProgress.  Otherwise polling begins so the
    DELETE_IN_PROGRESS state is tracked to completion.
    """
    LOG.debug('cluster_heat cluster_delete')

    osc = clients.OpenStackClients(context)
    cluster = objects.Cluster.get_by_uuid(context, uuid)
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro, ct.coe)

    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_PENDING)
        cluster_driver.delete_stack(context, osc, cluster)
    except exc.HTTPNotFound:
        LOG.info(
            _LI('The stack %s was not found during cluster'
                ' deletion.'), cluster.stack_id)
        try:
            trust_manager.delete_trustee_and_trust(osc, context, cluster)
            cert_manager.delete_certificates_from_cluster(cluster,
                                                          context=context)
            cluster.destroy()
        except exception.ClusterNotFound:
            LOG.info(_LI('The cluster %s has been deleted by others.'),
                     uuid)
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_SUCCESS)
        return None
    except exc.HTTPConflict:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise exception.OperationInProgress(cluster_name=cluster.name)
    except Exception:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
        raise

    cluster.status = fields.ClusterStatus.DELETE_IN_PROGRESS
    cluster.save()

    self._poll_and_check(osc, cluster, cluster_driver)

    return None
def cluster_update(self, context, cluster, rollback=False):
    """Push pending cluster changes via the cluster's driver.

    Only allowed while the cluster is in a settled *_COMPLETE state.
    On success the cluster is left in UPDATE_IN_PROGRESS; failures are
    persisted as UPDATE_FAILED before re-raising.
    """
    LOG.debug('cluster_heat cluster_update')

    osc = clients.OpenStackClients(context)
    # Only clusters that have fully completed their previous operation
    # may be updated.
    allow_update_status = (
        fields.ClusterStatus.CREATE_COMPLETE,
        fields.ClusterStatus.UPDATE_COMPLETE,
        fields.ClusterStatus.RESUME_COMPLETE,
        fields.ClusterStatus.RESTORE_COMPLETE,
        fields.ClusterStatus.ROLLBACK_COMPLETE,
        fields.ClusterStatus.SNAPSHOT_COMPLETE,
        fields.ClusterStatus.CHECK_COMPLETE,
        fields.ClusterStatus.ADOPT_COMPLETE
    )
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Updating a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    # No changed fields on the object -> nothing to update.
    delta = cluster.obj_what_changed()
    if not delta:
        return cluster

    manager = scale_manager.get_scale_manager(context, osc, cluster)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro, ct.coe)
    # Update cluster
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)
        cluster_driver.update_cluster(context, cluster, manager, rollback)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)

        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.save()
    return cluster
def cluster_resize(self, context, cluster,
                   node_count, nodes_to_remove, nodegroup):
    """Resize *nodegroup* to *node_count*, optionally removing nodes.

    Unlike other update paths, UPDATE_FAILED and UPDATE_IN_PROGRESS are
    also accepted so auto-scaler retries can recover a cluster whose
    last resize failed.  The new node count is saved before calling the
    driver; on failure the old count is restored and both cluster and
    nodegroup are persisted as UPDATE_FAILED before re-raising.
    """
    LOG.debug('cluster_conductor cluster_resize')

    osc = clients.OpenStackClients(context)
    # NOTE(flwang): One of important user cases of /resize API is
    # supporting the auto scaling action triggered by Kubernetes Cluster
    # Autoscaler, so there are 2 cases may happen:
    # 1. API could be triggered very offen
    # 2. Scale up or down may fail and we would like to offer the ability
    #    that recover the cluster to allow it being resized when last
    #    update failed.
    allow_update_status = (
        fields.ClusterStatus.CREATE_COMPLETE,
        fields.ClusterStatus.UPDATE_COMPLETE,
        fields.ClusterStatus.RESUME_COMPLETE,
        fields.ClusterStatus.RESTORE_COMPLETE,
        fields.ClusterStatus.ROLLBACK_COMPLETE,
        fields.ClusterStatus.SNAPSHOT_COMPLETE,
        fields.ClusterStatus.CHECK_COMPLETE,
        fields.ClusterStatus.ADOPT_COMPLETE,
        fields.ClusterStatus.UPDATE_FAILED,
        fields.ClusterStatus.UPDATE_IN_PROGRESS,
    )
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE,
            cluster)
        operation = _('Resizing a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    resize_manager = scale_manager.get_scale_manager(context, osc, cluster)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro, ct.coe)
    # Backup the old node count so that we can restore it
    # in case of an exception.
    old_node_count = nodegroup.node_count

    # Resize cluster
    try:
        nodegroup.node_count = node_count
        nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        nodegroup.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING,
            cluster)
        cluster_driver.resize_cluster(context, cluster, resize_manager,
                                      node_count, nodes_to_remove,
                                      nodegroup)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        # Restore the previous node count on failure.
        nodegroup.node_count = old_node_count
        nodegroup.status = fields.ClusterStatus.UPDATE_FAILED
        nodegroup.status_reason = six.text_type(e)
        nodegroup.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE,
            cluster)

        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.save()
    return cluster
def poll_and_check(self):
    """Poll the Heat stack once and reconcile cluster state.

    TODO(yuanying): temporary implementation to update api_address,
    node_addresses and cluster status.

    Raises loopingcall.LoopingCallDone() to end the polling loop once
    the stack reaches a terminal state, or when the max-attempts limit
    is exceeded.
    """
    stack = self.openstack_client.heat().stacks.get(self.cluster.stack_id)
    self.attempts += 1
    # Map terminal stack statuses to the CADF action used in the
    # outgoing notification.
    status_to_event = {
        fields.ClusterStatus.DELETE_COMPLETE: taxonomy.ACTION_DELETE,
        fields.ClusterStatus.CREATE_COMPLETE: taxonomy.ACTION_CREATE,
        fields.ClusterStatus.UPDATE_COMPLETE: taxonomy.ACTION_UPDATE,
        fields.ClusterStatus.ROLLBACK_COMPLETE: taxonomy.ACTION_UPDATE,
        fields.ClusterStatus.CREATE_FAILED: taxonomy.ACTION_CREATE,
        fields.ClusterStatus.DELETE_FAILED: taxonomy.ACTION_DELETE,
        fields.ClusterStatus.UPDATE_FAILED: taxonomy.ACTION_UPDATE,
        fields.ClusterStatus.ROLLBACK_FAILED: taxonomy.ACTION_UPDATE
    }
    # poll_and_check is detached and polling long time to check status,
    # so another user/client can call delete cluster/stack.
    if stack.stack_status == fields.ClusterStatus.DELETE_COMPLETE:
        self._delete_complete()
        conductor_utils.notify_about_cluster_operation(
            self.context, status_to_event[stack.stack_status],
            taxonomy.OUTCOME_SUCCESS)
        raise loopingcall.LoopingCallDone()

    if stack.stack_status in (fields.ClusterStatus.CREATE_COMPLETE,
                              fields.ClusterStatus.UPDATE_COMPLETE):
        self._sync_cluster_and_template_status(stack)
        conductor_utils.notify_about_cluster_operation(
            self.context, status_to_event[stack.stack_status],
            taxonomy.OUTCOME_SUCCESS)
        raise loopingcall.LoopingCallDone()
    elif stack.stack_status != self.cluster.status:
        self._sync_cluster_status(stack)

    if stack.stack_status in (fields.ClusterStatus.CREATE_FAILED,
                              fields.ClusterStatus.DELETE_FAILED,
                              fields.ClusterStatus.UPDATE_FAILED,
                              fields.ClusterStatus.ROLLBACK_COMPLETE,
                              fields.ClusterStatus.ROLLBACK_FAILED):
        self._sync_cluster_and_template_status(stack)
        self._cluster_failed(stack)
        conductor_utils.notify_about_cluster_operation(
            self.context, status_to_event[stack.stack_status],
            taxonomy.OUTCOME_FAILURE)
        raise loopingcall.LoopingCallDone()

    # Only check max attempts when the stack is being created when the
    # timeout hasn't been set.  If the timeout has been set then the
    # loop will end when the stack completes or the timeout occurs.
    # (Previously this was two duplicated branches with identical
    # log-and-stop bodies; collapsed into a single guarded check.  The
    # log message also gains the space that was missing between the
    # concatenated 'attempts,' and 'stack_id' fragments.)
    check_attempts = (
        stack.stack_status != fields.ClusterStatus.CREATE_IN_PROGRESS
        or stack.timeout_mins is None)
    if check_attempts and self.attempts > CONF.cluster_heat.max_attempts:
        LOG.error(
            _LE('Cluster check exit after %(attempts)s attempts, '
                'stack_id: %(id)s, stack_status: %(status)s') % {
                    'attempts': CONF.cluster_heat.max_attempts,
                    'id': self.cluster.stack_id,
                    'status': stack.stack_status})
        raise loopingcall.LoopingCallDone()
def cluster_resize(self, context, cluster, node_count,
                   nodes_to_remove, nodegroup):
    """Resize the given nodegroup of a cluster.

    Validates that the cluster is in a status that allows resizing,
    delegates the actual work to the COE driver, and restores the
    nodegroup's previous node count if the driver call fails.

    :param context: request context.
    :param cluster: the Cluster object being resized.
    :param node_count: target number of nodes for the nodegroup.
    :param nodes_to_remove: specific nodes to drop when scaling down.
    :param nodegroup: the nodegroup whose size is changing.
    :returns: the cluster object.
    :raises exception.NotSupported: if the cluster status does not
        allow resizing.
    """
    LOG.debug('cluster_conductor cluster_resize')

    os_clients = clients.OpenStackClients(context)

    # NOTE(flwang): One of important user cases of /resize API is
    # supporting the auto scaling action triggered by Kubernetes Cluster
    # Autoscaler, so there are 2 cases may happen:
    # 1. API could be triggered very offen
    # 2. Scale up or down may fail and we would like to offer the
    #    ability that recover the cluster to allow it being resized
    #    when last update failed.
    resizable_statuses = {
        fields.ClusterStatus.CREATE_COMPLETE,
        fields.ClusterStatus.UPDATE_COMPLETE,
        fields.ClusterStatus.RESUME_COMPLETE,
        fields.ClusterStatus.RESTORE_COMPLETE,
        fields.ClusterStatus.ROLLBACK_COMPLETE,
        fields.ClusterStatus.SNAPSHOT_COMPLETE,
        fields.ClusterStatus.CHECK_COMPLETE,
        fields.ClusterStatus.ADOPT_COMPLETE,
        fields.ClusterStatus.UPDATE_FAILED,
        fields.ClusterStatus.UPDATE_IN_PROGRESS,
    }
    if cluster.status not in resizable_statuses:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Resizing a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    mgr = scale_manager.get_scale_manager(context, os_clients, cluster)

    # Resolve the COE driver from the cluster's template.
    template = conductor_utils.retrieve_cluster_template(context, cluster)
    coe_driver = driver.Driver.get_driver(template.server_type,
                                          template.cluster_distro,
                                          template.coe)

    # Remember the current node count so it can be restored should the
    # driver call fail.
    saved_node_count = nodegroup.node_count

    try:
        nodegroup.node_count = node_count
        nodegroup.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)
        coe_driver.resize_cluster(context, cluster, mgr, node_count,
                                  nodes_to_remove, nodegroup)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        # Roll the nodegroup back to its pre-resize size.
        nodegroup.node_count = saved_node_count
        nodegroup.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        if isinstance(e, exc.HTTPBadRequest):
            raise exception.InvalidParameterValue(
                message=six.text_type(e))
        raise

    cluster.save()
    return cluster
def poll_and_check(self):
    """Poll the Heat stack once and mirror its status onto the cluster.

    Fetches the stack backing ``self.cluster``, syncs cluster (and,
    for terminal states, template) status from it, emits a success or
    failure notification for terminal stack statuses, and stops the
    polling loop by raising ``loopingcall.LoopingCallDone`` when the
    stack reaches a terminal status or the attempt budget is spent.

    :raises loopingcall.LoopingCallDone: to end the polling loop.
    """
    # TODO(yuanying): temporary implementation to update api_address,
    # node_addresses and cluster status
    stack = self.openstack_client.heat().stacks.get(self.cluster.stack_id)
    self.attempts += 1
    # Map each terminal stack status to the API action that caused it,
    # so the emitted notification carries the right event type.
    status_to_event = {
        fields.ClusterStatus.DELETE_COMPLETE: taxonomy.ACTION_DELETE,
        fields.ClusterStatus.CREATE_COMPLETE: taxonomy.ACTION_CREATE,
        fields.ClusterStatus.UPDATE_COMPLETE: taxonomy.ACTION_UPDATE,
        fields.ClusterStatus.ROLLBACK_COMPLETE: taxonomy.ACTION_UPDATE,
        fields.ClusterStatus.CREATE_FAILED: taxonomy.ACTION_CREATE,
        fields.ClusterStatus.DELETE_FAILED: taxonomy.ACTION_DELETE,
        fields.ClusterStatus.UPDATE_FAILED: taxonomy.ACTION_UPDATE,
        fields.ClusterStatus.ROLLBACK_FAILED: taxonomy.ACTION_UPDATE
    }
    # poll_and_check is detached and polling long time to check status,
    # so another user/client can call delete cluster/stack.
    if stack.stack_status == fields.ClusterStatus.DELETE_COMPLETE:
        self._delete_complete()
        conductor_utils.notify_about_cluster_operation(
            self.context, status_to_event[stack.stack_status],
            taxonomy.OUTCOME_SUCCESS)
        raise loopingcall.LoopingCallDone()

    if stack.stack_status in (fields.ClusterStatus.CREATE_COMPLETE,
                              fields.ClusterStatus.UPDATE_COMPLETE):
        self._sync_cluster_and_template_status(stack)
        conductor_utils.notify_about_cluster_operation(
            self.context, status_to_event[stack.stack_status],
            taxonomy.OUTCOME_SUCCESS)
        raise loopingcall.LoopingCallDone()
    elif stack.stack_status != self.cluster.status:
        self._sync_cluster_status(stack)

    if stack.stack_status in (fields.ClusterStatus.CREATE_FAILED,
                              fields.ClusterStatus.DELETE_FAILED,
                              fields.ClusterStatus.UPDATE_FAILED,
                              fields.ClusterStatus.ROLLBACK_COMPLETE,
                              fields.ClusterStatus.ROLLBACK_FAILED):
        self._sync_cluster_and_template_status(stack)
        self._cluster_failed(stack)
        conductor_utils.notify_about_cluster_operation(
            self.context, status_to_event[stack.stack_status],
            taxonomy.OUTCOME_FAILURE)
        raise loopingcall.LoopingCallDone()

    # Only enforce the max-attempts budget when Heat is not enforcing
    # a timeout itself: a CREATE_IN_PROGRESS stack with timeout_mins
    # set will end the loop on its own (completion or stack timeout),
    # so skip the attempt check in that case.
    if (stack.stack_status == fields.ClusterStatus.CREATE_IN_PROGRESS
            and stack.timeout_mins is not None):
        return
    if self.attempts > CONF.cluster_heat.max_attempts:
        # NOTE: the message previously rendered as "attempts,stack_id"
        # because of a missing space in the concatenated literal.
        LOG.error(_LE('Cluster check exit after %(attempts)s attempts, '
                      'stack_id: %(id)s, stack_status: %(status)s') %
                  {'attempts': CONF.cluster_heat.max_attempts,
                   'id': self.cluster.stack_id,
                   'status': stack.stack_status})
        raise loopingcall.LoopingCallDone()
def cluster_update(self, context, cluster, node_count, rollback=False):
    """Update (scale) a cluster's default worker nodegroup.

    Validates the cluster status, sets the new worker node count, and
    delegates the update to the COE driver; on failure the previous
    node count is restored and the cluster is marked UPDATE_FAILED.

    :param context: request context.
    :param cluster: the Cluster object to update.
    :param node_count: desired worker node count.
    :param rollback: whether Heat should roll back on update failure.
    :returns: the cluster object.
    :raises exception.NotSupported: if the cluster status does not
        allow updates.
    """
    LOG.debug('cluster_heat cluster_update')

    osc = clients.OpenStackClients(context)
    allow_update_status = (
        fields.ClusterStatus.CREATE_COMPLETE,
        fields.ClusterStatus.UPDATE_COMPLETE,
        fields.ClusterStatus.RESUME_COMPLETE,
        fields.ClusterStatus.RESTORE_COMPLETE,
        fields.ClusterStatus.ROLLBACK_COMPLETE,
        fields.ClusterStatus.SNAPSHOT_COMPLETE,
        fields.ClusterStatus.CHECK_COMPLETE,
        fields.ClusterStatus.ADOPT_COMPLETE
    )
    if cluster.status not in allow_update_status:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        operation = _('Updating a cluster when status is '
                      '"%s"') % cluster.status
        raise exception.NotSupported(operation=operation)

    # Updates will be only reflected to the default worker
    # nodegroup.
    worker_ng = cluster.default_ng_worker
    if worker_ng.node_count == node_count:
        # No-op update: return the cluster for consistency with every
        # other return path (previously this returned None).
        return cluster
    # Backup the old node count so that we can restore it
    # in case of an exception.
    old_node_count = worker_ng.node_count

    manager = scale_manager.get_scale_manager(context, osc, cluster)

    # Get driver
    ct = conductor_utils.retrieve_cluster_template(context, cluster)
    cluster_driver = driver.Driver.get_driver(ct.server_type,
                                              ct.cluster_distro,
                                              ct.coe)
    # Update cluster
    try:
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)
        worker_ng.node_count = node_count
        worker_ng.save()
        cluster_driver.update_cluster(context, cluster, manager, rollback)
        cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        cluster.status_reason = None
    except Exception as e:
        cluster.status = fields.ClusterStatus.UPDATE_FAILED
        cluster.status_reason = six.text_type(e)
        cluster.save()
        # Restore the node_count
        worker_ng.node_count = old_node_count
        worker_ng.save()
        conductor_utils.notify_about_cluster_operation(
            context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
        if isinstance(e, exc.HTTPBadRequest):
            e = exception.InvalidParameterValue(message=six.text_type(e))
            raise e
        raise

    cluster.save()
    return cluster