def info(self, ctxt, publisher_id, event_type, payload, metadata):
    """Handle an info-priority VM notification and request node recovery.

    A recovery request is sent only when the event type is one of
    ``self.VM_FAILURE_EVENTS``, the instance metadata names this endpoint's
    cluster, and the metadata carries a node id.

    :param ctxt: Context supplied by the notification listener (unused).
    :param publisher_id: Publisher of the notification; forwarded in the
                         recovery parameters.
    :param event_type: Event type string, looked up in
                       ``self.VM_FAILURE_EVENTS``.
    :param payload: Notification payload (dict). Reads ``metadata``,
                    ``state``, ``instance_id`` and ``user_id``.
    :param metadata: Notification metadata (dict); ``timestamp`` is read.
    :returns: None.
    """
    # Cheapest filter first: notifications this endpoint does not care
    # about must not fail just because their payload has another shape.
    if event_type not in self.VM_FAILURE_EVENTS:
        return

    # A payload without (or with a null) 'metadata' entry cannot belong to
    # any cluster, so treat it as "not ours" instead of raising KeyError.
    meta = payload.get('metadata') or {}
    cluster_id = meta.get('cluster_id')
    if not cluster_id or cluster_id != self.cluster_id:
        return

    node_id = meta.get('cluster_node_id')
    if not node_id:
        return

    # Only build the request parameters once we know an RPC will be sent.
    params = {
        'event': self.VM_FAILURE_EVENTS[event_type],
        'state': payload.get('state', 'Unknown'),
        'instance_id': payload.get('instance_id', 'Unknown'),
        'timestamp': metadata['timestamp'],
        'publisher': publisher_id,
        'operation': self.recover_action['operation'],
    }

    LOG.info("Requesting node recovery: %s", node_id)
    ctx = context.get_service_context(project_id=self.project_id,
                                      user_id=payload['user_id'])
    req = objects.NodeRecoverRequest(identity=node_id, params=params)
    self.rpc.call(ctx, 'node_recover', req)
def info(self, ctxt, publisher_id, event_type, payload, metadata):
    """Handle an info-priority stack notification and request recovery.

    The stack's tags are scanned for a ``cluster_id`` tag whose value
    equals ``self.cluster_id`` and for a ``cluster_node_id`` tag. A
    ``node_recover`` request is sent only when both are found.

    :param ctxt: Context supplied by the notification listener (unused).
    :param publisher_id: Publisher of the notification; forwarded in the
                         recovery parameters.
    :param event_type: Event type string, looked up in
                       ``self.STACK_FAILURE_EVENTS``.
    :param payload: Notification payload (dict). Reads ``tags``, ``state``,
                    ``stack_identity`` and ``user_identity``.
    :param metadata: Notification metadata (dict); ``timestamp`` is read.
    :returns: None.
    """
    if event_type not in self.STACK_FAILURE_EVENTS:
        return

    # A payload with no 'tags' key is treated like one with empty tags.
    tags = payload.get('tags')
    if not tags:
        return

    cluster_prefix = 'cluster_id'
    node_prefix = 'cluster_node_id'
    # The value starts one character after the prefix; that character is
    # presumably a '=' separator, but (as before) it is not validated.
    cluster_offset = len(cluster_prefix) + 1
    node_offset = len(node_prefix) + 1

    cluster_id = None
    node_id = None
    for tag in tags:
        if cluster_id is None and tag.startswith(cluster_prefix):
            value = tag[cluster_offset:]
            # Only a tag naming *this* cluster counts as a match.
            if value == self.cluster_id:
                cluster_id = value
        if node_id is None and tag.startswith(node_prefix):
            node_id = tag[node_offset:]

    if cluster_id is None or node_id is None:
        return

    params = {
        'event': self.STACK_FAILURE_EVENTS[event_type],
        'state': payload.get('state', 'Unknown'),
        'stack_id': payload.get('stack_identity', 'Unknown'),
        'timestamp': metadata['timestamp'],
        'publisher': publisher_id,
    }
    LOG.info("Requesting stack recovery: %s", node_id)
    # NOTE(review): other handlers in this file call get_service_context
    # with project_id=/user_id= keywords; confirm which names it expects.
    ctx = context.get_service_context(project=self.project_id,
                                      user=payload['user_identity'])
    req = objects.NodeRecoverRequest(identity=node_id, params=params)
    self.rpc.call(ctx, 'node_recover', req)
def _recover_node(self, ctx, node_id):
    """Ask the engine to recover a single node.

    The request carries ``self.recover_action`` as its parameters. Any
    exception raised while building or sending the request is logged and
    swallowed.

    :param ctx: Request context used for the RPC call.
    :param node_id: Identity of the node to recover.
    :returns: The result of the ``node_recover`` RPC call, or None when
              the attempt raised an exception.
    """
    try:
        request = objects.NodeRecoverRequest(identity=node_id,
                                             params=self.recover_action)
        outcome = self.rpc_client.call(ctx, 'node_recover', request)
    except Exception as err:
        # Best effort: report the failure and let the caller carry on.
        LOG.error("Error when performing node recovery for %s: %s",
                  node_id, err)
        return None
    return outcome
def _recover_node(self, node_id, ctx, recover_action):
    """Ask the engine to recover a single node.

    Logs which class is issuing the request, then sends a ``node_recover``
    RPC. Any exception raised along the way is logged and swallowed.

    :param node_id: Identity of the node to recover.
    :param ctx: Request context used for the RPC call.
    :param recover_action: Passed through as the ``params`` of the
                           recovery request.
    :returns: The result of the ``node_recover`` RPC call, or None when
              the attempt raised an exception.
    """
    try:
        requester = self.__class__.__name__
        LOG.info("%s is requesting node recovery for %s.",
                 requester, node_id)
        request = objects.NodeRecoverRequest(identity=node_id,
                                             params=recover_action)
        outcome = self.rpc_client.call(ctx, 'node_recover', request)
    except Exception as err:
        # Best effort: report the failure and let the caller carry on.
        LOG.error('Error when performing node recovery for %s: %s',
                  node_id, err)
        return None
    return outcome
def _poll_cluster(self, cluster_id, timeout, recover_action):
    """Routine to be executed for polling cluster status.

    Triggers a ``cluster_check`` on the cluster, waits for that action to
    finish, then requests recovery for every node whose status is not
    ``consts.NS_ACTIVE``.

    :param cluster_id: The UUID of the cluster to be checked.
    :param timeout: The maximum number of seconds to wait.
    :param recover_action: Passed through as the ``params`` of each node
                           recovery request.
    :returns: Whatever ``_chase_up(start_time, timeout)`` returns, on
              every path (presumably the delay before the next poll --
              confirm against ``_chase_up``).
    """
    start_time = timeutils.utcnow(True)
    cluster = objects.Cluster.get(self.ctx, cluster_id, project_safe=False)
    if not cluster:
        LOG.warning("Cluster (%s) is not found.", cluster_id)
        return _chase_up(start_time, timeout)

    # Act on behalf of the cluster's owner for all subsequent calls.
    ctx = context.get_service_context(user_id=cluster.user,
                                      project_id=cluster.project)
    params = {'delete_check_action': True}
    try:
        req = objects.ClusterCheckRequest(identity=cluster_id,
                                          params=params)
        action = self.rpc_client.call(ctx, 'cluster_check', req)
    except Exception as ex:
        LOG.warning(
            "Failed in triggering 'cluster_check' RPC for "
            "'%(c)s': %(r)s", {
                'c': cluster_id,
                'r': six.text_type(ex)
            })
        return _chase_up(start_time, timeout)

    # wait for action to complete
    res, reason = self._wait_for_action(ctx, action['action'], timeout)
    if not res:
        LOG.warning("%s", reason)
        return _chase_up(start_time, timeout)

    # loop through nodes to trigger recovery
    nodes = objects.Node.get_all_by_cluster(ctx, cluster_id)
    for node in nodes:
        if node.status != consts.NS_ACTIVE:
            LOG.info("Requesting node recovery: %s", node.id)
            # Guard each request individually: one failing RPC must not
            # stop recovery of the remaining nodes nor skip the final
            # _chase_up() return below.
            try:
                req = objects.NodeRecoverRequest(identity=node.id,
                                                 params=recover_action)
                self.rpc_client.call(ctx, 'node_recover', req)
            except Exception as ex:
                LOG.warning(
                    "Failed in triggering 'node_recover' RPC for "
                    "'%(n)s': %(r)s", {
                        'n': node.id,
                        'r': six.text_type(ex)
                    })

    return _chase_up(start_time, timeout)
def _check_url_and_recover_node(self, ctx, node, recover_action, params):
    """Poll a node's health URL and request recovery if it stays unhealthy.

    The URL is fetched up to ``poll_url_retry_limit`` times, sleeping
    ``poll_url_retry_interval`` seconds after each unhealthy response.
    Polling stops without recovery when the fetch fails, the response
    matches the healthy pattern, the node is not ACTIVE, or the node was
    updated within ``node_update_timeout`` seconds.

    :param ctx: The request context to use for the recovery action.
    :param node: The node to be checked.
    :param recover_action: Passed through as the ``params`` of the
                           recovery request.
    :param params: Dict holding the ``poll_url*`` settings and
                   ``node_update_timeout``.
    :returns: Result of the ``node_recover`` RPC call if recovery was
              requested. Otherwise None.
    """
    poll_template = params['poll_url']
    ssl_verify = params['poll_url_ssl_verify']
    healthy_pattern = params['poll_url_healthy_response']
    attempts_left = params['poll_url_retry_limit']
    pause_seconds = params['poll_url_retry_interval']
    update_grace = params['node_update_timeout']

    target = self._expand_url_template(poll_template, node)
    LOG.info("Polling node status from URL: %s", target)

    while attempts_left > 0:
        attempts_left -= 1

        try:
            response = utils.url_fetch(target, verify=ssl_verify)
        except utils.URLFetchError as err:
            # An unreachable URL is not taken as proof the node is down.
            LOG.error("Error when requesting node health status from"
                      " %s: %s", target, err)
            return None

        LOG.debug("Node status returned from URL(%s): %s",
                  target, response)
        if re.search(healthy_pattern, response):
            LOG.debug('Node %s is healthy', node.id)
            return None

        if node.status != consts.NS_ACTIVE:
            LOG.info("Skip node recovery because node %s is not in "
                     "ACTIVE state", node.id)
            return None

        # Fall back to the creation time for nodes never updated.
        last_change = node.updated_at or node.init_at
        if not timeutils.is_older_than(last_change, update_grace):
            LOG.info("Node %s was updated at %s which is less than "
                     "%d secs ago. Skip node recovery.",
                     node.id, last_change, update_grace)
            return None

        LOG.info("Node %s is reported as down (%d retries left)",
                 node.id, attempts_left)
        time.sleep(pause_seconds)

    # recover node after exhausting retries
    LOG.info("Requesting node recovery: %s", node.id)
    request = objects.NodeRecoverRequest(identity=node.id,
                                         params=recover_action)
    return self.rpc_client.call(ctx, 'node_recover', request)