def main(): args = get_parser().parse_args() # Set up logging to use the console console = logging.StreamHandler(sys.stderr) formatter = logging.Formatter( '[%(asctime)s] %(levelname)-8s %(message)s') console.setFormatter(formatter) root_logger.addHandler(console) if args.debug: root_logger.setLevel(logging.DEBUG) else: root_logger.setLevel(logging.INFO) _validate_conn_options(args) nosql_conf = cfg.ConfigOpts() db_options.set_defaults(nosql_conf, args.nosql_conn) nosql_conf.register_opts(storage.OPTS, 'database') nosql_conn = storage.get_connection_from_config(nosql_conf) sql_conf = cfg.ConfigOpts() db_options.set_defaults(sql_conf, args.sql_conn) sql_conf.register_opts(storage.OPTS, 'database') sql_conn = storage.get_connection_from_config(sql_conf) root_logger.info( _LI("Starting to migrate alarms data from NoSQL to SQL...")) count = 0 for alarm in nosql_conn.get_alarms(): root_logger.debug("Migrating alarm %s..." % alarm.alarm_id) try: sql_conn.create_alarm(alarm) count += 1 except exception.DBDuplicateEntry: root_logger.warning(_LW("Duplicated alarm %s found, skipped."), alarm.alarm_id) if not args.migrate_history: continue history_count = 0 for history in nosql_conn.get_alarm_changes(alarm.alarm_id, None): history_data = history.as_dict() root_logger.debug(" Migrating alarm history data with" " event_id %s..." % history_data['event_id']) try: sql_conn.record_alarm_change(history_data) history_count += 1 except exception.DBDuplicateEntry: root_logger.warning( _LW(" Duplicated alarm history %s found, skipped."), history_data['event_id']) root_logger.info(_LI(" Migrated %(count)s history data of alarm " "%(alarm_id)s"), {'count': history_count, 'alarm_id': alarm.alarm_id}) root_logger.info(_LI("End alarms data migration from NoSQL to SQL, %s" " alarms have been migrated."), count)
def conversion():
    """Interactively convert combination alarms into composite alarms.

    Prompts the operator for confirmation, then for every combination
    alarm builds an equivalent composite alarm (skipping ones already
    converted or with unconvertible dependencies) and optionally deletes
    the original combination alarms afterwards.
    """
    # Destructive tool: require an explicit 'yes' before doing anything.
    confirm = moves.input("This tool is used for converting the combination "
                          "alarms to composite alarms, please type 'yes' to "
                          "confirm: ")
    if confirm != 'yes':
        print("Alarm conversion aborted!")
        return
    args = get_parser().parse_args()
    conf = service.prepare_service()
    conn = storage.get_connection_from_config(conf)
    combination_alarms = list(conn.get_alarms(
        alarm_type='combination',
        alarm_id=args.alarm_id or None))
    count = 0
    for alarm in combination_alarms:
        new_name = 'From-combination: %s' % alarm.alarm_id
        # Idempotency: the converted alarm's name marks prior conversions.
        n_alarm = list(conn.get_alarms(name=new_name,
                                       alarm_type='composite'))
        if n_alarm:
            LOG.warning(_LW('Alarm %(alarm)s has been already converted as '
                            'composite alarm: %(n_alarm_id)s, skipped.'),
                        {'alarm': alarm.alarm_id,
                         'n_alarm_id': n_alarm[0].alarm_id})
            continue
        try:
            composite_rule = _generate_composite_rule(conn, alarm)
        except DependentAlarmNotFound as e:
            LOG.warning(_LW('The dependent alarm %(dep_alarm)s of alarm %'
                            '(com_alarm)s not found, skipped.'),
                        {'com_alarm': e.com_alarm_id,
                         'dep_alarm': e.dependent_alarm_id})
            continue
        except UnsupportedSubAlarmType as e:
            LOG.warning(_LW('Alarm conversion from combination to composite '
                            'only support combination alarms depending '
                            'threshold alarms, the type of alarm %(alarm)s '
                            'is: %(type)s, skipped.'),
                        {'alarm': e.sub_alarm_id,
                         'type': e.sub_alarm_type})
            continue
        # Clone the source alarm, then override the composite-specific
        # fields (fresh id, marker name, new rule).
        new_alarm = models.Alarm(**alarm.as_dict())
        new_alarm.alarm_id = str(uuid.uuid4())
        new_alarm.name = new_name
        new_alarm.type = 'composite'
        new_alarm.description = ('composite alarm converted from combination '
                                 'alarm: %s' % alarm.alarm_id)
        new_alarm.rule = composite_rule
        new_alarm.timestamp = datetime.datetime.now()
        conn.create_alarm(new_alarm)
        LOG.info(_LI('End Converting combination alarm %(s_alarm)s to '
                     'composite alarm %(d_alarm)s'),
                 {'s_alarm': alarm.alarm_id,
                  'd_alarm': new_alarm.alarm_id})
        count += 1
    if args.delete_combination_alarm:
        for alarm in combination_alarms:
            LOG.info(_LI('Deleting the combination alarm %s...'),
                     alarm.alarm_id)
            conn.delete_alarm(alarm.alarm_id)
    LOG.info(_LI('%s combination alarms have been converted to composite '
                 'alarms.'), count)
def notify(self, action, alarm_id, alarm_name, severity, previous, current,
           reason, reason_data, headers=None):
    """POST the alarm state transition to the REST endpoint in ``action``.

    :param action: parsed URL (urlsplit result) of the webhook target
    :param headers: optional extra HTTP headers; a request-id is added
                    when absent, and content-type is forced to JSON
    """
    headers = headers or {}
    if 'x-openstack-request-id' not in headers:
        # Tag the outgoing request so it can be correlated across logs.
        headers['x-openstack-request-id'] = b'req-' + str(
            uuid.uuid4()).encode('ascii')
    LOG.info(_LI(
        "Notifying alarm %(alarm_name)s %(alarm_id)s with severity"
        " %(severity)s from %(previous)s to %(current)s with action "
        "%(action)s because %(reason)s. request-id: %(request_id)s ") %
        ({'alarm_name': alarm_name, 'alarm_id': alarm_id,
          'severity': severity, 'previous': previous,
          'current': current, 'action': action, 'reason': reason,
          'request_id': headers['x-openstack-request-id']}))
    body = {'alarm_name': alarm_name, 'alarm_id': alarm_id,
            'severity': severity, 'previous': previous,
            'current': current, 'reason': reason,
            'reason_data': reason_data}
    headers['content-type'] = 'application/json'
    kwargs = {'data': jsonutils.dumps(body),
              'headers': headers}

    if action.scheme == 'https':
        default_verify = int(self.conf.rest_notifier_ssl_verify)
        options = urlparse.parse_qs(action.query)
        # The action URL's query string may override SSL verification
        # per alarm; the last occurrence of the parameter wins.
        verify = bool(int(options.get('aodh-alarm-ssl-verify',
                                      [default_verify])[-1]))
        if verify and self.conf.rest_notifier_ca_bundle_certificate_path:
            # Verify against the configured CA bundle instead of system CAs.
            verify = self.conf.rest_notifier_ca_bundle_certificate_path
        kwargs['verify'] = verify

        # Optional client certificate (with separate key file if given).
        cert = self.conf.rest_notifier_certificate_file
        key = self.conf.rest_notifier_certificate_key
        if cert:
            kwargs['cert'] = (cert, key) if key else cert

    # FIXME(rhonjo): Retries are automatically done by urllib3 in requests
    # library. However, there's no interval between retries in urllib3
    # implementation. It will be better to put some interval between
    # retries (future work).
    max_retries = self.conf.rest_notifier_max_retries
    session = requests.Session()
    session.mount(action.geturl(),
                  requests.adapters.HTTPAdapter(max_retries=max_retries))
    resp = session.post(action.geturl(), **kwargs)
    LOG.info(_LI('Notifying alarm <%(id)s> gets response: %(status_code)s '
                 '%(reason)s.'),
             {'id': alarm_id, 'status_code': resp.status_code,
              'reason': resp.reason})
def leave_group(self, group_id):
    """Leave the given partitioning group, if we are a member of it."""
    # Not a group we joined: nothing to do.
    if group_id not in self._groups:
        return
    if self._coordinator:
        # NOTE(review): the bookkeeping below appears to run only when a
        # coordinator is present — confirm a group can never remain in
        # self._groups after the coordinator goes away.
        self._coordinator.leave_group(group_id)
        self._groups.remove(group_id)
        LOG.info(_LI('Left partitioning group %s'), group_id)
def check_alarm_actions(alarm):
    """Validate (and de-duplicate) the per-state action lists of an alarm.

    For every alarm state: drops duplicate actions, enforces the
    configured maximum number of actions, and rejects actions whose URL
    cannot be parsed, whose scheme is unsupported, or which a
    project-scoped user is not authorized to create.

    :raises base.ClientSideError: on any validation failure
    """
    max_actions = pecan.request.cfg.api.alarm_max_actions
    for state in state_kind:
        # e.g. "ok" -> "ok_actions", "insufficient data" ->
        # "insufficient_data_actions"
        actions_name = state.replace(" ", "_") + "_actions"
        actions = getattr(alarm, actions_name)
        if not actions:
            continue

        # Silently drop duplicated actions rather than rejecting the alarm.
        action_set = set(actions)
        if len(actions) != len(action_set):
            LOG.info(_LI("duplicate actions are found: %s, "
                         "remove duplicate ones"), actions)
            actions = list(action_set)
            setattr(alarm, actions_name, actions)

        # A max_actions of 0 (or negative) disables the limit.
        if 0 < max_actions < len(actions):
            error = _("%(name)s count exceeds maximum value "
                      "%(maximum)d") % {
                "name": actions_name,
                "maximum": max_actions,
            }
            raise base.ClientSideError(error)

        limited = rbac.get_limited_to_project(pecan.request.headers,
                                              pecan.request.enforcer)

        for action in actions:
            try:
                url = netutils.urlsplit(action)
            except Exception:
                error = _("Unable to parse action %s") % action
                raise base.ClientSideError(error)
            if url.scheme not in ACTIONS_SCHEMA:
                error = _("Unsupported action %s") % action
                raise base.ClientSideError(error)
            # Project-scoped users may not create log/test actions.
            if limited and url.scheme in ("log", "test"):
                error = _("You are not authorized to create "
                          "action: %s") % action
                raise base.ClientSideError(error, status_code=401)
def _refresh(self, alarm, state, reason, reason_data, always_record=False):
    """Refresh alarm state.

    Applies ``state`` to ``alarm``; when the state actually changes (or
    ``always_record`` is set) the alarm is persisted, the transition is
    recorded in the alarm history, and notifiers fire. An unchanged
    state only notifies when the alarm has repeat_actions enabled.
    Any failure is logged and swallowed so evaluation retries naturally
    on the next cycle.

    :param alarm: the alarm object being evaluated
    :param state: new state to transition to
    :param reason: human-readable reason for the transition
    :param reason_data: structured reason payload for notifiers
    :param always_record: force persist/record even if state is unchanged
    """
    try:
        previous = alarm.state
        alarm.state = state
        if previous != state or always_record:
            LOG.info(_LI('alarm %(id)s transitioning to %(state)s because '
                         '%(reason)s'), {'id': alarm.alarm_id,
                                         'state': state,
                                         'reason': reason})
            try:
                self._storage_conn.update_alarm(alarm)
            except storage.AlarmNotFound:
                # Fix: the adjacent literals previously joined as
                # "thealarm" — a space was missing between them.
                LOG.warning(_LW("Skip updating this alarm's state, the "
                                "alarm: %s has been deleted"),
                            alarm.alarm_id)
            else:
                self._record_change(alarm, reason)
            self.notifier.notify(alarm, previous, reason, reason_data)
        elif alarm.repeat_actions:
            self.notifier.notify(alarm, previous, reason, reason_data)
    except Exception:
        # retry will occur naturally on the next evaluation
        # cycle (unless alarm state reverts in the meantime)
        LOG.exception(_LE('alarm state update failed'))
def notify(self, action, alarm_id, alarm_name, severity, previous, current,
           reason, reason_data, headers=None):
    """Log the alarm state transition and forward it to the Zaqar queue.

    NOTE(review): ``headers`` is accepted but not forwarded to
    ``notify_zaqar`` here — confirm whether that is intentional.
    """
    details = {
        "alarm_name": alarm_name,
        "alarm_id": alarm_id,
        "severity": severity,
        "previous": previous,
        "current": current,
        "action": action,
        "reason": reason,
    }
    LOG.info(_LI(
        "Notifying alarm %(alarm_name)s %(alarm_id)s of %(severity)s "
        "priority from %(previous)s to %(current)s with action "
        "%(action)s because %(reason)s.") % details)
    payload = {
        "alarm_name": alarm_name,
        "alarm_id": alarm_id,
        "severity": severity,
        "previous": previous,
        "current": current,
        "reason": reason,
        "reason_data": reason_data,
    }
    self.notify_zaqar(action, {"body": payload})
def _refresh(self, alarm, state, reason, reason_data, always_record=False):
    """Refresh alarm state.

    Applies ``state`` to ``alarm``; when the state actually changes (or
    ``always_record`` is set) the alarm is persisted, the transition is
    recorded in the alarm history, and notifiers fire. An unchanged
    state only notifies when the alarm has repeat_actions enabled.
    Any failure is logged and swallowed so evaluation retries naturally
    on the next cycle.

    :param alarm: the alarm object being evaluated
    :param state: new state to transition to
    :param reason: human-readable reason for the transition
    :param reason_data: structured reason payload for notifiers
    :param always_record: force persist/record even if state is unchanged
    """
    try:
        previous = alarm.state
        alarm.state = state
        if previous != state or always_record:
            LOG.info(
                _LI('alarm %(id)s transitioning to %(state)s because '
                    '%(reason)s'), {
                    'id': alarm.alarm_id,
                    'state': state,
                    'reason': reason
                })
            try:
                self._storage_conn.update_alarm(alarm)
            except storage.AlarmNotFound:
                # Fix: the adjacent literals previously joined as
                # "thealarm" — a space was missing between them.
                LOG.warning(
                    _LW("Skip updating this alarm's state, the "
                        "alarm: %s has been deleted"), alarm.alarm_id)
            else:
                self._record_change(alarm, reason)
            self.notifier.notify(alarm, previous, reason, reason_data)
        elif alarm.repeat_actions:
            self.notifier.notify(alarm, previous, reason, reason_data)
    except Exception:
        # retry will occur naturally on the next evaluation
        # cycle (unless alarm state reverts in the meantime)
        LOG.exception(_LE('alarm state update failed'))
def notify(self, action, alarm_id, alarm_name, severity, previous, current,
           reason, reason_data, headers=None):
    """Log the alarm state transition and enqueue it on the Zaqar endpoint."""
    details = {
        'alarm_name': alarm_name,
        'alarm_id': alarm_id,
        'severity': severity,
        'previous': previous,
        'current': current,
        'action': action,
        'reason': reason,
    }
    LOG.info(_LI(
        "Notifying alarm %(alarm_name)s %(alarm_id)s of %(severity)s "
        "priority from %(previous)s to %(current)s with action "
        "%(action)s because %(reason)s.") % details)
    payload = {
        'alarm_name': alarm_name,
        'alarm_id': alarm_id,
        'severity': severity,
        'previous': previous,
        'current': current,
        'reason': reason,
        'reason_data': reason_data,
    }
    self.notify_zaqar(action, {'body': payload}, headers)
def _generate_composite_rule(conn, combin_alarm):
    """Translate a combination alarm's rule into a composite-rule dict.

    Recursively resolves each dependent alarm; threshold-style rules are
    embedded directly, nested combination alarms are converted in turn.

    :raises DependentAlarmNotFound: when a referenced alarm is missing
    :raises UnsupportedSubAlarmType: for any other dependent alarm type
    """
    dependent_ids = combin_alarm.rule['alarm_ids']
    combine_op = combin_alarm.rule['operator']
    LOG.info(
        _LI('Start converting combination alarm %(alarm)s, it depends on '
            'alarms: %(alarm_ids)s'),
        {'alarm': combin_alarm.alarm_id, 'alarm_ids': str(dependent_ids)})
    threshold_types = ('threshold',
                       'gnocchi_resources_threshold',
                       'gnocchi_aggregation_by_metrics_threshold',
                       'gnocchi_aggregation_by_resources_threshold')
    sub_rules = []
    for dep_id in dependent_ids:
        matches = list(conn.get_alarms(alarm_id=dep_id))
        if not matches:
            raise DependentAlarmNotFound(combin_alarm.alarm_id, dep_id)
        dep_alarm = matches[0]
        if dep_alarm.type in threshold_types:
            # Tag the rule with its alarm type so the composite evaluator
            # knows how to interpret it.
            dep_alarm.rule.update(type=dep_alarm.type)
            sub_rules.append(dep_alarm.rule)
        elif dep_alarm.type == 'combination':
            sub_rules.append(_generate_composite_rule(conn, dep_alarm))
        else:
            raise UnsupportedSubAlarmType(dep_id, dep_alarm.type)
    return {combine_op: sub_rules}
def _evaluate_assigned_alarms(self):
    """Run one evaluation pass over the alarms assigned to this worker."""
    try:
        assigned = self._assigned_alarms()
        LOG.info(_LI('initiating evaluation cycle on %d alarms'),
                 len(assigned))
        for one_alarm in assigned:
            self._evaluate_alarm(one_alarm)
    except Exception:
        # A failed cycle is logged rather than raised so the periodic
        # evaluation loop keeps running.
        LOG.exception(_LE('alarm evaluation cycle failed'))
def expirer():
    """Purge alarm-history records older than the configured TTL."""
    conf = service.prepare_service()
    ttl = conf.database.alarm_history_time_to_live
    if ttl <= 0:
        # TTL unset/disabled: nothing to purge.
        LOG.info(_LI("Nothing to clean, database alarm history time to live "
                     "is disabled"))
        return
    LOG.debug("Clearing expired alarm history data")
    conn = storage.get_connection_from_config(conf)
    conn.clear_expired_alarm_history_data(ttl)
def start(self):
    """Start the tooz coordinator when a backend URL is configured."""
    if not self.backend_url:
        # Coordination disabled: nothing to start.
        return
    try:
        self._coordinator = tooz.coordination.get_coordinator(
            self.backend_url, self._my_id)
        self._coordinator.start()
        LOG.info(_LI('Coordination backend started successfully.'))
    except tooz.coordination.ToozError:
        # Log and continue; the service can run uncoordinated.
        LOG.exception(_LE('Error connecting to coordination backend.'))
def clear_expired_alarm_history_data(alarm_history_ttl):
    """Clear expired alarm history data from the backend storage system.

    Clearing occurs according to the time-to-live.

    :param alarm_history_ttl: Number of seconds to keep alarm history
                              records for.
    """
    # NOTE(review): this only logs the intent — presumably a default/base
    # implementation meant to be overridden by concrete storage drivers;
    # confirm against the driver hierarchy.
    LOG.info(_LI('Dropping alarm history data with TTL %d'),
             alarm_history_ttl)
def clear_expired_alarm_history_data(self, alarm_history_ttl):
    """Clear expired alarm history data from the backend storage system.

    Clearing occurs according to the time-to-live.

    :param alarm_history_ttl: Number of seconds to keep alarm history
                              records for.
    """
    # NOTE(review): this only logs the intent — presumably a default/base
    # implementation meant to be overridden by concrete storage drivers;
    # confirm against the driver hierarchy.
    LOG.info(_LI('Dropping alarm history data with TTL %d'),
             alarm_history_ttl)
def expirer():
    """Remove alarm-history entries whose configured TTL has elapsed."""
    conf = service.prepare_service()
    history_ttl = conf.database.alarm_history_time_to_live
    if history_ttl > 0:
        LOG.debug("Clearing expired alarm history data")
        connection = storage.get_connection_from_config(conf)
        connection.clear_expired_alarm_history_data(history_ttl)
    else:
        # A non-positive TTL means history retention is unlimited.
        LOG.info(_LI("Nothing to clean, database alarm history time to live "
                     "is disabled"))
def start(self):
    """Start the tooz coordination backend, tracking started state."""
    url = cfg.CONF.coordination.backend_url
    if not url:
        # No backend configured: coordination stays off.
        return
    try:
        self._coordinator = tooz.coordination.get_coordinator(
            url, self._my_id)
        self._coordinator.start()
        self._started = True
        LOG.info(_LI('Coordination backend started successfully.'))
    except tooz.coordination.ToozError:
        # Record the failure so callers can tell coordination is down.
        self._started = False
        LOG.exception(_LE('Error connecting to coordination backend.'))
def load_app(conf):
    """Build the WSGI application from the paste-deploy config file.

    Relative paths are resolved through the config file search path;
    absolute paths must exist on disk.

    :raises cfg.ConfigFilesNotFoundError: when the paste config cannot
        be located
    """
    paste_path = conf.api.paste_config
    if os.path.isabs(paste_path):
        resolved = paste_path if os.path.exists(paste_path) else None
    else:
        resolved = conf.find_file(paste_path)
    if not resolved:
        raise cfg.ConfigFilesNotFoundError([conf.api.paste_config])
    LOG.info(_LI("Full WSGI config used: %s"), resolved)
    return deploy.loadapp("config:" + resolved)
def clear_expired_alarm_history_data(self, alarm_history_ttl):
    """Clear expired alarm history data from the backend storage system.

    Clearing occurs according to the time-to-live.

    :param alarm_history_ttl: Number of seconds to keep alarm history
                              records for.
    """
    db_session = self._engine_facade.get_session()
    with db_session.begin():
        # Anything recorded before this instant has outlived its TTL.
        cutoff = timeutils.utcnow() - datetime.timedelta(
            seconds=alarm_history_ttl)
        query = db_session.query(models.AlarmChange)
        query = query.filter(models.AlarmChange.timestamp < cutoff)
        removed = query.delete()
        LOG.info(_LI("%d alarm histories are removed from database"),
                 removed)
def clear_expired_alarm_history_data(self, alarm_history_ttl):
    """Clear expired alarm history data from the backend storage system.

    Clearing occurs according to the time-to-live.

    :param alarm_history_ttl: Number of seconds to keep alarm history
                              records for.
    """
    session = self._engine_facade.get_session()
    with session.begin():
        # Rows with a timestamp before this threshold are expired.
        threshold = timeutils.utcnow() - datetime.timedelta(
            seconds=alarm_history_ttl)
        expired = session.query(models.AlarmChange).filter(
            models.AlarmChange.timestamp < threshold)
        count = expired.delete()
        LOG.info(_LI("%d alarm histories are removed from database"),
                 count)
def join_group(self, group_id):
    """Join the given partitioning group, creating it if necessary.

    Loops (potentially forever) until the join succeeds; a missing group
    is created on the fly. A no-op when coordination is not configured,
    not started, or no group id is given.
    """
    if not self._coordinator or not self._started or not group_id:
        return
    while True:
        try:
            join_req = self._coordinator.join_group(group_id)
            join_req.get()
            LOG.info(_LI('Joined partitioning group %s'), group_id)
            break
        except tooz.coordination.MemberAlreadyExist:
            # NOTE(review): returning here skips the _groups bookkeeping
            # below — confirm the group is already tracked in this case.
            return
        except tooz.coordination.GroupNotCreated:
            # Group does not exist yet: create it and retry the join.
            create_grp_req = self._coordinator.create_group(group_id)
            try:
                create_grp_req.get()
            except tooz.coordination.GroupAlreadyExist:
                # Lost the creation race to another member; just retry.
                pass
    self._groups.add(group_id)
def _inner():
    # NOTE(review): nested retry helper — relies on an enclosing function
    # for ``self`` and ``group_id``, and presumably on a retry decorator,
    # since it raises ErrorJoiningPartitioningGroup to trigger another
    # attempt. Confirm against the enclosing join_group definition.
    try:
        join_req = self._coordinator.join_group(group_id)
        join_req.get()
        LOG.info(_LI('Joined partitioning group %s'), group_id)
    except tooz.coordination.MemberAlreadyExist:
        # Already a member: nothing more to do.
        return
    except tooz.coordination.GroupNotCreated:
        # Group does not exist yet: create it (tolerating a creation
        # race), then signal the retry machinery to re-attempt the join.
        create_grp_req = self._coordinator.create_group(group_id)
        try:
            create_grp_req.get()
        except tooz.coordination.GroupAlreadyExist:
            pass
        raise ErrorJoiningPartitioningGroup()
    except tooz.coordination.ToozError:
        LOG.exception(_LE('Error joining partitioning group %s,'
                          ' re-trying'), group_id)
        raise ErrorJoiningPartitioningGroup()

self._groups.add(group_id)
def check_alarm_actions(alarm):
    """Validate (and de-duplicate) the per-state action lists of an alarm.

    For every alarm state: drops duplicate actions, enforces the
    configured maximum number of actions, and rejects actions whose URL
    cannot be parsed, whose scheme is unsupported, or which a
    project-scoped user is not authorized to create.

    :raises base.ClientSideError: on any validation failure
    """
    max_actions = pecan.request.cfg.api.alarm_max_actions
    for state in state_kind:
        # e.g. "ok" -> "ok_actions", "insufficient data" ->
        # "insufficient_data_actions"
        actions_name = state.replace(" ", "_") + '_actions'
        actions = getattr(alarm, actions_name)
        if not actions:
            continue

        # Silently drop duplicated actions rather than rejecting the alarm.
        action_set = set(actions)
        if len(actions) != len(action_set):
            LOG.info(
                _LI('duplicate actions are found: %s, '
                    'remove duplicate ones'), actions)
            actions = list(action_set)
            setattr(alarm, actions_name, actions)

        # A max_actions of 0 (or negative) disables the limit.
        if 0 < max_actions < len(actions):
            error = _('%(name)s count exceeds maximum value '
                      '%(maximum)d') % {
                "name": actions_name,
                "maximum": max_actions
            }
            raise base.ClientSideError(error)

        limited = rbac.get_limited_to_project(pecan.request.headers,
                                              pecan.request.enforcer)

        for action in actions:
            try:
                url = netutils.urlsplit(action)
            except Exception:
                error = _("Unable to parse action %s") % action
                raise base.ClientSideError(error)
            if url.scheme not in ACTIONS_SCHEMA:
                error = _("Unsupported action %s") % action
                raise base.ClientSideError(error)
            # Project-scoped users may not create log/test actions.
            if limited and url.scheme in ('log', 'test'):
                error = _('You are not authorized to create '
                          'action: %s') % action
                raise base.ClientSideError(error, status_code=401)
def load_app(conf):
    """Build the WSGI app, registering its config under a unique key.

    The conf object is stashed in the module-level APPCONFIGS registry
    and referenced by key through paste-deploy's global_conf, so the app
    factory can retrieve it later.

    :raises cfg.ConfigFilesNotFoundError: when the paste config cannot
        be located
    """
    global APPCONFIGS

    paste_path = conf.api.paste_config
    if not os.path.isabs(paste_path):
        paste_path = conf.find_file(paste_path)
    if paste_path is None or not os.path.exists(paste_path):
        raise cfg.ConfigFilesNotFoundError([conf.api.paste_config])

    key = str(uuid.uuid4())
    APPCONFIGS[key] = {'conf': conf}

    LOG.info(_LI("WSGI config used: %s"), paste_path)
    pipeline = "aodh+" + (conf.api.auth_mode or "noauth")
    return deploy.loadapp("config:" + paste_path,
                          name=pipeline,
                          global_conf={'configkey': key})
def _generate_composite_rule(conn, combin_alarm):
    """Build the composite-rule equivalent of a combination alarm's rule.

    Each dependent alarm is looked up: threshold-family rules are
    embedded directly (tagged with their type), nested combination
    alarms are converted recursively.

    :raises DependentAlarmNotFound: when a referenced alarm is missing
    :raises UnsupportedSubAlarmType: for any other dependent alarm type
    """
    member_ids = combin_alarm.rule['alarm_ids']
    operator_key = combin_alarm.rule['operator']
    LOG.info(_LI('Start converting combination alarm %(alarm)s, it depends on '
                 'alarms: %(alarm_ids)s'),
             {'alarm': combin_alarm.alarm_id,
              'alarm_ids': str(member_ids)})
    supported = ('threshold',
                 'gnocchi_resources_threshold',
                 'gnocchi_aggregation_by_metrics_threshold',
                 'gnocchi_aggregation_by_resources_threshold')
    converted = []
    for member_id in member_ids:
        found = list(conn.get_alarms(alarm_id=member_id))
        if not found:
            raise DependentAlarmNotFound(combin_alarm.alarm_id, member_id)
        member = found[0]
        if member.type in supported:
            # Embed the alarm type so the composite evaluator can
            # interpret the nested rule.
            member.rule.update(type=member.type)
            converted.append(member.rule)
        elif member.type == 'combination':
            converted.append(_generate_composite_rule(conn, member))
        else:
            raise UnsupportedSubAlarmType(member_id, member.type)
    return {operator_key: converted}
def notify(self, action, alarm_id, alarm_name, severity, previous, current,
           reason, reason_data, headers=None):
    """POST the alarm state transition to the REST endpoint in ``action``.

    :param action: parsed URL (urlsplit result) of the webhook target
    :param headers: optional extra HTTP headers; a request-id is added
                    when absent, and content-type is forced to JSON
    """
    headers = headers or {}
    if 'x-openstack-request-id' not in headers:
        # Tag the outgoing request so it can be correlated across logs.
        headers['x-openstack-request-id'] = b'req-' + \
            uuidutils.generate_uuid().encode('ascii')
    LOG.info(
        _LI("Notifying alarm %(alarm_name)s %(alarm_id)s with severity"
            " %(severity)s from %(previous)s to %(current)s with action "
            "%(action)s because %(reason)s. request-id: %(request_id)s ") %
        ({
            'alarm_name': alarm_name,
            'alarm_id': alarm_id,
            'severity': severity,
            'previous': previous,
            'current': current,
            'action': action,
            'reason': reason,
            'request_id': headers['x-openstack-request-id']
        }))
    body = {
        'alarm_name': alarm_name,
        'alarm_id': alarm_id,
        'severity': severity,
        'previous': previous,
        'current': current,
        'reason': reason,
        'reason_data': reason_data
    }
    headers['content-type'] = 'application/json'
    kwargs = {'data': jsonutils.dumps(body), 'headers': headers}

    if action.scheme == 'https':
        default_verify = int(self.conf.rest_notifier_ssl_verify)
        options = urlparse.parse_qs(action.query)
        # The action URL's query string may override SSL verification
        # per alarm; the last occurrence of the parameter wins.
        verify = bool(
            int(
                options.get('aodh-alarm-ssl-verify',
                            [default_verify])[-1]))
        if verify and self.conf.rest_notifier_ca_bundle_certificate_path:
            # Verify against the configured CA bundle instead of system CAs.
            verify = self.conf.rest_notifier_ca_bundle_certificate_path
        kwargs['verify'] = verify

        # Optional client certificate (with separate key file if given).
        cert = self.conf.rest_notifier_certificate_file
        key = self.conf.rest_notifier_certificate_key
        if cert:
            kwargs['cert'] = (cert, key) if key else cert

    # FIXME(rhonjo): Retries are automatically done by urllib3 in requests
    # library. However, there's no interval between retries in urllib3
    # implementation. It will be better to put some interval between
    # retries (future work).
    max_retries = self.conf.rest_notifier_max_retries
    session = requests.Session()
    session.mount(action.geturl(),
                  requests.adapters.HTTPAdapter(max_retries=max_retries))
    resp = session.post(action.geturl(), **kwargs)
    LOG.info(
        _LI('Notifying alarm <%(id)s> gets response: %(status_code)s '
            '%(reason)s.'), {
            'id': alarm_id,
            'status_code': resp.status_code,
            'reason': resp.reason
        })
def conversion():
    """Convert combination alarms into equivalent composite alarms.

    For every combination alarm (optionally filtered to one alarm id),
    builds a composite alarm with the same semantics, skipping alarms
    already converted or with unconvertible dependencies; optionally
    deletes the original combination alarms afterwards.
    """
    args = get_parser().parse_args()
    conf = service.prepare_service([])
    conn = storage.get_connection_from_config(conf)
    combination_alarms = list(
        conn.get_alarms(alarm_type='combination',
                        alarm_id=args.alarm_id or None))
    count = 0
    for alarm in combination_alarms:
        new_name = 'From-combination: %s' % alarm.alarm_id
        # Idempotency: the converted alarm's name marks prior conversions.
        n_alarm = list(conn.get_alarms(name=new_name, alarm_type='composite'))
        if n_alarm:
            LOG.warning(
                _LW('Alarm %(alarm)s has been already converted as '
                    'composite alarm: %(n_alarm_id)s, skipped.'), {
                    'alarm': alarm.alarm_id,
                    'n_alarm_id': n_alarm[0].alarm_id
                })
            continue
        try:
            composite_rule = _generate_composite_rule(conn, alarm)
        except DependentAlarmNotFound as e:
            LOG.warning(
                _LW('The dependent alarm %(dep_alarm)s of alarm %'
                    '(com_alarm)s not found, skipped.'), {
                    'com_alarm': e.com_alarm_id,
                    'dep_alarm': e.dependent_alarm_id
                })
            continue
        except UnsupportedSubAlarmType as e:
            LOG.warning(
                _LW('Alarm conversion from combination to composite '
                    'only support combination alarms depending '
                    'threshold alarms, the type of alarm %(alarm)s '
                    'is: %(type)s, skipped.'), {
                    'alarm': e.sub_alarm_id,
                    'type': e.sub_alarm_type
                })
            continue
        # Clone the source alarm, then override the composite-specific
        # fields (fresh id, marker name, new rule).
        new_alarm = models.Alarm(**alarm.as_dict())
        new_alarm.alarm_id = uuidutils.generate_uuid()
        new_alarm.name = new_name
        new_alarm.type = 'composite'
        new_alarm.description = ('composite alarm converted from combination '
                                 'alarm: %s' % alarm.alarm_id)
        new_alarm.rule = composite_rule
        new_alarm.timestamp = datetime.datetime.now()
        conn.create_alarm(new_alarm)
        LOG.info(
            _LI('End Converting combination alarm %(s_alarm)s to '
                'composite alarm %(d_alarm)s'), {
                's_alarm': alarm.alarm_id,
                'd_alarm': new_alarm.alarm_id
            })
        count += 1
    if args.delete_combination_alarm:
        for alarm in combination_alarms:
            LOG.info(_LI('Deleting the combination alarm %s...'),
                     alarm.alarm_id)
            conn.delete_alarm(alarm.alarm_id)
    LOG.info(
        _LI('%s combination alarms have been converted to composite '
            'alarms.'), count)