def __init__(self, queue, parent_pid):
    super(PrometheusMetrics, self).__init__()
    self.q = queue
    self.daemon = True
    self.parent_pid = parent_pid
    self.current_pid = getpid()
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
def __init__(self, queue, parent_pid, skip_mini, worker_number, canary=False):
    super(Worker, self).__init__()
    # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
    if settings.REDIS_PASSWORD:
        # @modified 20191014 - Bug #3266: py3 Redis binary objects not strings
        #                      Branch #3262: py3
        # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
        self.redis_conn = StrictRedis(
            password=settings.REDIS_PASSWORD,
            unix_socket_path=settings.REDIS_SOCKET_PATH,
            charset='utf-8', decode_responses=True)
    else:
        # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
        self.redis_conn = StrictRedis(
            unix_socket_path=settings.REDIS_SOCKET_PATH,
            charset='utf-8', decode_responses=True)
    self.q = queue
    self.parent_pid = parent_pid
    self.daemon = True
    self.canary = canary
    self.skip_mini = skip_mini
    # @added 20201017 - Feature #3788: snab_flux_load_test
    #                   Feature #3680: horizon.worker.datapoints_sent_to_redis
    # Added worker_number
    self.worker_number = worker_number
    # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
    # Added get_redis_conn_decoded
    self.redis_conn_decoded = get_redis_conn_decoded(parent_skyline_app)
def __init__(self, parent_pid):
    super(Aggregator, self).__init__()
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.parent_pid = parent_pid
    self.daemon = True
    self.current_pid = getpid()
def get_base_name_from_metric_id(current_skyline_app, metric_id):
    """
    Returns the base_name for a metric id from the
    aet.metrics_manager.ids_with_metric_names Redis hash, or from the DB if it
    is not found in Redis.

    :param current_skyline_app: the app calling the function
    :param metric_id: the metric id to lookup the base_name for.
    :type current_skyline_app: str
    :type metric_id: int
    :return: base_name
    :rtype: str

    """
    redis_key = 'aet.metrics_manager.ids_with_metric_names'
    function_str = 'functions.metrics.get_base_name_from_metric_id'

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: get_redis_conn_decoded failed - %s' % (
            current_skyline_app, function_str, e))
        return metric_id

    base_name = None
    try:
        base_name = redis_conn_decoded.hget(redis_key, metric_id)
        # DEBUG
        current_logger.info('debug :: %s :: %s :: hget(%s, %s)' % (
            current_skyline_app, function_str, redis_key, str(metric_id)))
    except Exception as err:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: failed to get base_name for %s: %s' % (
            current_skyline_app, function_str, str(metric_id), str(err)))

    if not base_name:
        try:
            base_name = base_name_from_metric_id(current_skyline_app, metric_id, False)
        except Exception as err:
            current_logger.error('error :: %s :: %s :: base_name_from_metric_id failed to determine base_name from metric_id: %s - %s' % (
                current_skyline_app, function_str, str(metric_id), str(err)))

    return base_name
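# Example usage (a minimal sketch, not part of the Skyline codebase): resolve a
# list of metric ids to base_names with get_base_name_from_metric_id.  It
# assumes the aet.metrics_manager.ids_with_metric_names Redis hash is populated
# and uses 'webapp' purely as an illustrative calling app name.
def example_base_names_for_ids(metric_ids):
    """
    Hypothetical helper returning a dict of {metric_id: base_name}, skipping
    any id that cannot be resolved.
    """
    resolved = {}
    for metric_id in metric_ids:
        base_name = get_base_name_from_metric_id('webapp', metric_id)
        # On a Redis connection failure the function returns the metric_id
        # passed to it, so guard against a non-str value
        if base_name and isinstance(base_name, str):
            resolved[metric_id] = base_name
    return resolved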
def __init__(self, parent_pid): """ Initialize Rolling """ super(RollingThunder, self).__init__() self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded(skyline_app) self.daemon = True self.parent_pid = parent_pid self.current_pid = getpid()
def __init__(self, parent_pid): """ Initialize the SNAB_flux_load_test """ super(SNAB_flux_load_test, self).__init__() self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded(skyline_app) self.daemon = True self.parent_pid = parent_pid self.current_pid = getpid()
def __init__(self, parent_pid): """ Initialize RelatedMetrics """ super(RelatedMetrics, self).__init__() self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded(skyline_app) self.daemon = True self.parent_pid = parent_pid self.current_pid = getpid()
def __init__(self, parent_pid): """ Initialize Cloudbursts """ super(Cloudbursts, self).__init__() self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded(skyline_app) self.daemon = True self.parent_pid = parent_pid self.current_pid = getpid()
def __init__(self, parent_pid): """ Initialize Luminosity Create the :obj:`redis_conn` a Redis client object Create the :obj:`correlations` list Create the :obj:`mysql_conn` MySQLConnection object Create the :obj:`memcache_client` a constructor that does not make a connection to memcached. The first call to a method on the object will do that. """ super(Luminosity, self).__init__() # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the bytes # types need to be decoded as utf-8 to str # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) # @added 20191030 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Added a single functions to deal with Redis connection and the # charset='utf-8', decode_responses=True arguments required in py3 self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded(skyline_app) self.daemon = True self.parent_pid = parent_pid self.current_pid = getpid() # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage # Task #3032: Debug number of Python processes and memory use # Branch #3002: docker # Reduce amount of Manager instances that are used as each requires a # copy of entire memory to be copied into each subprocess so this # results in a python process per Manager instance, using as much # memory as the parent. OK on a server, not so much in a container. # Disabled all the Manager().list() below and replaced with Redis sets # self.correlations = Manager().list() # @added 20180720 - Task #2462: Implement useful metrics for Luminosity # self.metrics_checked_for_correlation = Manager().list() # self.runtimes = Manager().list() self.mysql_conn = mysql.connector.connect(**config) if settings.MEMCACHE_ENABLED: self.memcache_client = pymemcache_Client( (settings.MEMCACHED_SERVER_IP, settings.MEMCACHED_SERVER_PORT), connect_timeout=0.1, timeout=0.2) else: self.memcache_client = None
def __init__(self, parent_pid):
    super(Worker, self).__init__()
    self.parent_pid = parent_pid
    self.daemon = True
    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
    #                      Branch #3262: py3
    # if settings.REDIS_PASSWORD:
    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
    # else:
    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
    # @added 20191111 - Bug #3266: py3 Redis binary objects not strings
    #                   Branch #3262: py3
    # Added a single functions to deal with Redis connection and the
    # charset='utf-8', decode_responses=True arguments required in py3
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
def __init__(self, queue, parent_pid):
    super(Worker, self).__init__()
    # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
    #                      Branch #3262: py3
    # if settings.REDIS_PASSWORD:
    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
    # else:
    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
    # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
    #                   Branch #3262: py3
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.q = queue
    self.parent_pid = parent_pid
    self.daemon = True
def check_redis_key(current_skyline_app, redis_key, log=True):
    """
    Check a Redis key.

    :param current_skyline_app: the app calling the function
    :param redis_key: the Redis key name
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type redis_key: str
    :type log: boolean
    :return: data
    :rtype: object

    """
    function_str = 'functions.redis.check_redis_key'
    data = None

    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    redis_conn_decoded = None
    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if log:
            current_logger.error(traceback.format_exc())
            current_logger.error('error :: %s :: failed to connect to Redis to get %s - %s' % (
                function_str, redis_key, e))
    if not redis_conn_decoded:
        return data

    try:
        data = redis_conn_decoded.get(redis_key)
    except Exception as e:
        if log:
            current_logger.error(traceback.format_exc())
            current_logger.error('error :: %s :: failed to get Redis key %s - %s' % (
                function_str, redis_key, e))
        data = None

    return data
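# Example usage (a minimal sketch): check_redis_key returns the raw value of a
# key (or None), so callers that expect a Python object typically literal_eval
# the returned string.  The key name and helper below are illustrative only.
from ast import literal_eval

def example_get_cached_dict(current_skyline_app, redis_key='skyline.example.cache_key'):
    """
    Hypothetical helper that fetches a key with check_redis_key and casts the
    string value back to a dict, returning {} if the key is missing or invalid.
    """
    cached = {}
    raw_data = check_redis_key(current_skyline_app, redis_key, log=False)
    if raw_data:
        try:
            cached = literal_eval(raw_data)
        except (ValueError, SyntaxError):
            cached = {}
    return cached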
def get_base_names_and_metric_ids(current_skyline_app):
    """
    Returns a dict of base_names with their metric id from the
    aet.metrics_manager.metric_names_with_ids Redis hash.

    :param current_skyline_app: the app calling the function
    :type current_skyline_app: str
    :return: base_names_with_ids
    :rtype: dict

    """
    base_names_with_ids = {}
    redis_key = 'aet.metrics_manager.metric_names_with_ids'
    function_str = 'functions.metrics.get_base_names_and_metric_ids'

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: get_redis_conn_decoded failed - %s' % (
            current_skyline_app, function_str, e))
        return base_names_with_ids

    try:
        base_names_with_ids = redis_conn_decoded.hgetall(redis_key)
        if base_names_with_ids:
            # Format cast the id str as an int
            for base_name in list(base_names_with_ids.keys()):
                metric_id = int(str(base_names_with_ids[base_name]))
                base_names_with_ids[base_name] = metric_id
    except Exception as err:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: failed to hgetall %s - %s' % (
            current_skyline_app, function_str, redis_key, str(err)))

    return base_names_with_ids
def get_metric_id_from_base_name(current_skyline_app, base_name):
    """
    Returns the metric id for a base_name.

    :param current_skyline_app: the app calling the function
    :param base_name: the base_name of the metric to lookup the metric id for.
    :type current_skyline_app: str
    :type base_name: str
    :return: metric_id
    :rtype: int

    """
    metric_id = 0
    redis_key = 'aet.metrics_manager.metric_names_with_ids'
    function_str = 'functions.metrics.get_metric_id_from_base_name'

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: get_redis_conn_decoded failed - %s' % (
            current_skyline_app, function_str, e))
        return metric_id

    metric_id_str = None
    try:
        metric_id_str = redis_conn_decoded.hget(redis_key, base_name)
        # DEBUG
        current_logger.info('debug :: %s :: %s :: hget(%s, %s)' % (
            current_skyline_app, function_str, redis_key, str(base_name)))
        if metric_id_str:
            metric_id = int(str(metric_id_str))
    except Exception as err:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: failed to get metric_id for %s: %s' % (
            current_skyline_app, function_str, base_name, str(err)))

    return metric_id
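# Example usage (a minimal sketch): the lookups above are inverses of each
# other, both backed by the aet.metrics_manager.metric_names_with_ids Redis
# hash.  The metric name and the 'webapp' app name are illustrative only.
def example_round_trip_lookup(base_name='stats.statsd.graphiteStats.calculationtime'):
    """
    Hypothetical helper demonstrating a name -> id -> name round trip and a
    bulk lookup that avoids one Redis call per metric.
    """
    metric_id = get_metric_id_from_base_name('webapp', base_name)
    looked_up_base_name = None
    if metric_id:
        looked_up_base_name = get_base_name_from_metric_id('webapp', metric_id)
    # When many metrics are involved, fetch the whole map once and look up
    # locally rather than issuing a hget per metric
    base_names_with_ids = get_base_names_and_metric_ids('webapp')
    bulk_metric_id = base_names_with_ids.get(base_name, 0)
    return metric_id, looked_up_base_name, bulk_metric_id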
def __init__(self, parent_pid, skip_mini):
    super(Roomba, self).__init__()
    # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
    # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
    #                      Branch #3262: py3
    # if settings.REDIS_PASSWORD:
    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
    # else:
    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
    # @added 20191030 - Bug #3266: py3 Redis binary objects not strings
    #                   Branch #3262: py3
    # Added a single functions to deal with Redis connection and the
    # charset='utf-8', decode_responses=True arguments required in py3
    self.redis_conn = get_redis_conn(skyline_app)
    self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
    self.daemon = True
    self.parent_pid = parent_pid
    self.skip_mini = skip_mini
def run(self): """ Called when process initializes. """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: os_remove(skyline_app_logwait) except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error( 'error - bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os_remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) logger.info('%s :: started roomba' % skyline_app) while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except: logger.error( '%s :: roomba can\'t connect to redis at socket path %s' % (skyline_app, settings.REDIS_SOCKET_PATH)) sleep(10) # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 if settings.REDIS_PASSWORD: self.redis_conn = StrictRedis( password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) else: self.redis_conn = StrictRedis( unix_socket_path=settings.REDIS_SOCKET_PATH) # @added 20191115 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded(skyline_app) continue # Spawn processes pids = [] for i in range(1, settings.ROOMBA_PROCESSES + 1): if not self.skip_mini: logger.info( '%s :: starting vacuum process on mini namespace' % skyline_app) p = Process(target=self.vacuum, args=(i, settings.MINI_NAMESPACE, settings.MINI_DURATION + settings.ROOMBA_GRACE_TIME)) pids.append(p) p.start() logger.info('%s :: starting vacuum process' % skyline_app) p = Process( target=self.vacuum, args=(i, settings.FULL_NAMESPACE, settings.FULL_DURATION + settings.ROOMBA_GRACE_TIME)) pids.append(p) p.start() # Send wait signal to zombie processes # for p in pids: # p.join() # deroomba - kill any lingering vacuum processes # Changed to manage Roomba processes as edge cases related to I/O # wait have been experienced that resulted in Roomba stalling so a # ROOMBA_TIMEOUT setting was added and here we use the pattern # described by http://stackoverflow.com/users/2073595/dano at # http://stackoverflow.com/a/26064238 to monitor and kill any # stalled processes rather than using p.join(TIMEOUT) - 20160505 # @earthgecko ref 1342 logger.info('%s :: allowing vacuum process/es %s seconds to run' % (skyline_app, str(settings.ROOMBA_TIMEOUT))) start = time() while time() - start <= settings.ROOMBA_TIMEOUT: if any(p.is_alive() for p in pids): # Just to avoid hogging the CPU sleep(.1) else: # All the processes are done, break now. time_to_run = time() - start logger.info('%s :: vacuum processes completed in %.2f' % (skyline_app, time_to_run)) break else: # We only enter this if we didn't 'break' above. 
logger.info('%s :: timed out, killing all Roomba processes' % (skyline_app)) for p in pids: p.terminate() p.join() # sleeping in the main process is more CPU efficient than sleeping # in the vacuum def also roomba is quite CPU intensive so we only # what to run roomba once every minute process_runtime = time() - now roomba_optimum_run_duration = 60 if process_runtime < roomba_optimum_run_duration: sleep_for = (roomba_optimum_run_duration - process_runtime) logger.info('%s :: sleeping %.2f for due to low run time' % (skyline_app, sleep_for)) sleep(sleep_for)
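# The monitor-and-kill pattern used above, in a minimal standalone sketch
# rather than Skyline's implementation: poll the spawned processes until they
# all finish or a timeout is reached, then terminate any that are still
# running.  The function name and arguments are illustrative only.
from time import sleep, time

def example_run_with_timeout(processes, timeout_seconds):
    """
    Return True if all multiprocessing.Process objects in processes finished
    within timeout_seconds, otherwise terminate the stragglers and return False.
    """
    start = time()
    while time() - start <= timeout_seconds:
        if any(p.is_alive() for p in processes):
            # Just to avoid hogging the CPU
            sleep(.1)
        else:
            return True
    # Timed out, kill whatever is still alive
    for p in processes:
        if p.is_alive():
            p.terminate()
        p.join()
    return False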
def run(self): """ Called when the process intializes. """ logger.info('aggregator :: starting aggregator') # Determine a primary aggregator aggregator_pid = getpid() main_process_pid = 0 try: main_process_pid = int( self.redis_conn_decoded.get('flux.main_process_pid')) if main_process_pid: logger.info( 'aggregator :: main_process_pid found in Redis key - %s' % str(main_process_pid)) except: main_process_pid = 0 if not main_process_pid: logger.error( 'error :: aggregator :: no main_process_pid known, exiting') sys.exit(1) primary_aggregator_key = 'flux.primary_aggregator_pid.%s' % str( main_process_pid) logger.info( 'aggregator :: starting primary_aggregator election using primary_aggregator_key: %s' % primary_aggregator_key) sleep_for = random.uniform(0.1, 1.5) logger.info( 'aggregator :: starting primary_aggregator election - sleeping for %s' % str(sleep_for)) sleep(sleep_for) primary_aggregator_pid = 0 try: primary_aggregator_pid = int( self.redis_conn_decoded.get(primary_aggregator_key)) if primary_aggregator_pid: logger.info( 'aggregator :: primary_aggregator_pid found in Redis key - %s' % str(primary_aggregator_pid)) except: primary_aggregator_pid = 0 if not primary_aggregator_pid: try: self.redis_conn.setex(primary_aggregator_key, 300, aggregator_pid) primary_aggregator_pid = int( self.redis_conn_decoded.get(primary_aggregator_key)) logger.info( 'aggregator :: set self pid to primary_aggregator - %s' % str(primary_aggregator_pid)) except: primary_aggregator_pid = 0 primary_aggregator = False if primary_aggregator_pid == aggregator_pid: primary_aggregator = True logger.info( 'aggregator :: primary_aggregator_pid is set to %s, primary_aggregator: %s' % (str(primary_aggregator_pid), str(primary_aggregator))) last_flush = int(time()) - 59 remove_from_flux_queue_redis_set = [] # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error( 'aggregator :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) try: self.redis_conn = get_redis_conn(skyline_app) except Exception as e: logger.error( 'error :: aggregator :: could not get_redis_conn - %s' % str(e)) try: self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) except Exception as e: logger.error( 'error :: aggregator :: could not get_redis_conn_decoded - %s' % str(e)) try: time_now = int(time()) while (time_now - last_flush) <= 59: sleep(1) remove_from_flux_queue_redis_set = [] time_now = int(time()) primary_aggregator_pid = 0 try: primary_aggregator_pid = int( self.redis_conn_decoded.get(primary_aggregator_key)) if primary_aggregator_pid: logger.info( 'aggregator :: primary_aggregator_pid found in Redis key - %s' % str(primary_aggregator_pid)) except: primary_aggregator_pid = 0 if not primary_aggregator_pid: try: self.redis_conn.setex(primary_aggregator_key, 300, aggregator_pid) primary_aggregator_pid = int( self.redis_conn_decoded.get( primary_aggregator_key)) logger.info( 'aggregator :: set self pid to primary_aggregator - %s' % str(primary_aggregator_pid)) except: primary_aggregator_pid = 0 primary_aggregator = False if primary_aggregator_pid == aggregator_pid: primary_aggregator = True logger.info( 'aggregator :: primary_aggregator_pid is set to %s, primary_aggregator: %s' % (str(primary_aggregator_pid), str(primary_aggregator))) flux_aggregator_queue = [] if primary_aggregator: 
logger.info('aggregator :: checking for data to aggregate') try: flux_aggregator_queue = self.redis_conn_decoded.smembers( 'flux.aggregator.queue') logger.info( 'aggregator :: %s entries in flux.aggregator.queue to process' % str(len(flux_aggregator_queue))) except: logger.error(traceback.format_exc()) logger.error( 'error :: could not get the flux.aggregator.queue set from Redis' ) else: logger.info( 'aggregator :: not primary, in standby to take over should the primary_aggregator fail' ) flux_aggregator_queue_items = [] all_metrics = [] if flux_aggregator_queue: for flux_aggregator_queue_item_str in flux_aggregator_queue: try: flux_aggregator_queue_item = literal_eval( flux_aggregator_queue_item_str) all_metrics.append(flux_aggregator_queue_item[0]) flux_aggregator_queue_items.append([ flux_aggregator_queue_item, flux_aggregator_queue_item_str ]) # self.redis_conn.srem('flux.aggregator.queue', flux_aggregator_queue_item_str) except: logger.error(traceback.format_exc()) logger.error( 'error :: failed to evaluate item from flux.aggregator.queue Redis set' ) metrics = list(set(all_metrics)) for metric in metrics: last_metric_flush = last_flush last_metric_flush_str = None try: last_metric_flush_str = self.redis_conn_decoded.hget( 'flux.aggregate_metrics.last_flush', metric) # Handle new metric without throwing an error if they do # not have an entry in the hash if last_metric_flush_str: last_metric_flush = int(last_metric_flush_str) except: logger.error(traceback.format_exc()) logger.error( 'error :: failed convert last_metric_flush_str value to an int from flux.aggregate_metrics.last_flush Redis hash for %s' % metric) if not last_metric_flush: # Handle new metric without throwing an error if they do # not have an entry in the hash logger.info( 'aggregator :: probable new metric - no last_metric_flush found in flux.aggregate_metrics.last_flush Redis hash for %s using last_flush' % metric) last_metric_flush = last_flush metric_aggregation_settings = {} try: metric_aggregation_settings_str = self.redis_conn_decoded.hget( 'metrics_manager.flux.aggregate_namespaces.settings', metric) # @modified 20210718 if metric_aggregation_settings_str: metric_aggregation_settings = literal_eval( metric_aggregation_settings_str) else: metric_aggregation_settings = {} except: logger.error(traceback.format_exc()) logger.error( 'error :: failed to determine aggregation_settings from metrics_manager.flux.aggregate_namespaces.settings Redis hash for %s' % metric) # @added 20210718 # Handle newly added metrics that have not been added to # metrics_manager.flux.aggregate_namespaces.settings due to # to the chicken or the egg problem if not metric_aggregation_settings: logger.info( 'aggregator :: probable new metric - %s not found in metrics_manager.flux.aggregate_namespaces.settings Redis hash' % metric) aggregate_namespaces = list( settings.FLUX_AGGREGATE_NAMESPACES.keys()) pattern_match, metric_matched_by = matched_or_regexed_in_list( 'flux', metric, aggregate_namespaces) if pattern_match: matched_namespace = metric_matched_by[ 'matched_namespace'] metric_aggregation_settings = settings.FLUX_AGGREGATE_NAMESPACES[ matched_namespace] logger.info( 'aggregator :: new metric - %s detemined metric_aggregation_settings from FLUX_AGGREGATE_NAMESPACES - %s' % (metric, str(metric_aggregation_settings))) else: logger.error( 'error :: aggregator :: new metric - %s could not detemine metric_aggregation_settings from FLUX_AGGREGATE_NAMESPACES' % (metric)) interval = 60 try: interval = 
int(metric_aggregation_settings['interval']) except: # logger.error(traceback.format_exc()) logger.error( 'error :: failed to get interval from metric_aggregation_settings for %s, setting to default 60' % metric) interval = 60 if (time_now - last_metric_flush) < interval: continue metric_values = [] for flux_aggregator_queue_item in flux_aggregator_queue_items: if flux_aggregator_queue_item[0][0] != metric: continue # Discard any values older than the last metric flush if int(flux_aggregator_queue_item[0] [2]) > last_metric_flush: metric_values.append( flux_aggregator_queue_item[0][1]) try: self.redis_conn.srem('flux.aggregator.queue', flux_aggregator_queue_item[1]) remove_from_flux_queue_redis_set.append( flux_aggregator_queue_item[1]) except: logger.error(traceback.format_exc()) logger.error( 'error :: failed to remove item from flux.aggregator.queue Redis set - %s' % str(flux_aggregator_queue_item[1])) if not metric_aggregation_settings: logger.error( 'error :: no aggregation settings known for %s, discarding data' % metric) continue if metric_values: methods = [] try: methods = metric_aggregation_settings['method'] except: logger.error(traceback.format_exc()) logger.error( 'error :: failed to determine aggregation methods from metric_aggregation_settings - %s' % str(metric_aggregation_settings)) methods = [] for method in methods: try: metric_namespace = metric if metric_aggregation_settings[ 'method_suffix']: metric_namespace = '%s.%s' % (metric, method) else: # @added 20220126 - Feature #4400: flux - quota # If method_suffix is not set but multiple # methods are being used, method_suffix # must be applied, otherwise the metric will # have all the method values submitted to a # single metric name. if len(methods) > 1: metric_namespace = '%s.%s' % (metric, method) aggregate_value = None if method == 'avg': if len(metric_values) > 1: aggregate_value = sum( metric_values) / len(metric_values) else: aggregate_value = metric_values[0] if method == 'sum': aggregate_value = sum(metric_values) if method == 'max': aggregate_value = max(metric_values) if method == 'min': aggregate_value = min(metric_values) if aggregate_value is not None: try: backfill = False metric_data = [ metric_namespace, aggregate_value, (time_now - interval), backfill ] flux.httpMetricDataQueue.put( metric_data, block=False) logger.info('aggregator :: added %s' % (str(metric_data))) try: self.redis_conn.hset( 'flux.aggregate_metrics.last_flush', metric, time_now) except: logger.error( traceback.format_exc()) logger.error( 'error :: aggregator :: failed to set last metric flush time in Redis hash flux.aggregate_metrics.last_flush' ) except: logger.error(traceback.format_exc()) logger.error( 'error :: aggregator :: failed to add aggregator data to flux.httpMetricDataQueue - %s' % str(metric_data)) except: logger.error(traceback.format_exc()) logger.error( 'error :: aggregator :: failed to aggregate metric_values by a method for %s' % str(metric)) last_flush = time_now # flux_zero_fill_metrics = list(self.redis_conn_decoded.smembers('flux.zero_fill_metrics')) if FLUX_PERSIST_QUEUE: redis_set_size = 0 try: redis_set_size = self.redis_conn.scard('flux.queue') except: logger.error(traceback.format_exc()) logger.error( 'error :: aggregator :: failed to determine size of flux.queue Redis set' ) logger.info( 'aggregator :: flux.queue Redis set size of %s before removal of %s items' % (str(redis_set_size), str(len(remove_from_flux_queue_redis_set)))) if remove_from_flux_queue_redis_set: try: self.redis_conn.srem( 'flux.queue', 
*set(remove_from_flux_queue_redis_set)) remove_from_flux_queue_redis_set = [] except: logger.error(traceback.format_exc()) logger.error( 'error :: aggregator :: failed to remove multiple items from flux.queue Redis set' ) try: redis_set_size = self.redis_conn.scard( 'flux.queue') except: logger.error(traceback.format_exc()) logger.error( 'error :: aggregator :: failed to determine size of flux.queue Redis set' ) logger.info( 'aggregator :: flux.queue Redis set size of %s after the removal of items' % (str(redis_set_size))) remove_from_flux_queue_redis_set = [] if primary_aggregator: try: self.redis_conn.setex(primary_aggregator_key, 300, aggregator_pid) primary_aggregator_pid = int( self.redis_conn_decoded.get( primary_aggregator_key)) logger.info( 'aggregator :: set self pid to primary_aggregator - %s' % str(primary_aggregator_pid)) logger.info( 'aggregator :: set Redis primary_aggregator_key key to self pid to primary_aggregator - %s' % str(primary_aggregator_pid)) except Exception as e: logger.error( 'error :: aggregator :: failed to set Redis primary_aggregator_key key to self pid - %s' % (str(e))) except NotImplementedError: pass except KeyboardInterrupt: logger.info( 'aggregator :: server has been issued a user signal to terminate - KeyboardInterrupt' ) except SystemExit: logger.info( 'aggregator :: server was interrupted - SystemExit') except Exception as e: logger.error(traceback.format_exc()) logger.error('error :: aggregator :: %s' % (str(e)))
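# A minimal standalone sketch of the per-method aggregation applied in the run
# loop above, assuming a list of values flushed for a single metric and a
# method string from the metric's aggregation settings ('avg', 'sum', 'max' or
# 'min').  Illustrative only, not part of flux.
def example_aggregate(metric_values, method):
    """
    Return the aggregate of metric_values for the given method, or None if the
    method is unknown or there are no values to aggregate.
    """
    if not metric_values:
        return None
    if method == 'avg':
        return sum(metric_values) / len(metric_values)
    if method == 'sum':
        return sum(metric_values)
    if method == 'max':
        return max(metric_values)
    if method == 'min':
        return min(metric_values)
    return None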
def adtk_level_shift(current_skyline_app, parent_pid, timeseries, algorithm_parameters):
    """
    A timeseries is anomalous if a level shift occurs in a 5 window period
    bound by a factor of 9 of the normal range based on historical
    interquartile range.

    :param current_skyline_app: the Skyline app executing the algorithm.  This
        will be passed to the algorithm by Skyline.  This is **required** for
        error handling and logging.  You do not have to worry about handling
        the argument in the scope of the custom algorithm itself, but the
        algorithm must accept it as the first argument.
    :param parent_pid: the parent pid which is executing the algorithm, this
        is **required** for error handling and logging.  You do not have to
        worry about handling this argument in the scope of the algorithm, but
        the algorithm must accept it as the second argument.
    :param timeseries: the time series as a list e.g.
        ``[[1578916800.0, 29.0], [1578920400.0, 55.0], ... [1580353200.0, 55.0]]``
    :param algorithm_parameters: a dictionary of any required parameters for
        the custom_algorithm and algorithm itself.  For this custom algorithm
        the following parameters are required, example:
        ``algorithm_parameters={
            'c': 9.0,
            'run_every': 5,
            'side': 'both',
            'window': 5
        }``
    :type current_skyline_app: str
    :type parent_pid: int
    :type timeseries: list
    :type algorithm_parameters: dict
    :return: True, False or None
    :rtype: boolean

    Performance is of paramount importance in Skyline, especially in terms of
    computational complexity, along with execution time and CPU usage.  The
    adtk LevelShiftAD algorithm is not O(n) and it is not fast either, not
    when compared to the normal three-sigma triggered algorithms.  However it
    is useful if you care about detecting all level shifts.  The normal
    three-sigma triggered algorithms do not always detect a level shift,
    especially if the level shift does not breach the three-sigma limits.
    Therefore you may find over time that you encounter alerts that contain
    level shifts that you thought should have been detected.  On these types
    of metrics and events, the adtk LevelShiftAD algorithm can be implemented
    to detect and alert on these.  It is not recommended to run on all your
    metrics as it would immediately triple the analyzer runtime, even if only
    run every 5 windows/minutes.

    Due to the computational complexity and long run time of the adtk
    LevelShiftAD algorithm on the size of timeseries data used by Skyline, if
    you consider the following timings of all three-sigma triggered algorithms
    and compare them to the adtk_level_shift results in the last 2 rows of the
    below log, it is clear that running adtk_level_shift on all metrics is
    probably not desirable, even if it is possible to do, it is very noisy.
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - histogram_bins run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - histogram_bins has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - histogram_bins - total: 1.051136 - median: 0.001430
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - first_hour_average run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - first_hour_average has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - first_hour_average - total: 1.322432 - median: 0.001835
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - stddev_from_average run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - stddev_from_average has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - stddev_from_average - total: 1.097290 - median: 0.001641
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - grubbs run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - grubbs has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - grubbs - total: 1.742929 - median: 0.002438
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - ks_test run 147 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - ks_test has 147 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - ks_test - total: 0.127648 - median: 0.000529
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - mean_subtraction_cumulation run 40 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - mean_subtraction_cumulation has 40 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - mean_subtraction_cumulation - total: 0.152515 - median: 0.003152
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - median_absolute_deviation run 35 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - median_absolute_deviation has 35 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - median_absolute_deviation - total: 0.143770 - median: 0.003248
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - stddev_from_moving_average run 30 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - stddev_from_moving_average has 30 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - stddev_from_moving_average - total: 0.125173 - median: 0.003092
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - least_squares run 16 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - least_squares has 16 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - least_squares - total: 0.089108 - median: 0.005538
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - abs_stddev_from_median run 1 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - abs_stddev_from_median has 1 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - abs_stddev_from_median - total: 0.036797 - median: 0.036797
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - adtk_level_shift run 271 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - adtk_level_shift has 271 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - adtk_level_shift - total: 13.729565 - median: 0.035791
    ...
    ...
    2021-03-06 10:46:39 :: 1582754 :: seconds to run :: 27.93  # THE TOTAL ANALYZER RUNTIME

    Therefore the analysis methodology implemented for the adtk_level_shift
    custom_algorithm is as follows:

    - When new metrics are added either to the configuration or by actual new
      metrics coming online that match the ``algorithm_parameters['namespace']``,
      Skyline implements sharding on new metrics into time slots to prevent a
      thundering herd situation from developing.  A newly added metric will
      eventually be assigned into a time shard and be added, and the last
      analysed timestamp will be added to the ``analyzer.last.adtk_level_shift``
      Redis hash key to determine the next scheduled run with
      ``algorithm_parameters['namespace']``
    - A ``run_every`` parameter is implemented so that the algorithm can be
      configured to run on a metric once every ``run_every`` minutes.  The
      default is to run it every 5 minutes using window 5 (rolling) and
      trigger as anomalous if the algorithm labels any of the last 5
      datapoints as anomalous.  This means that there could be up to a 5
      minute delay on an alert on the 60 second, 168
      SECOND_ORDER_RESOLUTION_HOURS metrics in the example, but a ``c=9.0``
      level shift would be detected and would be alerted on (if both analyzer
      and mirage triggered on it).  This periodic running of the algorithm is
      a tradeoff so that the adtk_level_shift load and runtime can be spread
      over ``run_every`` minutes.
    - The algorithm is not run against metrics that are sparsely populated.
      When the algorithm is run on sparsely populated metrics it results in
      lots of false positives and noise.

    The Skyline CUSTOM_ALGORITHMS implementation of the adtk LevelShiftAD
    algorithm is configured as the example shown below.  However please note
    that the algorithm_parameters shown in this example configuration are
    suitable for metrics that have a 60 second resolution and have a
    :mod:`settings.ALERTS` Mirage SECOND_ORDER_RESOLUTION_HOURS of 168 (7 days).
    Metrics with a different resolution/frequency may require different values
    appropriate for the metric resolution.

    Example CUSTOM_ALGORITHMS configuration:

    'adtk_level_shift': {
        'namespaces': [
            'skyline.analyzer.run_time', 'skyline.analyzer.total_metrics',
            'skyline.analyzer.exceptions'
        ],
        'algorithm_source': '/opt/skyline/github/skyline/skyline/custom_algorithms/adtk_level_shift.py',
        'algorithm_parameters': {'c': 9.0, 'run_every': 5, 'side': 'both', 'window': 5},
        'max_execution_time': 0.5,
        'consensus': 1,
        'algorithms_allowed_in_consensus': ['adtk_level_shift'],
        'run_3sigma_algorithms': True,
        'run_before_3sigma': True,
        'run_only_if_consensus': False,
        'use_with': ["analyzer", "mirage"],
        'debug_logging': False,
    },

    """

    # You MUST define the algorithm_name
    algorithm_name = 'adtk_level_shift'

    # Define the default state of None and None, anomalous does not default to
    # False as that is not correct, False is only correct if the algorithm
    # determines the data point is not anomalous.  The same is true for the
    # anomalyScore.
anomalous = None anomalyScore = None # @aded 20210308 - Feature #3978: luminosity - classify_metrics # Feature #3642: Anomaly type classification return_anomalies = False anomalies = [] realtime_analysis = True current_logger = None # If you wanted to log, you can but this should only be done during # testing and development def get_log(current_skyline_app): current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) return current_logger start = timer() # Use the algorithm_parameters to determine the sample_period debug_logging = None try: debug_logging = algorithm_parameters['debug_logging'] except: debug_logging = False if debug_logging: try: current_logger = get_log(current_skyline_app) current_logger.debug('debug :: %s :: debug_logging enabled with algorithm_parameters - %s' % ( algorithm_name, str(algorithm_parameters))) except: # This except pattern MUST be used in ALL custom algortihms to # facilitate the traceback from any errors. The algorithm we want to # run super fast and without spamming the log with lots of errors. # But we do not want the function returning and not reporting # anything to the log, so the pythonic except is used to "sample" any # algorithm errors to a tmp file and report once per run rather than # spewing tons of errors into the log e.g. analyzer.log record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) # Return None and None as the algorithm could not determine True or False return (False, None) # Allow the LevelShiftAD window parameter to be passed in the # algorithm_parameters window = 5 try: window = algorithm_parameters['window'] except: pass # Allow the LevelShiftAD c parameter to be passed in the # algorithm_parameters c = 9.0 try: c = algorithm_parameters['c'] except: pass run_every = window try: run_every = algorithm_parameters['run_every'] except: pass side = 'both' try: side = algorithm_parameters['side'] except: pass if debug_logging: current_logger.debug('debug :: algorithm_parameters :: %s' % ( str(algorithm_parameters))) # @added 20210308 - Feature #3978: luminosity - classify_metrics # Feature #3642: Anomaly type classification try: return_anomalies = algorithm_parameters['return_anomalies'] except: return_anomalies = False try: realtime_analysis = algorithm_parameters['realtime_analysis'] except: realtime_analysis = True # @added 20210316 - Feature #3978: luminosity - classify_metrics # Feature #3642: Anomaly type classification save_plots_to = False try: save_plots_to = algorithm_parameters['save_plots_to'] except: pass # @added 20210323 - Feature #3978: luminosity - classify_metrics # Feature #3642: Anomaly type classification save_plots_to_absolute_dir = False try: save_plots_to_absolute_dir = algorithm_parameters['save_plots_to_absolute_dir'] except: pass filename_prefix = False try: filename_prefix = algorithm_parameters['filename_prefix'] except: pass # @added 20210318 - Feature #3978: luminosity - classify_metrics # Feature #3642: Anomaly type classification run_PersistAD = False try: run_PersistAD = algorithm_parameters['run_PersistAD'] except: pass if debug_logging: current_logger.debug('debug :: algorithm_parameters :: %s' % ( str(algorithm_parameters))) try: base_name = algorithm_parameters['base_name'] except: # This except pattern MUST be used in ALL custom algortihms to # facilitate the traceback from any errors. The algorithm we want to # run super fast and without spamming the log with lots of errors. 
# But we do not want the function returning and not reporting # anything to the log, so the pythonic except is used to "sample" any # algorithm errors to a tmp file and report once per run rather than # spewing tons of errors into the log e.g. analyzer.log record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) # Return None and None as the algorithm could not determine True or False if return_anomalies: return (False, None, anomalies) else: return (False, None) if debug_logging: current_logger.debug('debug :: %s :: base_name - %s' % ( algorithm_name, str(base_name))) # Due to the load and runtime of LevelShiftAD it is only run in analyzer # periodically if current_skyline_app == 'analyzer': redis_conn_decoded = get_redis_conn_decoded(current_skyline_app) last_hash_key = 'analyzer.last.%s' % algorithm_name last_check = None try: raw_last_check = redis_conn_decoded.hget(last_hash_key, base_name) last_check = int(raw_last_check) except: last_check = None last_window_timestamps = [int(item[0]) for item in timeseries[-run_every:]] if last_check in last_window_timestamps: if debug_logging: current_logger.debug('debug :: %s :: run_every period is not over yet, skipping base_name - %s' % ( algorithm_name, str(base_name))) if return_anomalies: return (False, None, anomalies) else: return (False, None) # If there is no last timestamp, shard the metric, it will eventually # be added. if not last_check: now = datetime.datetime.now() now_seconds = int(now.second) if now_seconds == 0: now_seconds = 1 period_seconds = int(60 / run_every) shard = int(period_seconds) last_shard = 60 shard = int(period_seconds) shards = [shard] while shard < last_shard: shard = shard + period_seconds shards.append((shard)) shard_value = round(now_seconds / shards[0]) * shards[0] if shard_value <= shards[0]: shard_value = shards[0] metric_as_bytes = str(base_name).encode() value = zlib.adler32(metric_as_bytes) shard_index = [(index + 1) for index, s_value in enumerate(shards) if s_value == shard_value][0] modulo_result = value % shard_index if modulo_result == 0: if debug_logging: current_logger.debug('debug :: %s :: skipping as not sharded into this run - %s' % ( algorithm_name, str(base_name))) if return_anomalies: return (False, None, anomalies) else: return (False, None) if debug_logging: current_logger.debug('debug :: %s :: analysing %s' % ( algorithm_name, str(base_name))) try: int_metric_timestamp = int(timeseries[-1][0]) except: int_metric_timestamp = 0 if int_metric_timestamp: try: redis_conn_decoded.hset( last_hash_key, base_name, int_metric_timestamp) except: pass # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except try: start_preprocessing = timer() # INFO: Sorting time series of 10079 data points took 0.002215 seconds timeseries = sorted(timeseries, key=lambda x: x[0]) if debug_logging: current_logger.debug('debug :: %s :: time series of length - %s' % ( algorithm_name, str(len(timeseries)))) # Testing the data to ensure it meets minimum requirements, in the case # of Skyline's use of the LevelShiftAD algorithm this means that: # - the time series must have at least 75% of its full_duration # - the time series must have at least 99% of the data points for the # in the sample being analysed. 
do_not_use_sparse_data = False if current_skyline_app == 'analyzer': do_not_use_sparse_data = True # @added 20210305 - Feature #3970: custom_algorithm - adtk_level_shift # Task #3664:: POC with adtk # With mirage also do not run LevelShiftAD on sparsely populated data if current_skyline_app == 'mirage': do_not_use_sparse_data = True # @aded 20210309 - Feature #3978: luminosity - classify_metrics # Feature #3642: Anomaly type classification if current_skyline_app == 'luminosity': do_not_use_sparse_data = True if do_not_use_sparse_data: total_period = 0 total_datapoints = 0 try: start_timestamp = int(timeseries[0][0]) end_timestamp = int(timeseries[-1][0]) total_period = end_timestamp - start_timestamp total_datapoints = len(timeseries) except SystemExit as e: if debug_logging: current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % ( algorithm_name, e)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) except: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error('error :: debug_logging :: %s :: failed to determine total_period and total_datapoints' % ( algorithm_name)) timeseries = [] if not timeseries: if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) if current_skyline_app == 'analyzer': # Default for analyzer at required period to 18 hours period_required = int(FULL_DURATION * 0.75) else: # Determine from timeseries if total_period < FULL_DURATION: period_required = int(FULL_DURATION * 0.75) else: period_required = int(total_period * 0.75) # If the time series does not have 75% of its full_duration it does not # have sufficient data to sample try: if total_period < period_required: if debug_logging: current_logger.debug('debug :: %s :: time series does not have sufficient data' % ( algorithm_name)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) except SystemExit as e: if debug_logging: current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % ( algorithm_name, e)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) except: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error('error :: debug_logging :: %s :: falied to determine if time series has sufficient data' % ( algorithm_name)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) # If the time series does not have 75% of its full_duration data points # it does not have sufficient data to sample # Determine resolution from the last 30 data points # INFO took 0.002060 seconds resolution_timestamps = [] metric_resolution = False for metric_datapoint in timeseries[-30:]: timestamp = int(metric_datapoint[0]) resolution_timestamps.append(timestamp) timestamp_resolutions = [] if resolution_timestamps: last_timestamp = None for timestamp in resolution_timestamps: if last_timestamp: resolution = timestamp - last_timestamp timestamp_resolutions.append(resolution) last_timestamp = timestamp else: last_timestamp = timestamp try: del resolution_timestamps except: pass if timestamp_resolutions: try: timestamp_resolutions_count = 
Counter(timestamp_resolutions) ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common() metric_resolution = int(ordered_timestamp_resolutions_count[0][0]) except SystemExit as e: if debug_logging: current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % ( algorithm_name, e)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) except: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error('error :: debug_logging :: %s :: failed to determine if time series has sufficient data' % ( algorithm_name)) try: del timestamp_resolutions except: pass minimum_datapoints = None if metric_resolution: minimum_datapoints = int(period_required / metric_resolution) if minimum_datapoints: if total_datapoints < minimum_datapoints: if debug_logging: current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_datapoints required is %s and time series has %s' % ( algorithm_name, str(minimum_datapoints), str(total_datapoints))) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) # Is the time series fully populated? # full_duration_datapoints = int(full_duration / metric_resolution) total_period_datapoints = int(total_period / metric_resolution) # minimum_percentage_sparsity = 95 minimum_percentage_sparsity = 90 sparsity = int(total_datapoints / (total_period_datapoints / 100)) if sparsity < minimum_percentage_sparsity: if debug_logging: current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_percentage_sparsity required is %s and time series has %s' % ( algorithm_name, str(minimum_percentage_sparsity), str(sparsity))) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) if len(set(item[1] for item in timeseries)) == 1: if debug_logging: current_logger.debug('debug :: %s :: time series does not have sufficient variability, all the values are the same' % algorithm_name) anomalous = False anomalyScore = 0.0 if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) end_preprocessing = timer() preprocessing_runtime = end_preprocessing - start_preprocessing if debug_logging: current_logger.debug('debug :: %s :: preprocessing took %.6f seconds' % ( algorithm_name, preprocessing_runtime)) if not timeseries: if debug_logging: current_logger.debug('debug :: %s :: LevelShiftAD not run as no data' % ( algorithm_name)) anomalies = [] if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) else: if debug_logging: current_logger.debug('debug :: %s :: timeseries length: %s' % ( algorithm_name, str(len(timeseries)))) if len(timeseries) < 100: if debug_logging: current_logger.debug('debug :: %s :: time series does not have sufficient data' % ( algorithm_name)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) start_analysis = timer() try: df = pd.DataFrame(timeseries, columns=['date', 'value']) df['date'] = pd.to_datetime(df['date'], unit='s') datetime_index = pd.DatetimeIndex(df['date'].values) df = df.set_index(datetime_index) df.drop('date', axis=1, inplace=True) s = validate_series(df) level_shift_ad = LevelShiftAD(c=c, side=side, window=window) anomaly_df = 
level_shift_ad.fit_detect(s) anomalies = anomaly_df.loc[anomaly_df['value'] > 0] anomalous = False if len(anomalies) > 0: anomaly_timestamps = list(anomalies.index.astype(np.int64) // 10**9) if realtime_analysis: last_window_timestamps = [int(item[0]) for item in timeseries[-window:]] # if timeseries[-1][0] in anomaly_timestamps: for timestamp in last_window_timestamps: if timestamp in anomaly_timestamps: anomalous = True break else: anomalous = True # Convert anomalies dataframe to anomalies_list anomalies_list = [] # @added 20210316 - Feature #3978: luminosity - classify_metrics # Feature #3642: Anomaly type classification # Convert anomalies dataframe to anomalies_dict anomalies_dict = {} anomalies_dict['metric'] = base_name anomalies_dict['timestamp'] = int(timeseries[-1][0]) anomalies_dict['from_timestamp'] = int(timeseries[0][0]) anomalies_dict['algorithm'] = algorithm_name anomalies_dict['anomalies'] = {} for ts, value in timeseries: if int(ts) in anomaly_timestamps: anomalies_list.append([int(ts), value]) anomalies_dict['anomalies'][int(ts)] = value anomalies = list(anomalies_list) # @added 20210316 - Feature #3978: luminosity - classify_metrics # Feature #3642: Anomaly type classification if save_plots_to: try: from adtk.visualization import plot metric_dir = base_name.replace('.', '/') timestamp_dir = str(int(timeseries[-1][0])) save_path = '%s/%s/%s/%s' % ( save_plots_to, algorithm_name, metric_dir, timestamp_dir) if save_plots_to_absolute_dir: save_path = '%s' % save_plots_to anomalies_dict['file_path'] = save_path save_to_file = '%s/%s.%s.png' % ( save_path, algorithm_name, base_name) if filename_prefix: save_to_file = '%s/%s.%s.%s.png' % ( save_path, filename_prefix, algorithm_name, base_name) save_to_path = os_path_dirname(save_to_file) title = '%s\n%s' % (algorithm_name, base_name) if not os_path_exists(save_to_path): try: mkdir_p(save_to_path) except Exception as e: current_logger.error('error :: %s :: failed to create dir - %s - %s' % ( algorithm_name, save_to_path, e)) if os_path_exists(save_to_path): try: plot(s, anomaly=anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file) if debug_logging: current_logger.debug('debug :: %s :: plot saved to - %s' % ( algorithm_name, save_to_file)) except Exception as e: current_logger.error('error :: %s :: failed to plot - %s - %s' % ( algorithm_name, base_name, e)) anomalies_file = '%s/%s.%s.anomalies_list.txt' % ( save_path, algorithm_name, base_name) with open(anomalies_file, 'w') as fh: fh.write(str(anomalies_list)) # os.chmod(anomalies_file, mode=0o644) data_file = '%s/data.txt' % (save_path) with open(data_file, 'w') as fh: fh.write(str(anomalies_dict)) except SystemExit as e: if debug_logging: current_logger.debug('debug_logging :: %s :: SystemExit called during save plot, exiting - %s' % ( algorithm_name, e)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) except Exception as e: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error('error :: %s :: failed to plot or save anomalies file - %s - %s' % ( algorithm_name, base_name, e)) else: anomalies = [] # @added 20210318 - Feature #3978: luminosity - classify_metrics # Feature #3642: Anomaly type classification if anomalies and run_PersistAD and not realtime_analysis: persist_ad_algorithm_parameters = {} try: persist_ad_algorithm_parameters = 
algorithm_parameters['persist_ad_algorithm_parameters'] except: pass persist_ad_window = 20 try: persist_ad_window = persist_ad_algorithm_parameters['window'] except: pass persist_ad_c = 9.9 try: persist_ad_c = persist_ad_algorithm_parameters['c'] except: pass try: from adtk.detector import PersistAD persist_ad = PersistAD(c=persist_ad_c, side='both', window=persist_ad_window) persist_ad_anomaly_df = persist_ad.fit_detect(s) persist_ad_anomalies = persist_ad_anomaly_df.loc[persist_ad_anomaly_df['value'] > 0] if len(persist_ad_anomalies) > 0: current_logger.info('%s :: %s anomalies found with PersistAD on %s' % ( algorithm_name, str(len(persist_ad_anomalies)), base_name)) persist_ad_anomaly_timestamps = list(persist_ad_anomalies.index.astype(np.int64) // 10**9) # Convert persist_ad_anomalies dataframe to persist_ad_anomalies_list persist_ad_anomalies_list = [] persist_ad_anomalies_dict = {} persist_ad_anomalies_dict['metric'] = base_name persist_ad_anomalies_dict['timestamp'] = int(timeseries[-1][0]) persist_ad_anomalies_dict['from_timestamp'] = int(timeseries[0][0]) persist_ad_anomalies_dict['algorithm'] = 'adtk_PersistAD' persist_ad_anomalies_dict['anomalies'] = {} for ts, value in timeseries: if int(ts) in persist_ad_anomaly_timestamps: persist_ad_anomalies_list.append([int(ts), value]) persist_ad_anomalies_dict['anomalies'][int(ts)] = value persist_ad_anomalies = list(persist_ad_anomalies_list) if save_plots_to: try: from adtk.visualization import plot metric_dir = base_name.replace('.', '/') timestamp_dir = str(int(timeseries[-1][0])) save_path = '%s/%s/%s/%s' % ( save_plots_to, algorithm_name, metric_dir, timestamp_dir) if save_plots_to_absolute_dir: save_path = '%s' % save_plots_to persist_ad_anomalies_dict['file_path'] = save_path save_to_file = '%s/%s.PersistAD.%s.png' % ( save_path, algorithm_name, base_name) if filename_prefix: save_to_file = '%s/%s.%s.%s.png' % ( save_path, filename_prefix, algorithm_name, base_name) save_to_path = os_path_dirname(save_to_file) title = '%s - PersistAD verification\n%s' % (algorithm_name, base_name) if not os_path_exists(save_to_path): try: mkdir_p(save_to_path) except Exception as e: current_logger.error('error :: %s :: failed to create dir - %s - %s' % ( algorithm_name, save_to_path, e)) if os_path_exists(save_to_path): try: plot(s, anomaly=persist_ad_anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file) if debug_logging: current_logger.debug('debug :: %s :: plot saved to - %s' % ( algorithm_name, save_to_file)) except Exception as e: current_logger.error('error :: %s :: failed to plot - %s - %s' % ( algorithm_name, base_name, e)) anomalies_file = '%s/%s.%s.PersistAD.anomalies_list.txt' % ( save_path, algorithm_name, base_name) with open(anomalies_file, 'w') as fh: fh.write(str(persist_ad_anomalies)) # os.chmod(anomalies_file, mode=0o644) data_file = '%s/PersistAD.data.txt' % (save_path) with open(data_file, 'w') as fh: fh.write(str(persist_ad_anomalies_dict)) except Exception as e: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error('error :: %s :: failed to plot or save PersistAD anomalies file - %s - %s' % ( algorithm_name, base_name, e)) except Exception as e: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error('error :: %s :: 
failed to analyse with PersistAD - %s - %s' % ( algorithm_name, base_name, e)) try: del df except: pass except SystemExit as e: if debug_logging: current_logger.debug('debug_logging :: %s :: SystemExit called, during analysis, exiting - %s' % ( algorithm_name, e)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) except: traceback_msg = traceback.format_exc() record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg) if debug_logging: current_logger.error(traceback_msg) current_logger.error('error :: debug_logging :: %s :: failed to run on ts' % ( algorithm_name)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) end_analysis = timer() analysis_runtime = end_analysis - start_analysis if debug_logging: current_logger.debug('debug :: %s :: LevelShiftAD took %.6f seconds' % ( algorithm_name, analysis_runtime)) if anomalous: anomalyScore = 1.0 else: anomalyScore = 0.0 if debug_logging: current_logger.info('%s :: anomalous - %s, anomalyScore - %s' % ( algorithm_name, str(anomalous), str(anomalyScore))) if debug_logging: end = timer() processing_runtime = end - start current_logger.info('%s :: completed analysis in %.6f seconds' % ( algorithm_name, processing_runtime)) try: del timeseries except: pass if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) except SystemExit as e: if debug_logging: current_logger.debug('debug_logging :: %s :: SystemExit called (before StopIteration), exiting - %s' % ( algorithm_name, e)) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore) except StopIteration: # This except pattern MUST be used in ALL custom algorithms to # facilitate the traceback from any errors. We want the algorithm to # run super fast and without spamming the log with lots of errors. # But we do not want the function returning and not reporting # anything to the log, so the pythonic except is used to "sample" any # algorithm errors to a tmp file and report once per run rather than # spewing tons of errors into the log e.g. analyzer.log if return_anomalies: return (False, None, anomalies) else: return (False, None) except: record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc()) # Return None and None as the algorithm could not determine True or False if return_anomalies: return (False, None, anomalies) else: return (False, None) if return_anomalies: return (anomalous, anomalyScore, anomalies) else: return (anomalous, anomalyScore)
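# --- Added usage sketch (not part of the original source) ---
# A minimal, self-contained sketch of the core detection pattern used in the
# custom algorithm above: build a datetime-indexed DataFrame from a
# [timestamp, value] timeseries, validate it for adtk, run LevelShiftAD and
# convert the anomalous index entries back to unix timestamps with the same
# astype(np.int64) // 10**9 idiom. The example data is made up and the c and
# window values simply mirror the defaults referenced above; this is an
# illustration of typical usage, not the author's wiring.
import numpy as np
import pandas as pd
from adtk.data import validate_series
from adtk.detector import LevelShiftAD

timeseries = [[1609459200 + (i * 60), 1.0 if i < 500 else 10.0] for i in range(1000)]
df = pd.DataFrame(timeseries, columns=['date', 'value'])
df['date'] = pd.to_datetime(df['date'], unit='s')
df = df.set_index(pd.DatetimeIndex(df['date'].values))
df.drop('date', axis=1, inplace=True)
s = validate_series(df)

level_shift_ad = LevelShiftAD(c=9.9, side='both', window=5)
anomaly_df = level_shift_ad.fit_detect(s)
anomalies = anomaly_df.loc[anomaly_df['value'] > 0]
anomaly_timestamps = list(anomalies.index.astype(np.int64) // 10**9)
print('%s anomalous timestamps detected' % str(len(anomaly_timestamps)))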
def get_external_alert_configs(current_skyline_app): """ Return the external alert configs fetched from any sources declared in :mod:`settings.EXTERNAL_ALERTS`, the internal alert configs built from :mod:`settings.ALERTS`, an all_alerts list which is a concatenated and deduplicated list of both, and whether each was retrieved from cache or from its source. :param current_skyline_app: the app calling the function so the function knows which log to write to. :type current_skyline_app: str :return: (external_alert_configs, external_from_cache, internal_alert_configs, internal_from_cache, all_alerts, all_from_cache) :rtype: (dict, boolean, dict, boolean, tuple, boolean) """ debug_get_external_alert_configs = None # Set the default dicts to return external_alert_configs = {} # Set the default dict to return internal_alert_configs = {} # Set the default all_alerts to return all_alerts = list(settings.ALERTS) all_alert_configs = None # Set the default external_from_cache to return external_from_cache = None # Set the default internal_from_cache to return internal_from_cache = None # Set the default all_from_cache to return all_from_cache = None last_known_redis_key = 'skyline.last_known.external_alert_configs' # Get the logger current_skyline_app_logger = str(current_skyline_app) + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) # Define the items that are expected in the external alert config json EXTERNAL_ALERTS_JSON_ITEMS = ('alerter', 'expiration', 'namespace', 'namespace_prefix', 'second_order_resolution', 'second_order_resolution_hours', 'learn_days', 'inactive_after') OPTIONAL_EXTERNAL_ALERTS_JSON_ITEMS = ('namespace_prefix', 'second_order_resolution_hours', 'learn_days', 'inactive_after') try: EXTERNAL_ALERTS = settings.EXTERNAL_ALERTS if debug_get_external_alert_configs: current_logger.debug( 'debug :: get_external_alert_configs settings.EXTERNAL_ALERTS is defined' ) except: return (external_alert_configs, external_from_cache, internal_alert_configs, internal_from_cache, tuple(all_alerts), all_from_cache) redis_conn_decoded = None try: redis_conn_decoded = get_redis_conn_decoded(current_skyline_app) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to get decoded Redis connection' ) # The all_alert_configs Redis key is cached for 60 seconds, if found return # as it is all that is needed redis_key = 'skyline.all_alert_configs' raw_all_alert_configs = None if redis_conn_decoded: try: raw_all_alert_configs = redis_conn_decoded.get(redis_key) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to query Redis for skyline.all_alert_configs' ) if raw_all_alert_configs: try: all_alert_configs = literal_eval(raw_all_alert_configs) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to literal_eval skyline.all_alert_configs' ) if all_alert_configs: # Set that the all_alert_configs was fetched from cache all_from_cache = True return (external_alert_configs, external_from_cache, internal_alert_configs, internal_from_cache, all_alert_configs, all_from_cache) redis_key = 'skyline.external_alert_configs' raw_external_alert_configs = None if redis_conn_decoded: try: raw_external_alert_configs = redis_conn_decoded.get(redis_key) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to query Redis for skyline.external_alert_configs' ) if raw_external_alert_configs: try: external_alert_configs = literal_eval(raw_external_alert_configs) except:
current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to literal_eval skyline.external_alert_configs' ) if external_alert_configs: # Set that the external_alert_config was fetched from cache external_from_cache = True if redis_conn_decoded: try: redis_conn_decoded.set(last_known_redis_key, str(external_alert_configs)) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to set %s Redis key' % last_known_redis_key) redis_key = 'skyline.internal_alert_configs' raw_internal_alert_configs = None if redis_conn_decoded: try: raw_internal_alert_configs = redis_conn_decoded.get(redis_key) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to query Redis for skyline.internal_alert_configs' ) if raw_internal_alert_configs: try: internal_alert_configs = literal_eval(raw_internal_alert_configs) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to literal_eval skyline.internal_alert_configs' ) if internal_alert_configs: # Set that the external_alert_config was fetched from cache internal_from_cache = True # If the external_alert_config was not fectched from cache build it if not external_alert_configs: for external_alert_config in EXTERNAL_ALERTS: external_alert_config_url = None try: external_alert_config_url = EXTERNAL_ALERTS[ external_alert_config]['url'] except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: could not determine url from EXTERNAL_ALERTS[\'%s\'][\'url\']' % (str(external_alert_config))) continue external_alert_config_method = None try: external_alert_config_method = EXTERNAL_ALERTS[ external_alert_config]['method'] except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: could not determine url from EXTERNAL_ALERTS[\'%s\'][\'method\']' % (str(external_alert_config))) continue external_alert_config_post_data = None if external_alert_config_method == 'POST' or external_alert_config_method == 'post': try: external_alert_config_post_data = EXTERNAL_ALERTS[ external_alert_config]['data'] except: external_alert_config_post_data = None external_alert_json = None try: current_logger.info( 'get_external_alert_configs :: retrieving alert config json for %s from %s via %s' % (str(external_alert_config), str(external_alert_config_url), str(external_alert_config_method))) if external_alert_config_method == 'GET': r = requests.get(external_alert_config_url, timeout=2) if external_alert_config_method == 'POST': header = {"content-type": "application/json"} if external_alert_config_post_data: r = requests.post( external_alert_config_url, data=json.dumps(external_alert_config_post_data), headers=header, timeout=2) else: r = requests.post(external_alert_config_url, headers=header, timeout=2) external_alert_json = r.json() except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: could not retrieve json from the url - %s' % str(external_alert_config_url)) continue if not external_alert_json: current_logger.error( 'error :: get_external_alert_configs :: did not retrieve json from the url - %s' % str(external_alert_config_url)) continue for alerter_id in external_alert_json['data']: config_id = 'external-%s' % str(alerter_id) alerter_config = {'id': config_id} 
namespace_prefix = None namespace = None for key in EXTERNAL_ALERTS_JSON_ITEMS: try: if key == 'namespace_prefix': try: namespace_prefix = external_alert_json['data'][ alerter_id][key] except: namespace_prefix = None elif key == 'namespace': namespace = external_alert_json['data'][ alerter_id][key] else: alerter_config[key] = external_alert_json['data'][ alerter_id][key] except: if key in OPTIONAL_EXTERNAL_ALERTS_JSON_ITEMS: if key == 'inactive_after': alerter_config[key] = 7200 continue else: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: could not determine %s from json - %s' % (key, str(alerter_id))) alerter_config = {} break if alerter_config: try: if namespace_prefix == namespace: full_namespace_str = namespace else: if namespace_prefix is None: full_namespace_str = namespace else: full_namespace_str = '%s.%s' % ( namespace_prefix, namespace) full_namespace = full_namespace_str.replace( ',', '.') alerter_config['namespace'] = full_namespace except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to interpolation full_namespace from namespace_prefix and namespace in the json - %s' % str(external_alert_json['data'][alerter_id])) continue try: alerter_config['type'] = 'external' except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to add type external to alerter_config' ) continue try: external_alert_configs[alerter_id] = alerter_config except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: could not add alert_config dict to external_alert_configs dict from json - %s' % str(external_alert_json['data'][alerter_id])) continue # If the key expired and no alerter_configs were constructed from the # external source then use the last known good external_alert_configs last_good_external_alert_configs = None if not external_alert_configs: if redis_conn_decoded: last_good_raw_external_alert_configs = None try: last_good_raw_external_alert_configs = redis_conn_decoded.get( last_known_redis_key) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to query Redis for %s' % last_known_redis_key) last_good_external_alert_configs = None if last_good_raw_external_alert_configs: try: last_good_external_alert_configs = literal_eval( last_good_raw_external_alert_configs) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to literal_eval skyline.last_known.external_alert_configs' ) if last_good_external_alert_configs: current_logger.info( 'get_external_alert_configs :: failed to construct the external_alert_configs using skyline.last_known.external_alert_configs' ) external_alert_configs = last_good_external_alert_configs external_from_cache = True # Build the all_alerts list by contenating the external_alert_configs new_all_alerts = [] if external_alert_configs: # external smtp alerts # All set to no_email in analyzer and mirage_alerters as every alert # must be routed through the smtp workflow, even if it does not send a # smtp alert, as the smtp alert route creates the the training data # resources. 
for external_alert_config in external_alert_configs: config_id = None namespace = None expiration = None second_order_resolution = None second_order_resolution_hours = None try: config_id = external_alert_configs[external_alert_config]['id'] except: continue try: namespace = external_alert_configs[external_alert_config][ 'namespace'] except: continue try: expiration = int(external_alert_configs[external_alert_config] ['expiration']) except: continue try: second_order_resolution = int( external_alert_configs[external_alert_config] ['second_order_resolution']) second_order_resolution_hours = int(second_order_resolution / 3600) except: continue # First add an smtp no_email alerter for the external_alert_config # this is required to route anomalies through the training_data # resources creation workflow # alert = ('metric5.thing.*.rpm', 'smtp', 900, 168), new_all_alerts.append([ namespace, 'smtp', expiration, second_order_resolution_hours, external_alert_configs[external_alert_config] ]) # internal smtp alerts for index, alert in enumerate(settings.ALERTS): # alert = ('metric5.thing.*.rpm', 'smtp', 900, 168), if str(alert[1]) == 'smtp': try: second_order_resolution_hours = int(alert[3]) second_order_resolution = second_order_resolution_hours * 3600 except: second_order_resolution = 0 config_id = 'internal-%s' % str(index) internal_alert_config = { 'id': config_id, 'alerter': alert[1], 'namespace': alert[0], 'expiration': alert[2], 'second_order_resolution': second_order_resolution, 'inactive_after': 7200, 'type': 'internal' } new_all_alerts.append([ alert[0], alert[1], alert[2], second_order_resolution_hours, internal_alert_config ]) try: internal_alert_configs[index] = internal_alert_config except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: could not add internal_alert_config dict to internal_alert_configs dict' ) continue # external alerts - non-smtp if external_alert_configs: for external_alert_config in external_alert_configs: config_id = None alerter = None namespace = None expiration = None second_order_resolution = None second_order_resolution_hours = 0 try: config_id = external_alert_configs[external_alert_config]['id'] except: continue try: alerter = external_alert_configs[external_alert_config][ 'alerter'] except: continue try: namespace = external_alert_configs[external_alert_config][ 'namespace'] except: continue try: expiration = int(external_alert_configs[external_alert_config] ['expiration']) except: continue try: second_order_resolution = int( external_alert_configs[external_alert_config] ['second_order_resolution']) second_order_resolution_hours = int(second_order_resolution / 3600) except: continue # First add an smtp no_email alerter for the external_alert_config # this is required to route anomalies through the training_data # resources creation workflow # alert = ('metric5.thing.*.rpm', 'smtp', 900, 168), new_all_alerts.append([ namespace, alerter, expiration, second_order_resolution_hours, external_alert_configs[external_alert_config] ]) # internal non smtp alerts for index, alert in enumerate(settings.ALERTS): # alert = ('metric5.thing.*.rpm', 'smtp', 900, 168), if str(alert[1]) != 'smtp': try: second_order_resolution_hours = int(alert[3]) second_order_resolution = second_order_resolution_hours * 3600 except: second_order_resolution_hours = 0 config_id = 'internal-%s' % str(index) internal_alert_config = { 'id': config_id, 'alerter': alert[1], 'namespace': alert[0], 'expiration': alert[2], 
'second_order_resolution': second_order_resolution, 'inactive_after': 7200, 'type': 'internal' } new_all_alerts.append([ alert[0], alert[1], alert[2], second_order_resolution_hours, internal_alert_config ]) try: internal_alert_configs[index] = internal_alert_config except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: could not add internal_alert_config dict to internal_alert_configs dict' ) continue if new_all_alerts: all_alerts = tuple(new_all_alerts) if redis_conn_decoded and external_alert_configs: if not external_from_cache: redis_key = 'skyline.external_alert_configs' try: redis_conn_decoded.setex(redis_key, 300, str(external_alert_configs)) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to set %s' % redis_key) if redis_conn_decoded and internal_alert_configs: if not internal_from_cache: redis_key = 'skyline.internal_alert_configs' try: redis_conn_decoded.setex(redis_key, 60, str(internal_alert_configs)) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to set %s' % redis_key) if redis_conn_decoded and all_alerts: if not all_from_cache: redis_key = 'skyline.all_alert_configs' try: redis_conn_decoded.setex(redis_key, 60, str(all_alerts)) except: current_logger.error(traceback.format_exc()) current_logger.error( 'error :: get_external_alert_configs :: failed to set %s' % redis_key) return (external_alert_configs, external_from_cache, internal_alert_configs, internal_from_cache, all_alerts, all_from_cache)
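# --- Added usage sketch (not part of the original source) ---
# A minimal sketch of calling get_external_alert_configs, assuming it is
# imported and called from the analyzer app; the app name and the handling
# below are illustrative and not taken from the original source. The function
# returns a 6-tuple and most callers only need the concatenated all_alerts
# and whether it came from the Redis cache.
(external_alert_configs, external_from_cache,
 internal_alert_configs, internal_from_cache,
 all_alerts, all_from_cache) = get_external_alert_configs('analyzer')

for alert in all_alerts:
    # When external or internal configs were built each entry follows
    # [namespace, alerter, expiration, second_order_resolution_hours, config_dict],
    # otherwise the plain settings.ALERTS tuples are returned unchanged, so
    # only the first three positions are relied on here.
    print(alert[0], alert[1], alert[2])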
def run(self): """ Called when the process intializes. """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log # In Vista the log management is handled be fetcher, the worker just # waits for the fetcher to do the log managment now = int(time()) log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = int(time()) else: now = log_wait_for + 1 logger.info('worker :: starting log management') if os.path.isfile(skyline_app_loglock): logger.error( 'error :: worker :: bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os_remove(skyline_app_loglock) logger.info('worker :: log lock file removed') except OSError: logger.error( 'error :: worker :: failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('worker :: bin/%s.d log management done' % skyline_app) logger.info('worker :: starting worker') try: VISTA_ENABLED = settings.VISTA_ENABLED logger.info('worker :: VISTA_ENABLED is set to %s' % str(VISTA_ENABLED)) except: VISTA_ENABLED = False logger.info( 'worker :: warning :: VISTA_ENABLED is not declared in settings.py, defaults to False' ) last_sent_to_graphite = int(time()) metrics_sent_to_flux = 0 # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() if LOCAL_DEBUG: logger.info('worker :: redis is up') except: logger.error( 'worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) metrics_data = [] redis_set = 'vista.fetcher.metrics.json' try: # Get a metric to validate from the Redis set # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # metrics_data = self.redis_conn.smembers(redis_set) metrics_data = self.redis_conn_decoded.smembers(redis_set) if LOCAL_DEBUG: logger.info('worker :: got redis set data - %s' % redis_set) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: retrieving Redis set %s data' % str(redis_set)) if not metrics_data: if LOCAL_DEBUG: logger.info('worker :: no data from Redis set %s' % str(redis_set)) sleep(5) for str_metric_data in metrics_data: delete_set_record = False remote_host_type = None try: # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Rather using get_redis_conn_decoded # if python_version == 3: # str_metric_data = str_metric_data.decode('UTF-8') metric_data = literal_eval(str_metric_data) remote_host_type = str(metric_data[0]['remote_host_type']) if LOCAL_DEBUG: logger.info( 'worker :: got data from Redis set for remote_host_type %s' % str(remote_host_type)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine remote_host_type from %s' % str(str_metric_data)) delete_set_record = True if not delete_set_record: try: remote_target = str(metric_data[0]['remote_target']) if LOCAL_DEBUG: logger.info( 'worker :: got data from Redis set for target %s' % 
str(remote_target)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine target from %s' % str(str_metric_data)) delete_set_record = True metric = None if not delete_set_record: try: metric = str(metric_data[0]['metric']) if LOCAL_DEBUG: logger.info( 'worker :: got data from Redis set for metric %s' % str(metric)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine metric from %s' % str(str_metric_data)) delete_set_record = True namespace_prefix = '' if not delete_set_record: try: namespace_prefix = str( metric_data[0]['namespace_prefix']) namespace_prefix = '%s.' % namespace_prefix if not namespace_prefix: namespace_prefix = '' if LOCAL_DEBUG: logger.info( 'worker :: got data from Redis set for namespace_prefix %s' % str(namespace_prefix)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine namespace_prefix from %s' % str(str_metric_data)) delete_set_record = True have_data = False if not delete_set_record: last_flux_metric_data = None cache_key = 'flux.last.%s' % (metric) try: if python_version == 3: redis_last_flux_metric_data = self.redis_conn.get( cache_key).decode('UTF-8') else: redis_last_flux_metric_data = self.redis_conn.get( cache_key) redis_last_flux_metric_data = redis_last_flux_metric_data last_flux_metric_data = literal_eval( redis_last_flux_metric_data) if LOCAL_DEBUG: logger.info( 'worker :: got last_flux_metric_data from Redis' ) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: retrieving Redis key %s data' % str(cache_key)) last_flux_metric_data = False last_flux_timestamp = None if last_flux_metric_data: try: last_flux_timestamp = int(last_flux_metric_data[0]) if LOCAL_DEBUG: logger.info( 'worker :: got last_flux_timestamp - %s' % str(last_flux_timestamp)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed determining last_flux_timestamp' ) last_flux_timestamp = False # Determine the timestamp of the current minute to apply # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE time_now = int(time()) # current_minute = datetime.datetime.utcfromtimestamp(time_now).strftime('%Y-%m-%d %H:%M') current_minute_hour = int( datetime.datetime.utcfromtimestamp(time_now).strftime( '%H')) current_minute_minute = int( datetime.datetime.utcfromtimestamp(time_now).strftime( '%M')) current_datetime = datetime.datetime.utcfromtimestamp( time_now).replace(hour=current_minute_hour, minute=current_minute_minute, second=0, microsecond=0) current_minute_timestamp_start = int( current_datetime.strftime('%s')) datapoint = None last_timestamp_with_data = None timeseries = [] # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data metric_resolution = 60 metric_resolution_determined = False try: if python_version == 3: datapoints_str = literal_eval( metric_data[0]['datapoints']) metric_datapoints = literal_eval(datapoints_str) else: # metric_datapoints = metric_data[0]['datapoints'] datapoints_str = literal_eval( metric_data[0]['datapoints']) metric_datapoints = literal_eval(datapoints_str) # for value, timestamp in metric_data[0]['datapoints']: if LOCAL_DEBUG: len_metric_datapoints = len(metric_datapoints) logger.info( 'worker :: got %s metric_datapoints - %s' % (str(len_metric_datapoints), str(metric_datapoints))) # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data # Determine resolution resolution_timestamps = [] for metric_datapoint in 
metric_datapoints: timestamp = int(metric_datapoint[0]) resolution_timestamps.append(timestamp) timestamp_resolutions = [] if resolution_timestamps: last_timestamp = None for timestamp in resolution_timestamps: if last_timestamp: resolution = timestamp - last_timestamp timestamp_resolutions.append(resolution) last_timestamp = timestamp else: last_timestamp = timestamp if timestamp_resolutions: try: timestamp_resolutions_count = Counter( timestamp_resolutions) ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common( ) metric_resolution = int( ordered_timestamp_resolutions_count[0][0]) if metric_resolution > 0: metric_resolution_determined = True except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine metric_resolution from %s' % (str(metric_data))) if metric_resolution_determined: cache_key = 'vista.last.resolution.%s' % metric try: # Update Redis key self.redis_conn.setex(cache_key, 3600, metric_resolution) except: logger.error(traceback.format_exc()) logger.error( 'error :: fetcher :: failed to set Redis key - %s' % (cache_key)) for metric_datapoint in metric_datapoints: # @20191010 - Branch #3140: vista # fetcher passes through preformatted data points that # are in the same format/order for both graphite and # prometheus # if remote_host_type == 'graphite': # value = float(metric_datapoint[0]) # timestamp = int(metric_datapoint[1]) # if remote_host_type == 'prometheus': # value = float(metric_datapoint[1]) # timestamp = int(metric_datapoint[0]) timestamp = int(metric_datapoint[0]) value = float(metric_datapoint[1]) append_to_timeseries = False if last_flux_timestamp: if int(timestamp) > last_flux_timestamp: # timeseries.append([timestamp, value]) append_to_timeseries = True else: # timeseries.append([timestamp, value]) append_to_timeseries = True # Here if the timestamp of the data point falls # within the current minute, it is discarded and not # sent to flux, to ensure that high frequency metrics # can have their minutely bins fully populated before # they are submitted to Graphite if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE: if int(timestamp ) >= current_minute_timestamp_start: append_to_timeseries = False if append_to_timeseries: timeseries.append([timestamp, value]) last_timestamp_with_data = 0 for timestamp, value in timeseries[::-1]: has_value = False if value == 0.0: has_value = True if value: has_value = True if has_value: last_timestamp_with_data = int(timestamp) datapoint = value break if last_timestamp_with_data: have_data = True except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine datapoints from %s' % (str(metric_data))) delete_set_record = True if not timeseries: logger.info( 'worker :: after processing, there were no valid data points in %s' % (str(metric_data))) delete_set_record = True if not have_data and timeseries: logger.error( 'error :: worker :: failed to determine last_timestamp_with_data from %s' % (str(metric_data))) delete_set_record = True if delete_set_record: try: redis_set = 'vista.fetcher.metrics.json' self.redis_conn.srem(redis_set, str_metric_data) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to delete data from Redis set %s, data - ' % (str(redis_set), str(str_metric_data))) continue if not metric: continue valid_data = True if last_flux_timestamp and last_timestamp_with_data: if int(last_timestamp_with_data) <= last_flux_timestamp: valid_data = False if not valid_data: redis_set = 
'vista.fetcher.metrics.json' logger.info( 'worker :: no valid data in fetched data removing from Redis set %s - data - %s' % (redis_set, str(str_metric_data))) try: self.redis_conn.srem(redis_set, str_metric_data) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to delete data from Redis set %s, data - %s' % (redis_set, str(str_metric_data))) continue if valid_data: flux_host = 'http://%s:%s' % (settings.FLUX_IP, settings.FLUX_PORT) # Resample resample_at = None if resample_at == 'none' or resample_at == '0Min': resample_at = False if resample_at == 'None' or resample_at == '0min': resample_at = False if resample_at is None or resample_at == '0' or resample_at == 0: resample_at = False if resample_at: try: df = pd.DataFrame(timeseries) df.columns = ['timestamp', 'value'] df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix') df = df.set_index('timestamp') resampled_df = df.resample(resample_at).sum() resampled_timeseries = [] for index, row in resampled_df.iterrows(): timestamp = int(index.strftime('%s')) resampled_timeseries.append( [timestamp, row[0]]) timeseries = resampled_timeseries timeseries_length = len(timeseries) logger.info( 'worker :: time series resampled at %s resulting in %s data points to send to Graphite' % (str(resample_at), str(timeseries_length))) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to resample time series at %s for %s with time series %s' % (str(resample_at), str(metric), str(timeseries))) for timestamp, value in timeseries: flux_url = '%s/metric_data?metric=%s&value=%s×tamp=%s&key=%s' % ( flux_host, metric, str(datapoint), str(timestamp), settings.FLUX_SELF_API_KEY) success = False try: response = requests.get(flux_url) if response.status_code == 200: success = True elif response.status_code == 204: success = True except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to request %s' % str(flux_url)) if not success: logger.error( 'error :: worker :: http status code - %s, reason - %s' % (str(response.status_code), str(response.reason))) if success: metrics_sent_to_flux += 1 redis_set = 'vista.fetcher.metrics.json' # @added 20191011 - Task #3258: Reduce vista logging timeseries_length = len(timeseries) # @modified 20191011 - Task #3258: Reduce vista logging # logger.info('worker :: data submitted to flux OK, removing data from Redis set %s' % ( # redis_set)) logger.info( 'worker :: %s data points submitted to flux OK for %s' % (str(timeseries_length), metric)) try: self.redis_conn.srem(redis_set, str_metric_data) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to delete data from Redis set %s, data - %s' % (redis_set, str(str_metric_data))) redis_set = 'vista.fetcher.unique_metrics' try: self.redis_conn.sadd(redis_set, remote_target) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to add %s to Redis set %s' % (remote_target, redis_set)) time_now = int(time()) if (time_now - last_sent_to_graphite) >= 60: logger.info( 'worker :: metrics sent_to_flux in last 60 seconds - %s' % str(metrics_sent_to_flux)) send_metric_name = '%s.metrics_sent_to_flux' % skyline_app_graphite_namespace try: send_graphite_metric(parent_skyline_app, send_metric_name, str(metrics_sent_to_flux)) last_sent_to_graphite = int(time()) metrics_sent_to_flux = 0 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % 
(send_metric_name, str(metrics_sent_to_flux)))
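# --- Added sketch (not part of the original source) ---
# A small, self-contained illustration of the resolution-detection technique
# the worker uses above: take the deltas between consecutive timestamps and
# pick the most common delta with collections.Counter. The helper name and
# example data are made up; the real worker reads its datapoints from the
# vista.fetcher.metrics.json Redis set.
from collections import Counter


def determine_resolution(timestamps, default_resolution=60):
    """Return the most common interval between consecutive timestamps."""
    resolutions = [b - a for a, b in zip(timestamps, timestamps[1:])]
    if not resolutions:
        return default_resolution
    most_common_resolution = int(Counter(resolutions).most_common()[0][0])
    if most_common_resolution > 0:
        return most_common_resolution
    return default_resolution


print(determine_resolution([1609459200 + (i * 60) for i in range(10)]))  # -> 60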
def get_boundary_metrics(current_skyline_app, metrics, namespaces, cluster_data=False, log=False): """ Determine all the boundary metrics and return a dictionary of them and their algorithms. :param current_skyline_app: the app calling the function :param metrics: a list of base_names :param namespaces: a list of namespace pattern to match :param cluster_data: whether this is a cluster_data request, optional, defaults to False :param log: whether to log or not, optional, defaults to False :type current_skyline_app: str :type metrics: list :type namespace: list :type cluster_data: boolean :type log: boolean :return: boundary_metrics :rtype: dict """ boundary_metrics = {} function_str = 'get_boundary_metrics' filter_by_metrics = [] if log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.info('%s :: %s :: determining boundary_metrics' % (current_skyline_app, function_str)) else: current_logger = None try: redis_conn_decoded = get_redis_conn_decoded(current_skyline_app) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: %s :: get_redis_conn_decoded failed - %s' % (current_skyline_app, function_str, e)) raise boundary_metrics_redis_dict = {} try: boundary_metrics_redis_dict = redis_conn_decoded.hgetall( 'metrics_manager.boundary_metrics') if log: current_logger.info( '%s :: %s :: got %s boundary metrics from metrics_manager.boundary_metrics' % (current_skyline_app, function_str, str(len(boundary_metrics_redis_dict)))) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get Redis hash key metrics_manager.boundary_metrics - %s' % (function_str, e)) raise boundary_metrics = boundary_metrics_redis_dict.copy() remote_boundary_metrics = [] if settings.REMOTE_SKYLINE_INSTANCES and cluster_data: boundary_metrics_uri = 'boundary_metrics' try: remote_boundary_metrics = get_cluster_data(boundary_metrics_uri, 'boundary_metrics') except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get boundary_metrics from remote instances - %s' % (function_str, e)) raise if remote_boundary_metrics: if log: current_logger.info( 'got %s remote boundary_metrics from the remote Skyline instances' % str(len(remote_boundary_metrics))) for remote_data in remote_boundary_metrics: for base_name in list(remote_data.keys()): boundary_metrics[base_name] = remote_data[base_name] if metrics: for metric in metrics: filter_by_metrics.append(metric) unique_base_names = [] if namespaces: redis_key = 'analyzer.metrics_manager.db.metric_names' try: unique_base_names = list(redis_conn_decoded.smembers(redis_key)) if unique_base_names: if log: current_logger.info( '%s :: %s :: got %s unique_base_names' % (current_skyline_app, function_str, str(len(unique_base_names)))) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 
'error :: %s :: %s :: failed to get Redis key %s - %s' % (current_skyline_app, function_str, redis_key, e)) raise for base_name in unique_base_names: try: pattern_match, metric_matched_by = matched_or_regexed_in_list( current_skyline_app, base_name, namespaces) if pattern_match: filter_by_metrics.append(base_name) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger( current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: %s :: matched_or_regexed_in_list failed for %s - %s' % (current_skyline_app, function_str, base_name, e)) if filter_by_metrics: if log: current_logger.info('%s :: %s :: filtering on %s metrics' % (current_skyline_app, function_str, str(len(filter_by_metrics)))) filtered_boundary_metrics = {} for base_name in list(set(filter_by_metrics)): boundary_metric_dict = None try: boundary_metric_dict = boundary_metrics_redis_dict[base_name] except: continue if boundary_metric_dict: filtered_boundary_metrics[base_name] = boundary_metric_dict if filtered_boundary_metrics: boundary_metrics = filtered_boundary_metrics.copy() if log: current_logger.info( '%s :: %s :: filtered %s boundary_metrics' % (current_skyline_app, function_str, str(len(boundary_metrics)))) return boundary_metrics
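# --- Added usage sketch (not part of the original source) ---
# A minimal sketch of calling get_boundary_metrics; the app name, namespace
# pattern and handling are illustrative assumptions, not taken from the
# original source. Namespaces are matched with matched_or_regexed_in_list, so
# either dotted namespace elements or regex patterns can be passed.
boundary_metrics = get_boundary_metrics(
    'webapp', [], [r'telegraf\..*\.cpu'], cluster_data=False, log=True)

for base_name, boundary_metric_config in boundary_metrics.items():
    # Each value is the entry stored for the metric in the
    # metrics_manager.boundary_metrics Redis hash
    print(base_name, boundary_metric_config)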
def run(self): """ Called when the process intializes. """ def pickle_data_to_graphite(data): message = None try: payload = pickle.dumps(data, protocol=2) header = struct.pack("!L", len(payload)) message = header + payload except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to pickle to send to Graphite' ) return False if message: try: sock = socket.socket() sock.connect((CARBON_HOST, FLUX_CARBON_PICKLE_PORT)) sock.sendall(message) sock.close() except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to send pickle data to Graphite' ) return False else: logger.error( 'error :: populate_metric_worker :: failed to pickle metric data into message' ) return False return True logger.info('populate_metric_worker :: starting worker') # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error( 'populate_metric_worker :: cannot connect to Redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) self.redis_conn = get_redis_conn(skyline_app) # @added 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) metricDict = None try: # Get a metric from the queue with a 1 second timeout, each # metric item on the queue is a list e.g. 
# metric_json = [metricName, metricValue, metricTimestamp] metricDict = self.q.get(True, 1) logger.info('populate_metric_worker :: processing queue item') except Empty: logger.info( 'populate_metric_worker :: queue is empty and timed out, sleeping for 30 seconds' ) sleep(30) except NotImplementedError: pass except KeyboardInterrupt: logger.info( 'populate_metric_worker :: server has been issued a user signal to terminate - KeyboardInterrupt' ) except SystemExit: logger.info( 'populate_metric_worker :: server was interrupted - SystemExit' ) except Exception as e: logger.error('error :: populate_metric_worker :: %s' % (str(e))) if not metricDict: continue try: remote_host_type = str(metricDict['remote_host_type']) remote_target = str(metricDict['remote_target']) metric = str(metricDict['metric']) namespace_prefix = str(metricDict['namespace_prefix']) if not namespace_prefix: namespace_prefix = '' if namespace_prefix == 'None': namespace_prefix = '' key = str(metricDict['key']) token = str(metricDict['token']) user = str(metricDict['user']) password = str(metricDict['password']) if metricDict['fetch_resolution_urls'] == 'None': logger.info( 'No fetch_resolution_urls declared for %s, nothing to do' % remote_target) continue if metricDict['fetch_resolution_urls'] == '()' or metricDict[ 'fetch_resolution_urls'] == (): logger.info( 'No fetch_resolution_urls declared for %s, nothing to do' % remote_target) continue fetch_resolution_urls_str = literal_eval( metricDict['fetch_resolution_urls']) fetch_resolution_urls = literal_eval(fetch_resolution_urls_str) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to read from metricData' ) if LOCAL_DEBUG: try: logger.info( 'populate_metric_worker :: remote_target from metricData set to %s' % remote_target) logger.info( 'populate_metric_worker :: metric from metricData set to %s' % metric) logger.info( 'populate_metric_worker :: namespace_prefix from metricData set to %s' % namespace_prefix) logger.info( 'populate_metric_worker :: key from metricData set to %s' % key) logger.info( 'populate_metric_worker :: token from metricData set to %s' % token) logger.info( 'populate_metric_worker :: user from metricData set to %s' % user) logger.info( 'populate_metric_worker :: password from metricData set to %s' % password) logger.info( 'populate_metric_worker :: fetch_resolution_urls from metricData set to %s' % str(fetch_resolution_urls)) if fetch_resolution_urls: for fetch_url in fetch_resolution_urls: logger.info( 'populate_metric_worker :: a fetch_url from metricData is set to %s' % str(fetch_url)) logger.info( 'populate_metric_worker :: metric is set to %s' % metric) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to read from metricData' ) # Best effort to de-duplicate the data sent to Graphite cache_key = 'flux.last.%s' % metric last_flux_timestamp = None try: # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # redis_last_metric_data = self.redis_conn.get(cache_key).decode('utf-8') redis_last_metric_data = self.redis_conn_decoded.get(cache_key) last_metric_data = literal_eval(redis_last_metric_data) last_flux_timestamp = int(last_metric_data[0]) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to determine last_flux_timestamp from Redis key %s' % cache_key) last_flux_timestamp = False recent_last_flux_timestamp_present = False if last_flux_timestamp: now = 
int(time()) if (now - last_flux_timestamp) < 600: recent_last_flux_timestamp_present = True # Skyline has the metric so adding it to the vista.fetcher # Redis set redis_set = 'vista.fetcher.unique_metrics' data = str(remote_target) try: self.redis_conn.sadd(redis_set, data) logger.info( 'populate_metric_worker :: the last flux update for %s was less than 600 seconds ago, added metric to %s' % (metric, redis_set)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to add %s to Redis set %s' % (str(data), str(redis_set))) # continue if not last_flux_timestamp: # Check Graphite does not have the data or determine what the # last data Graphite has is logger.info( 'populate_metric_worker :: no last_flux_timestamp was found in Redis for %s, checking if Graphite has data' % (metric)) check_graphite_from = [ '-50mins', '-6hours', '-24hours', '-7days', '-30days', '-90days' ] timeseries = [] for graphite_from in check_graphite_from: if last_flux_timestamp: break logger.info( 'populate_metric_worker :: checking %s in Graphite from %s' % (metric, graphite_from)) got_data = False try: # We use absolute time so that if there is a lag in mirage the correct # timeseries data is still surfaced relevant to the anomalous datapoint # timestamp if settings.GRAPHITE_PORT != '': url = '%s://%s:%s/%s/?from=%s&target=%s&format=json' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, str(settings.GRAPHITE_PORT), settings.GRAPHITE_RENDER_URI, graphite_from, metric) else: url = '%s://%s/%s/?from=%s&target=%s&format=json' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, settings.GRAPHITE_RENDER_URI, graphite_from, metric) logger.info( 'populate_metric_worker :: using Graphite URL - %s' % (url)) r = requests.get(url) if r.status_code == 200: js = [] try: js = r.json() except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from Graphite' ) continue if not js: logger.info( 'populate_metric_worker :: %s not present in Graphite from %s' % (metric, graphite_from)) continue got_data = True logger.info( 'populate_metric_worker :: %s present in Graphite from %s' % (metric, graphite_from)) else: logger.info( 'populate_metric_worker :: %s not present in Graphite from %s' % (metric, graphite_from)) continue except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from Graphite' ) continue datapoints = [] if got_data: try: js = r.json() datapoints = js[0]['datapoints'] logger.info( 'populate_metric_worker :: %s data points are present in the Graphite %s data' % (str(len(datapoints)), str(graphite_from))) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from Graphite' ) for datapoint in datapoints: try: value = float(datapoint[0]) timestamp = int(datapoint[1]) new_datapoint = [timestamp, value] timeseries.append(new_datapoint) except: # nosec continue last_timestamp_with_data = None for timestamp, value in timeseries[::-1]: has_value = False if value == 0.0: has_value = True if value == 0: has_value = True if value: has_value = True if has_value: last_timestamp_with_data = int(timestamp) datapoint = value break if last_timestamp_with_data: # Here we set this as the missing last_flux_timestamp last_flux_timestamp = last_timestamp_with_data recent_last_flux_timestamp_present = True logger.info( 'populate_metric_worker :: %s last timestamp in Graphite from %s is %s, using as last_flux_timestamp' % 
(metric, str(graphite_from), str(last_flux_timestamp))) timeseries = [] start_populating = int(time()) datapoints_added_to_timeseries = 0 datapoints_already_populated = 0 datapoints_with_no_value = 0 timestamp = None value = None # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints # And set flux.last key is the returned value from the remote is # null so that time series that are mostly null do not keep on # getting added to flux populate_metric by Vista raw_timeseries = [] for fetch_url in fetch_resolution_urls: # if recent_last_flux_timestamp_present and remote_host_type == 'prometheus': # This was for the query query and resample method and not for # the query_range query if recent_last_flux_timestamp_present and remote_host_type == 'prometheus_query_range_NOT_FOR_GE_11000': try: logger.info( 'populate_metric_worker :: recent data so replacing fetch_url %s ' % (fetch_url)) seconds_to_fetch = int(time()) - last_flux_timestamp minutes_to_fetch = int(seconds_to_fetch / 60) + 2 re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch) fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch, fetch_url) encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str( minutes_to_fetch) fetch_url = re.sub(r'%5B.*%5D', encoded_re_mins_to_fetch, fetch_url) logger.info( 'populate_metric_worker :: replaced fetch_url %s ' % (fetch_url)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to rewrite URL' ) if recent_last_flux_timestamp_present and remote_host_type == 'prometheus': try: logger.info( 'populate_metric_worker :: recent data so replacing fetch_url %s ' % (fetch_url)) seconds_to_fetch = int(time()) - last_flux_timestamp minutes_to_fetch = int(seconds_to_fetch / 60) + 2 re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch) fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch, fetch_url) encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str( minutes_to_fetch) fetch_url = re.sub(r'%5B.*%5D', encoded_re_mins_to_fetch, fetch_url) logger.info( 'populate_metric_worker :: replaced fetch_url %s ' % (fetch_url)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to rewrite URL' ) success = False try: logger.info( 'populate_metric_worker :: getting data from %s' % str(fetch_url)) response = requests.get(fetch_url) if response.status_code == 200: success = True except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: http status code - %s, reason - %s' % (str(response.status_code), str(response.reason))) logger.error( 'error :: populate_metric_worker :: failed to get data from %s' % str(fetch_url)) if not success: continue datapoints = None try: js = response.json() if remote_host_type == 'graphite': datapoints = js[0]['datapoints'] if remote_host_type == 'prometheus': datapoints = js['data']['result'][0]['values'] datapoints_fetched = len(datapoints) logger.info( 'populate_metric_worker :: retrieved %s data points from %s' % (str(datapoints_fetched), str(fetch_url))) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from %s' % str(fetch_url)) # Example # datapoints[0] # [7.3, 1556817000] # Add each data point and timestamp to the timeseries list so # they can be sent to Graphite if not datapoints: logger.info( 'populate_metric_worker :: failed to get any data from %s' % str(fetch_url)) continue # @added 20191108 - Bug #3312: flux - populate_metric_worker - handle None in datapoints valid_datapoints = [] for datapoint 
in datapoints: value = None timestamp = None if remote_host_type == 'graphite': # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints raw_timeseries.append([datapoint[1], datapoint[0]]) try: raw_value = datapoint[0] if raw_value is None: datapoints_with_no_value += 1 continue value = float(datapoint[0]) timestamp = int(datapoint[1]) valid_datapoints.append([value, timestamp]) except: continue if remote_host_type == 'prometheus': # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints raw_timeseries.append([datapoint[0], datapoint[1]]) try: raw_value = datapoint[1] if raw_value is None: datapoints_with_no_value += 1 continue timestamp = int(datapoint[0]) value = float(datapoint[1]) except: continue valid_datapoints.append([timestamp, value]) datapoints = valid_datapoints # Order the time series by timestamp as the tuple can shift # order resulting in more recent data being added before older # data datapoints.sort() # Determine the timestamp of the current minute to apply # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE time_now = int(time()) current_minute_hour = int( datetime.datetime.utcfromtimestamp(time_now).strftime( '%H')) current_minute_minute = int( datetime.datetime.utcfromtimestamp(time_now).strftime( '%M')) current_datetime = datetime.datetime.utcfromtimestamp( time_now).replace(hour=current_minute_hour, minute=current_minute_minute, second=0, microsecond=0) current_minute_timestamp_start = int( current_datetime.strftime('%s')) datapoints_in_current_minute = 0 last_error = None value = None timestamp = None for datapoint in datapoints: try: if remote_host_type == 'graphite': try: raw_value = datapoint[0] if raw_value is None: continue value = float(datapoint[0]) timestamp = int(datapoint[1]) except: continue if remote_host_type == 'prometheus': # timestamp = int(datapoint[0]) try: timestamp = int(datapoint[0]) value = float(datapoint[1]) except: continue submit_data = True if last_flux_timestamp: if timestamp <= last_flux_timestamp: submit_data = False datapoints_already_populated += 1 # Here if the timestamp of the data point falls # within the current minute, it is discarded and not # sent to flux, to ensure that high frequency metrics # can have their minutely bins fully populated before # they are submitted to Graphite if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE: if timestamp >= current_minute_timestamp_start: submit_data = False datapoints_in_current_minute += 1 if submit_data: new_datapoint = [timestamp, value] timeseries.append(new_datapoint) datapoints_added_to_timeseries += 1 # nosec to exclude from bandit tests except: # nosec last_error = traceback.format_exc() datapoints_with_no_value += 1 continue if last_error: logger.error(last_error) logger.error( 'error :: populate_metric_worker :: the above is the last_error encountered processing %s' % (str(metric))) if datapoints_with_no_value: logger.info( 'populate_metric_worker :: %s of the fetched records were discarded as they had value None' % (str(datapoints_with_no_value))) if datapoints_in_current_minute: logger.info( 'populate_metric_worker :: %s of the fetched records were discarded as they fall within the current minute' % (str(datapoints_in_current_minute))) logger.info( 'populate_metric_worker :: %s of the fetched data points are older than the last known flux timestamp' % (str(datapoints_already_populated))) logger.info( 'populate_metric_worker :: added %s data points to the time series to submit to Graphite' % (str(datapoints_added_to_timeseries))) 
end_fecthing = int(time()) seconds_to_fetch = end_fecthing - start_populating if timestamp: logger.info( 'populate_metric_worker :: last fetched value - %s, timestamp %s' % (str(value), str(timestamp))) logger.info( 'populate_metric_worker :: %s data point fecthed for %s in %s seconds' % (str(datapoints_added_to_timeseries), remote_target, str(seconds_to_fetch))) # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints # And set flux.last key is the returned value from the remote is # null so that time series that are mostly null do not keep on # getting added to flux populate_metric by Vista if not timeseries: set_flux_key = False try: sorted_raw_timeseries = sorted(raw_timeseries, key=lambda x: x[0]) last_ts = sorted_raw_timeseries[-1][0] if int(last_ts) > (end_fecthing - 120): if sorted_raw_timeseries[-1][1] is None: set_flux_key = True except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to determine if last value was null' ) if set_flux_key: try: # Update Redis flux key cache_key = 'flux.last.%s' % metric metric_data = [int(last_ts), None] self.redis_conn.set(cache_key, str(metric_data)) logger.info( 'populate_metric_worker :: even though no data points so as to not loop round on this metric, set the metric Redis key - %s - %s' % (cache_key, str(metric_data))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: even though no data points, failed to set Redis key - %s - %s' % (cache_key, str(metric_data))) # Adding to the vista.fetcher.unique_metrics Redis set redis_set = 'vista.fetcher.unique_metrics' data = str(remote_target) try: self.redis_conn.sadd(redis_set, data) logger.info( 'populate_metric_worker :: even though no data points, added %s to Redis set %s' % (remote_target, redis_set)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: even though no data points, failed to add %s to Redis set %s' % (str(data), str(redis_set))) if not timeseries: logger.info( 'populate_metric_worker :: no data in the timeseries list for the time series for %s' % metric) continue # Order the time series by timestamp as the tuple can shift # order resulting in more recent data being added before older # data timeseries.sort() timeseries_length = len(timeseries) # Resample resample_at = '1Min' if resample_at: try: df = pd.DataFrame(timeseries) df.columns = ['timestamp', 'value'] df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix') df = df.set_index('timestamp') # resampled_df = df.resample(resample_at).sum() # Use the mean as Prometheus uses the average in the # query_range API method resampled_df = df.resample(resample_at).mean() resampled_timeseries = [] for index, row in resampled_df.iterrows(): timestamp = int(index.strftime('%s')) resampled_timeseries.append([timestamp, row[0]]) timeseries = resampled_timeseries timeseries_length = len(timeseries) logger.info( 'populate_metric_worker :: time series resampled at %s resulting in %s data points to send to Graphite' % (str(resample_at), str(timeseries_length))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to resample time series for %s' % str(metric)) logger.info( 'populate_metric_worker :: %s data points to send to Graphite' % (str(timeseries_length))) timestamp = None value = None sent_to_graphite = 0 # use_pickle = False use_pickle = True if not use_pickle: for timestamp, value in timeseries: try: 
graphyte.send(metric, float(value), int(timestamp)) sent_to_graphite += 1 if sent_to_graphite % 1000 == 0: logger.info( 'populate_metric_worker :: submitted %s of %s data points to Graphite so far' % (str(sent_to_graphite), str(timeseries_length))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to send metric data to Graphite for %s' % str(metric)) else: listOfMetricTuples = [] try: for timestamp, value in timeseries: tuple_data = (metric, (int(timestamp), float(value))) listOfMetricTuples.append(tuple_data) sent_to_graphite += 1 except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to populate listOfMetricTuples for %s' % str(metric)) if listOfMetricTuples: data_points_sent = 0 smallListOfMetricTuples = [] tuples_added = 0 for data in listOfMetricTuples: smallListOfMetricTuples.append(data) tuples_added += 1 if tuples_added >= 1000: pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) if pickle_data_sent: data_points_sent += tuples_added logger.info( 'populate_metric_worker :: sent %s/%s of %s data points to Graphite via pickle for %s' % (str(tuples_added), str(data_points_sent), str(timeseries_length), metric)) sent_to_graphite += len( smallListOfMetricTuples) smallListOfMetricTuples = [] tuples_added = 0 else: logger.error( 'error :: populate_metric_worker :: failed to send %s data points to Graphite via pickle for %s' % (str(tuples_added), metric)) if smallListOfMetricTuples: tuples_to_send = len(smallListOfMetricTuples) pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) if pickle_data_sent: data_points_sent += tuples_to_send logger.info( 'populate_metric_worker :: sent the last %s/%s of %s data points to Graphite via pickle for %s' % (str(tuples_to_send), str(data_points_sent), str(timeseries_length), metric)) else: logger.error( 'error :: populate_metric_worker :: failed to send the last %s data points to Graphite via pickle for %s' % (str(tuples_to_send), metric)) logger.info( 'populate_metric_worker :: sent %s data points to Graphite for %s' % (str(sent_to_graphite), metric)) try: skyline_metric = '%s.datapoints_sent_to_graphite' % ( skyline_app_graphite_namespace) # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, float(sent_to_graphite), int(time())) send_graphite_metric(skyline_app, skyline_metric, float(sent_to_graphite)) logger.info( 'populate_metric_worker :: submitted %s to Graphite for %s' % (str(float(sent_to_graphite)), skyline_metric)) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to send metric data to Graphite for %s' % str(skyline_metric)) has_value = False if value == 0.0: has_value = True if value == 0: has_value = True if value: has_value = True if timestamp and has_value: try: # Update Redis flux key cache_key = 'flux.last.%s' % metric metric_data = [int(timestamp), float(value)] self.redis_conn.set(cache_key, str(metric_data)) logger.info( 'populate_metric_worker :: set the metric Redis key - %s - %s' % (cache_key, str(metric_data))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to set Redis key - %s - %s' % (cache_key, str(metric_data))) # Adding to the vista.fetcher.unique_metrics Redis set redis_set = 'vista.fetcher.unique_metrics' data = str(remote_target) try: self.redis_conn.sadd(redis_set, data) logger.info( 'populate_metric_worker :: added 
%s to Redis set %s' % (remote_target, redis_set)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to add %s to Redis set %s' % (str(data), str(redis_set))) end_populating = int(time()) seconds_to_run = end_populating - start_populating logger.info( 'populate_metric_worker :: %s populated to Graphite in %s seconds' % (metric, str(seconds_to_run)))
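# NOTE (illustrative sketch): populate_metric_worker above (and the flux
# worker later in this document) share a simple Redis key convention -
# 'flux.last.<metric>' holds the last submitted data point as a stringified
# list, e.g. "[1700000040, 1.0]" (or "[1700000040, None]" when the remote
# returned only nulls), and it is read back with literal_eval.  The helper
# names below are hypothetical; redis_conn is assumed to be a decoded
# connection such as the one returned by get_redis_conn_decoded.
from ast import literal_eval


def set_last_flux_datapoint(redis_conn, metric, timestamp, value):
    # Store the last known [timestamp, value] for the metric
    cache_key = 'flux.last.%s' % metric
    redis_conn.set(cache_key, str([int(timestamp), value]))


def get_last_flux_timestamp(redis_conn, metric):
    # Return the last known timestamp for the metric, or None if unknown
    cache_key = 'flux.last.%s' % metric
    raw = redis_conn.get(cache_key)
    if not raw:
        return None
    try:
        last_timestamp, _last_value = literal_eval(raw)
        return int(last_timestamp)
    except (ValueError, SyntaxError, TypeError):
        return None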
def alert_smtp(datapoint, metric_name, expiration_time, metric_trigger, algorithm): sender = settings.BOUNDARY_SMTP_OPTS['sender'] matched_namespaces = [] for namespace in settings.BOUNDARY_SMTP_OPTS['recipients']: CHECK_MATCH_PATTERN = namespace check_match_pattern = re.compile(CHECK_MATCH_PATTERN) pattern_match = check_match_pattern.match(metric_name) if pattern_match: matched_namespaces.append(namespace) matched_recipients = [] for namespace in matched_namespaces: for recipients in settings.BOUNDARY_SMTP_OPTS['recipients'][namespace]: matched_recipients.append(recipients) def unique_noHash(seq): seen = set() return [x for x in seq if str(x) not in seen and not seen.add(str(x))] recipients = unique_noHash(matched_recipients) # Backwards compatibility if type(recipients) is str: recipients = [recipients] # @added 20180524 - Task #2384: Change alerters to cc other recipients # The alerters did send an individual email to each recipient. This would be # more useful if one email was sent with the first smtp recipient being the # to recipient and the subsequent recipients were add in cc. primary_recipient = False cc_recipients = False if recipients: for i_recipient in recipients: if not primary_recipient: primary_recipient = str(i_recipient) if primary_recipient != i_recipient: if not cc_recipients: cc_recipients = str(i_recipient) else: new_cc_recipients = '%s,%s' % (str(cc_recipients), str(i_recipient)) cc_recipients = str(new_cc_recipients) logger.info( 'alert_smtp - will send to primary_recipient :: %s, cc_recipients :: %s' % (str(primary_recipient), str(cc_recipients))) alert_algo = str(algorithm) alert_context = alert_algo.upper() # @added 20191008 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings try: main_alert_title = settings.CUSTOM_ALERT_OPTS['main_alert_title'] except: main_alert_title = 'Skyline' try: app_alert_context = settings.CUSTOM_ALERT_OPTS[ 'boundary_alert_heading'] except: app_alert_context = 'Boundary' # @modified 20191002 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings # Use alert_context # unencoded_graph_title = 'Skyline Boundary - %s at %s hours - %s - %s' % ( # alert_context, graphite_previous_hours, metric_name, datapoint) unencoded_graph_title = '%s %s - %s at %s hours - %s - %s' % ( main_alert_title, app_alert_context, alert_context, graphite_previous_hours, metric_name, datapoint) # @added 20181126 - Task #2742: Update Boundary # Feature #2034: analyse_derivatives # Added deriative functions to convert the values of metrics strictly # increasing monotonically to their deriative products in alert graphs and # specify it in the graph_title known_derivative_metric = False try: # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Use get_redis_conn_decoded # if settings.REDIS_PASSWORD: # # @modified 20191022 - Bug #3266: py3 Redis binary objects not strings # # Branch #3262: py3 # # REDIS_ALERTER_CONN = redis.StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # REDIS_ALERTER_CONN = redis.StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True) # else: # # REDIS_ALERTER_CONN = redis.StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) # REDIS_ALERTER_CONN = redis.StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True) REDIS_ALERTER_CONN = get_redis_conn_decoded(skyline_app) except: 
logger.error('error :: alert_smtp - redis connection failed') # @modified 20191022 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 try: derivative_metrics = list( REDIS_ALERTER_CONN.smembers('derivative_metrics')) except: derivative_metrics = [] redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(metric_name)) if redis_metric_name in derivative_metrics: known_derivative_metric = True if known_derivative_metric: try: non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS except: non_derivative_monotonic_metrics = [] skip_derivative = in_list(redis_metric_name, non_derivative_monotonic_metrics) if skip_derivative: known_derivative_metric = False known_derivative_metric = is_derivative_metric(skyline_app, metric_name) if known_derivative_metric: # @modified 20191002 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings # unencoded_graph_title = 'Skyline Boundary - %s at %s hours - derivative graph - %s - %s' % ( # alert_context, graphite_previous_hours, metric_name, datapoint) unencoded_graph_title = '%s %s - %s at %s hours - derivative graph - %s - %s' % ( main_alert_title, app_alert_context, alert_context, graphite_previous_hours, metric_name, datapoint) graph_title_string = quote(unencoded_graph_title, safe='') graph_title = '&title=%s' % graph_title_string # @added 20181126 - Bug #2498: Incorrect scale in some graphs # Task #2742: Update Boundary # If -xhours is used the scale is incorrect if x hours > than first # retention period, passing from and until renders the graph with the # correct scale. graphite_port = '80' if settings.GRAPHITE_PORT != '': graphite_port = str(settings.GRAPHITE_PORT) until_timestamp = int(time()) from_seconds_ago = graphite_previous_hours * 3600 from_timestamp = until_timestamp - from_seconds_ago graphite_from = dt.datetime.fromtimestamp( int(from_timestamp)).strftime('%H:%M_%Y%m%d') logger.info('graphite_from - %s' % str(graphite_from)) graphite_until = dt.datetime.fromtimestamp( int(until_timestamp)).strftime('%H:%M_%Y%m%d') logger.info('graphite_until - %s' % str(graphite_until)) # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle # graphite_target = 'target=cactiStyle(%s)' graphite_target = 'target=cactiStyle(%s,%%27si%%27)' % metric_name if known_derivative_metric: # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle # graphite_target = 'target=cactiStyle(nonNegativeDerivative(%s))' graphite_target = 'target=cactiStyle(nonNegativeDerivative(%s),%%27si%%27)' % metric_name # @modified 20190520 - Branch #3002: docker # Use GRAPHITE_RENDER_URI # link = '%s://%s:%s/render/?from=%s&until=%s&%s%s%s&colorList=%s' % ( # settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port, # str(graphite_from), str(graphite_until), graphite_target, # settings.GRAPHITE_GRAPH_SETTINGS, graph_title, # graphite_graph_line_color) link = '%s://%s:%s/%s/?from=%s&until=%s&%s%s%s&colorList=%s' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port, settings.GRAPHITE_RENDER_URI, str(graphite_from), str(graphite_until), graphite_target, settings.GRAPHITE_GRAPH_SETTINGS, graph_title, graphite_graph_line_color) content_id = metric_name image_data = None image_file = '%s/%s.%s.%s.alert_smtp.png' % ( settings.SKYLINE_TMP_DIR, skyline_app, str(until_timestamp), metric_name) if settings.BOUNDARY_SMTP_OPTS.get('embed-images'): image_data = get_graphite_graph_image(skyline_app, link, image_file) if 
settings.BOUNDARY_SMTP_OPTS.get('embed-images_disabled3290'): # @modified 20191021 - Task #3290: Handle urllib2 in py3 # Branch #3262: py3 if python_version == 2: try: # @modified 20170913 - Task #2160: Test skyline with bandit # Added nosec to exclude from bandit tests # image_data = urllib2.urlopen(link).read() # nosec image_data = None except urllib2.URLError: image_data = None if python_version == 3: try: # image_data = urllib.request.urlopen(link).read() # nosec image_data = None except: logger.error(traceback.format_exc()) logger.error( 'error :: boundary_alerters :: alert_smtp :: failed to urlopen %s' % str(link)) image_data = None # If we failed to get the image or if it was explicitly disabled, # use the image URL instead of the content. if image_data is None: img_tag = '<img src="%s"/>' % link else: img_tag = '<img src="cid:%s"/>' % content_id # @modified 20191002 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings # body = '%s :: %s <br> Next alert in: %s seconds <br> skyline Boundary alert - %s <br><a href="%s">%s</a>' % ( # datapoint, metric_name, expiration_time, alert_context, link, img_tag) body = '%s :: %s <br> Next alert in: %s seconds <br> %s %s alert - %s <br><a href="%s">%s</a>' % ( main_alert_title, app_alert_context, datapoint, metric_name, expiration_time, alert_context, link, img_tag) # @modified 20180524 - Task #2384: Change alerters to cc other recipients # Do not send to each recipient, send to primary_recipient and cc the other # recipients, thereby sending only one email # for recipient in recipients: if primary_recipient: logger.info( 'alert_smtp - will send to primary_recipient :: %s, cc_recipients :: %s' % (str(primary_recipient), str(cc_recipients))) msg = MIMEMultipart('alternative') # @modified 20191002 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings # msg['Subject'] = '[Skyline alert] ' + 'Boundary ALERT - ' + alert_context + ' - ' + datapoint + ' - ' + metric_name msg['Subject'] = '[' + main_alert_title + ' alert] ' + app_alert_context + ' ALERT - ' + alert_context + ' - ' + datapoint + ' - ' + metric_name msg['From'] = sender # @modified 20180524 - Task #2384: Change alerters to cc other recipients # msg['To'] = recipient msg['To'] = primary_recipient # @added 20180524 - Task #2384: Change alerters to cc other recipients # Added Cc if cc_recipients: msg['Cc'] = cc_recipients msg.attach(MIMEText(body, 'html')) if image_data is not None: # msg_attachment = MIMEImage(image_data) fp = open(image_file, 'rb') msg_attachment = MIMEImage(fp.read()) fp.close() msg_attachment.add_header('Content-ID', '<%s>' % content_id) msg.attach(msg_attachment) s = SMTP('127.0.0.1') # @modified 20180524 - Task #2384: Change alerters to cc other recipients # Send to primary_recipient and cc_recipients # s.sendmail(sender, recipient, msg.as_string()) try: if cc_recipients: s.sendmail(sender, [primary_recipient, cc_recipients], msg.as_string()) else: s.sendmail(sender, primary_recipient, msg.as_string()) except: logger.error(traceback.format_exc()) logger.error( 'error :: alert_smtp - could not send email to primary_recipient :: %s, cc_recipients :: %s' % (str(primary_recipient), str(cc_recipients))) s.quit()
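# NOTE (illustrative sketch): alert_smtp above assembles a Graphite render URL
# from settings, wrapping the target in cactiStyle (and nonNegativeDerivative
# for known derivative metrics) and passing from/until so the graph scale is
# correct.  A minimal, hedged version of that URL construction follows; the
# function name and arguments are hypothetical and a Python 3 urllib is
# assumed for quote().
from urllib.parse import quote


def build_graphite_render_link(
        protocol, host, port, render_uri, metric_name, graphite_from,
        graphite_until, graph_title, graph_settings='', line_color='orange',
        derivative=False):
    # cactiStyle with the 'si' units parameter, URL-encoded as %27si%27
    target = 'target=cactiStyle(%s,%%27si%%27)' % metric_name
    if derivative:
        target = 'target=cactiStyle(nonNegativeDerivative(%s),%%27si%%27)' % metric_name
    title = '&title=%s' % quote(graph_title, safe='')
    return '%s://%s:%s/%s/?from=%s&until=%s&%s%s%s&colorList=%s' % (
        protocol, host, port, render_uri, graphite_from, graphite_until,
        target, graph_settings, title, line_color)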
def run(self): """ - Called when the process intializes. - Determine if Redis is up - Spawn a rolling process to do checks - Wait for the process to finish. - run_every 60 seconds """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('thunder/rolling :: starting %s/rolling' % skyline_app) try: SERVER_METRIC_PATH = '.%s' % settings.SERVER_METRICS_NAME if SERVER_METRIC_PATH == '.': SERVER_METRIC_PATH = '' except Exception as e: SERVER_METRIC_PATH = '' logger.warning( 'warning :: thunder/rolling :: settings.SERVER_METRICS_NAME is not declared in settings.py, defaults to \'\' - %s' % e) run_every = 60 while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except Exception as e: logger.error(traceback.format_exc()) logger.error( 'error :: thunder/rolling cannot connect to redis at socket path %s - %s' % (settings.REDIS_SOCKET_PATH, e)) sleep(10) try: self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) except Exception as e: logger.info(traceback.format_exc()) logger.error( 'error :: thunder/rolling cannot connect to get_redis_conn - %s' % e) continue # Report app up try: self.redis_conn.setex('thunder.rolling', 120, now) except Exception as e: logger.error(traceback.format_exc()) logger.error( 'error :: thunder/rolling :: could not update the Redis analyzer.thunder/rolling key - %s' % e) # Spawn processes pids = [] spawned_pids = [] pid_count = 0 try: p = Process(target=self.rolling_process, args=(0, )) pids.append(p) pid_count += 1 logger.info('thunder/rolling :: starting rolling_process') p.start() spawned_pids.append(p.pid) except Exception as e: logger.error(traceback.format_exc()) logger.error( 'error :: thunder/rolling :: failed to spawn process - %s' % e) # Self monitor processes and terminate if any rolling_process that # has run for longer than 180 seconds p_starts = time() while time() - p_starts <= run_every: if any(p.is_alive() for p in pids): # Just to avoid hogging the CPU sleep(.1) else: # All the processes are done, break now. time_to_run = time() - p_starts logger.info( 'thunder/rolling :: rolling_process completed in %.2f seconds' % (time_to_run)) break else: # We only enter this if we didn't 'break' above. logger.info( 'thunder/rolling :: timed out, killing rolling_process process' ) for p in pids: logger.info( 'thunder/rolling :: killing rolling_process process') p.terminate() logger.info( 'thunder/rolling :: killed rolling_process process') for p in pids: if p.is_alive(): try: logger.info( 'thunder/rolling :: stopping rolling_process - %s' % (str(p.is_alive()))) p.terminate() except Exception as e: logger.error(traceback.format_exc()) logger.error( 'error :: thunder/rolling :: failed to stop rolling_process - %s' % e) process_runtime = time() - now if process_runtime < run_every: sleep_for = (run_every - process_runtime) process_runtime_now = time() - now sleep_for = (run_every - process_runtime_now) logger.info( 'thunder/rolling :: sleeping for %.2f seconds due to low run time...' % sleep_for) sleep(sleep_for) try: del sleep_for except Exception as e: logger.error( 'error :: thunder/rolling :: failed to del sleep_for - %s' % e) try: del process_runtime except Exception as e: logger.error( 'error :: thunder/rolling :: failed to del process_runtime - %s' % e)
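# NOTE (illustrative sketch): the run method above follows a recurring Skyline
# pattern - spawn a worker process, poll it for up to run_every seconds and
# terminate it if it overruns.  A minimal standalone version of that pattern,
# with a hypothetical target function passed in by the caller:
from multiprocessing import Process
from time import sleep, time


def run_bounded(target, run_every=60, args=()):
    """Run target in a child process, terminating it if it exceeds run_every."""
    p = Process(target=target, args=args)
    p.start()
    started = time()
    while time() - started <= run_every:
        if p.is_alive():
            # Just to avoid hogging the CPU while waiting
            sleep(0.1)
        else:
            # The process completed within the allowed window
            return True
    # Timed out - terminate the straggler
    p.terminate()
    p.join()
    return False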
def prune_metrics_timestamp_hash_key(current_skyline_app, hash_key, older_than_timestamp, log=True):
    """
    Remove any entries from a metrics timestamp hash key older than the
    timestamp passed.

    :param current_skyline_app: the app calling the function
    :param hash_key: the metric:timestamp style Redis hash key
    :param older_than_timestamp: the unix timestamp
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type hash_key: str
    :type older_than_timestamp: int
    :type log: boolean
    :return: removed_count
    :rtype: int
    """
    removed_from_hash = 0
    function_str = 'metrics_manager :: functions.redis.prune_metrics_timestamp_hash_key'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None
    if log:
        current_logger.info(
            '%s :: pruning entries older than %s from Redis hash key %s' % (
                function_str, str(older_than_timestamp), hash_key))
    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to get Redis connection - %s' % (
                function_str, e))
        return removed_from_hash
    metrics_dict = {}
    try:
        metrics_dict = redis_conn_decoded.hgetall(hash_key)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to get Redis hash key %s - %s' % (
                function_str, hash_key, e))
    if not metrics_dict:
        return removed_from_hash
    # Remove entries older than older_than_timestamp
    metrics = list(metrics_dict.keys())
    for metric in metrics:
        try:
            timestamp = float(metrics_dict[metric])
            if int(timestamp) < older_than_timestamp:
                try:
                    redis_conn_decoded.hdel(hash_key, metric)
                    removed_from_hash += 1
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(traceback.format_exc())
                    current_logger.error(
                        'error :: %s :: failed to del %s from Redis hash key %s - %s' % (
                            function_str, metric, hash_key, e))
        except Exception as e:
            if not log:
                current_skyline_app_logger = current_skyline_app + 'Log'
                current_logger = logging.getLogger(current_skyline_app_logger)
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: %s :: failed to manage %s from Redis hash key %s, breaking out of loop - %s' % (
                    function_str, metric, hash_key, e))
            break
    if log:
        current_logger.info(
            '%s :: removed %s old entries from Redis hash key %s' % (
                function_str, str(removed_from_hash), hash_key))
    return removed_from_hash
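# NOTE (illustrative usage): a hedged example of calling the function above to
# prune entries older than one hour from the last-timestamp hash key that is
# also used elsewhere in this document
# ('analyzer.metrics.last_timeseries_timestamp'):
#
#     from time import time
#     removed = prune_metrics_timestamp_hash_key(
#         'analyzer', 'analyzer.metrics.last_timeseries_timestamp',
#         int(time()) - 3600, log=True)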
def get_top_level_namespaces(current_skyline_app, log=False):
    """
    Determine all top level parent namespaces and return the list.

    :param current_skyline_app: the app calling the function
    :param log: whether to log or not, optional, defaults to False
    :type current_skyline_app: str
    :type log: boolean
    :return: top_level_namespaces
    :rtype: list
    """
    top_level_namespaces = []
    function_str = 'functions.metrics.get_top_level_namespaces'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.info('%s :: %s :: determining top level namespaces' % (
            current_skyline_app, function_str))
    else:
        current_logger = None
    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: get_redis_conn_decoded failed - %s' % (
                current_skyline_app, function_str, e))
        return top_level_namespaces
    unique_base_names = []
    redis_key = 'aet.analyzer.unique_base_names'
    try:
        unique_base_names = list(redis_conn_decoded.smembers(redis_key))
        if unique_base_names:
            if log:
                current_logger.info('%s :: %s :: got %s unique_base_names' % (
                    current_skyline_app, function_str,
                    str(len(unique_base_names))))
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: failed to get Redis key %s - %s' % (
                current_skyline_app, function_str, redis_key, e))
    for base_name in unique_base_names:
        top_level_namespace = base_name.split('.')[0]
        if top_level_namespace:
            top_level_namespaces.append(top_level_namespace)
    if top_level_namespaces:
        top_level_namespaces = list(set(top_level_namespaces))
        if log:
            current_logger.info(
                '%s :: %s :: returning %s top level namespaces' % (
                    current_skyline_app, function_str,
                    str(len(top_level_namespaces))))
    return top_level_namespaces
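# NOTE (illustrative): the core of get_top_level_namespaces above is a split
# on the first '.' element followed by deduplication, for example:
def top_level_namespaces_from(base_names):
    # e.g. ['carbon.agents.a.cpuUsage', 'stats.web01.load'] -> ['carbon', 'stats']
    return sorted(set(
        base_name.split('.')[0] for base_name in base_names if base_name))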
def run(self): """ Called when the process intializes. """ logger.info('worker :: starting worker') last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error( 'worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Use get_redis_conn and get_redis_conn_decoded # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) if LOCAL_DEBUG: try: metric_data_queue_size = self.q.qsize() logger.info( 'worker :: debug :: flux.httpMetricDataQueue queue size - %s' % str(metric_data_queue_size)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue' ) metric_data = None try: # Get a metric from the queue with a 1 second timeout, each # metric item on the queue is a list e.g. # metric_data = [metricName, metricValue, metricTimestamp] metric_data = self.q.get(True, 1) except Empty: logger.info('worker :: queue is empty and timed out') sleep(1) except NotImplementedError: pass except KeyboardInterrupt: logger.info( 'worker :: server has been issued a user signal to terminate - KeyboardInterrupt' ) except SystemExit: logger.info('worker :: server was interrupted - SystemExit') except Exception as e: logger.error('error :: worker :: %s' % (str(e))) # @added 20200206 - Feature #3444: Allow flux to backfill # Added backfill backfill = False if metric_data: try: metric = str(metric_data[0]) value = float(metric_data[1]) timestamp = int(metric_data[2]) # @added 20200206 - Feature #3444: Allow flux to backfill # Added backfill backfill = int(metric_data[3]) if LOCAL_DEBUG: logger.info( 'worker :: debug :: queue item found - %s' % str(metric_data)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s' % str(metric_data)) continue if settings.FLUX_SEND_TO_CARBON: # Best effort de-duplicate the data valid_data = True # @added 20200818 - Feature #3694: flux - POST multiple metrics # Handle Redis and literal_eval separately redis_last_metric_data = None # @modified 20200206 - Feature #3444: Allow flux to backfill # Only check flux.last key if this is not backfill if not backfill: cache_key = 'flux.last.%s' % metric last_metric_timestamp = None try: # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # redis_last_metric_data = self.redis_conn.get(cache_key) redis_last_metric_data = self.redis_conn_decoded.get( cache_key) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key)) redis_last_metric_data = None # @modified 20200818 - Feature #3694: flux - POST multiple metrics # Handle Redis and literal_eval separately, only # literal_eval if Redis had data for the key if redis_last_metric_data: try: last_metric_data = literal_eval( 
redis_last_metric_data) last_metric_timestamp = int( last_metric_data[0]) if LOCAL_DEBUG: logger.info( 'worker :: debug :: last_metric_timestamp for %s from %s is %s' % (metric, str(cache_key), str(last_metric_timestamp))) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key)) last_metric_timestamp = False if last_metric_timestamp: if timestamp <= last_metric_timestamp: valid_data = False if LOCAL_DEBUG: logger.info( 'worker :: debug :: not valid data - the queue data timestamp %s is <= to the last_metric_timestamp %s for %s' % (str(timestamp), str(last_metric_timestamp), metric)) if valid_data: submittedToGraphite = False try: graphyte.send(metric, value, timestamp) submittedToGraphite = True logger.info( 'worker :: sent %s, %s, %s to Graphite' % (str(metric), str(value), str(timestamp))) metrics_sent_to_graphite += 1 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send metric data to Graphite for %s' % str(metric)) metric = None if submittedToGraphite: # Update the metric Redis flux key # @modified 20200206 - Feature #3444: Allow flux to backfill # Only update the flux.last key if this is not backfill if not backfill: metric_data = [timestamp, value] self.redis_conn.set(cache_key, str(metric_data)) # @added 20200213 - Bug #3448: Repeated airgapped_metrics else: # @added 20200213 - Bug #3448: Repeated airgapped_metrics # Add a flux.filled key to Redis with a expiry # set to FULL_DURATION so that Analyzer knows to # sort and deduplicate the Redis time series # data as carbon-relay will send it to Horizon # and the datapoints will be out of order in the # Redis key try: flux_filled_key = 'flux.filled.%s' % str( metric) self.redis_conn.setex( flux_filled_key, settings.FULL_DURATION, int(time())) logger.info('worker :: set Redis key %s' % (str(flux_filled_key))) except Exception as e: logger.error( 'error :: failed to could not set Redis flux.filled key: %s' % e) else: logger.info( 'worker :: discarded %s, %s, %s as a data point for %s has already been submitted to Graphite' % (str(metric), str(value), str(timestamp), str(timestamp))) else: logger.info( 'worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s' % (str(settings.FLUX_SEND_TO_CARBON), str(metric), str(value), str(timestamp))) if settings.FLUX_SEND_TO_STATSD: statsd_conn.incr(metric, value, timestamp) logger.info('worker sent %s, %s, %s to statsd' % (metric, str(value), str(timestamp))) time_now = int(time()) if (time_now - last_sent_to_graphite) >= 60: logger.info( 'worker :: metrics_sent_to_graphite in last 60 seconds - %s' % str(metrics_sent_to_graphite)) skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace try: # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now) send_graphite_metric(skyline_app, skyline_metric, metrics_sent_to_graphite) last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite)))
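# NOTE (illustrative sketch): when a queued metric is flagged as backfill the
# worker above skips the flux.last de-duplication check and instead sets a
# 'flux.filled.<metric>' key with an expiry of settings.FULL_DURATION, so that
# Analyzer knows the Redis time series may be out of order and needs to be
# sorted and de-duplicated.  A hedged, standalone version of that branch;
# full_duration is a plain argument here rather than the settings value.
from time import time


def mark_backfilled(redis_conn, metric, full_duration=86400):
    # Flag a backfilled metric so downstream consumers re-sort its data
    flux_filled_key = 'flux.filled.%s' % str(metric)
    redis_conn.setex(flux_filled_key, full_duration, int(time()))
    return flux_filled_key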
def get_metrics_timeseries( current_skyline_app, metrics_functions, from_timestamp, until_timestamp, log=True): """ Return dictionary of metrics with their timeseries as a list e.g. metrics_timeseries = { 'metric.1': { 'timeseries': [[ts, value], [ts, value], ..., [ts, value]], 'functions': 'nonNegativeDerivative', }, 'metric.2': { 'timeseries': [[ts, value], [ts, value], ..., [ts, value]], 'functions': None, }, 'metric.3': { 'timeseries': [[ts, value], [ts, value], ..., [ts, value]], 'functions': {'summarise': {'intervalString': '10min', 'func': 'sum'}, 'integral': None}, }, } The metrics_functions parameter dictionary allows for metrics and any functions to be applied to be specified e.g. metrics_functions = { 'metric.1': { 'functions': None, }, 'metric.2': { 'functions': None, }, 'metric.3': { 'functions': {'integral': None, 'summarize': {'intervalString': '10min', 'func': 'sum'}}, }, } Each metric can have one or multiple functions parsed for it using the functions key in the dictionary item. There is NO NEED to ever pass the nonNegativeDerivative as the function uses the normal derivative_metrics information to do that. functions are applied in the order in which they are passed e.g. target=integral(summarize(metric.3,"10min")) function parameters can be passed with the function as well or declared as None if there are no parameters required with the function. :param current_skyline_app: the app calling the function :param metrics_functions: the metric base_names and any functions to apply :param from_timestamp: the from unix timestamp :param until_timestamp: the until unix timestamp :param log: whether to log or not, optional, defaults to True :type current_skyline_app: str :type metrics_functions: dict :type log: boolean :return: dictionary of metric timeseries :rtype: dict """ metrics_timeseries = {} function_str = '%s :: functions.graphite.get_metrics_timeseries' % current_skyline_app if log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) else: current_logger = None # graphite URL graphite_port = '80' if settings.GRAPHITE_PORT != '': graphite_port = str(settings.GRAPHITE_PORT) if settings.GRAPHITE_PORT == '443' and settings.GRAPHITE_PROTOCOL == 'https': graphite_port = '' graphite_url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + graphite_port + '/' + settings.GRAPHITE_RENDER_URI + '?from=' + str(from_timestamp) + '&until=' + str(until_timestamp) + '&format=json' connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT) read_timeout = int(settings.GRAPHITE_READ_TIMEOUT) read_timeout = 30 use_timeout = (int(connect_timeout), int(read_timeout)) try: redis_conn_decoded = get_redis_conn_decoded(current_skyline_app) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error('error :: %s :: failed to connect to Redis - %s' % ( function_str, e)) derivative_metrics = [] try: # @modified 20211012 - Feature #4280: aet.metrics_manager.derivative_metrics Redis hash # derivative_metrics = list(redis_conn_decoded.smembers('derivative_metrics')) derivative_metrics = list(redis_conn_decoded.smembers('aet.metrics_manager.derivative_metrics')) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error('error :: %s :: failed to connect to Redis for smembers of 
derivative_metrics - %s' % ( function_str, e)) derivative_metrics = [] # Add nonNegativeDerivative tranform to derivative_metrics and then fetch # from Graphite in batches of MAX_GRAPHITE_TARGETS get_metrics_with_functions = {} for base_name in list(metrics_functions.keys()): redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, base_name) if redis_metric_name in derivative_metrics: get_metrics_with_functions[base_name] = metrics_functions[base_name] original_functions = metrics_functions[base_name]['functions'] if original_functions is not None: functions = {} functions['nonNegativeDerivative'] = None for function in list(original_functions.keys()): functions[function] = original_functions[function] else: functions = {'nonNegativeDerivative': None} get_metrics_with_functions[base_name]['functions'] = functions else: get_metrics_with_functions[base_name] = metrics_functions[base_name] metrics_list = list(get_metrics_with_functions.keys()) headers = {'Content-Type': 'application/x-www-form-urlencoded'} if metrics_list: metrics_to_get = [] while len(metrics_list) > 0: metrics_to_get = [] post_content = 'format=json&from=%s&until=%s' % (str(from_timestamp,), str(until_timestamp)) for index, metric in enumerate(metrics_list): if len(metrics_to_get) < MAX_GRAPHITE_TARGETS: metrics_to_get.append(metric) metrics_list.pop(index) else: break for base_name in metrics_to_get: functions = get_metrics_with_functions[base_name]['functions'] target = base_name if functions is not None: for function in list(functions.keys()): function_arguments = functions[function] if function_arguments is None: target = '%s(%s)' % (function, target) if isinstance(function_arguments, int): target = '%s(%s,%s)' % (function, target, function_arguments) if isinstance(function_arguments, str): target = '%s(%s,"%s")' % (function, target, function_arguments) if isinstance(function_arguments, dict): target = '%s(%s' % (function, target) for function_parmeter in list(function_arguments.keys()): function_parmeter_value = function_arguments[function_parmeter] if function_parmeter_value is None: target = str(target) if isinstance(function_parmeter_value, int): target = '%s,%s' % (target, function_parmeter_value) if isinstance(function_parmeter_value, str): target = '%s,"%s"' % (target, function_parmeter_value) target = '%s)' % target get_metrics_with_functions[base_name]['target'] = target post_content = '%s&target=%s' % (post_content, target) graphite_json_fetched = False try: r = requests.post(graphite_url, data=post_content, headers=headers, timeout=use_timeout) graphite_json_fetched = True except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error('error :: %s :: data retrieval from Graphite failed - %s' % ( function_str, e)) js = {} if graphite_json_fetched: try: js = r.json() except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error('error :: %s :: failed to parse retrieved json - %s' % ( function_str, e)) for item in js: data_error = None timeseries = None base_name = None try: target = item['target'] for metric_base_name in metrics_to_get: if metric_base_name in target: base_name = metric_base_name if not base_name: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error('error :: %s :: failed to 
determine base_name from get_metrics_with_functions[metric_base_name] with target: %s' % ( function_str, str(target))) continue datapoints = item['datapoints'] converted = [] for datapoint in datapoints: try: new_datapoint = [int(datapoint[1]), float(datapoint[0])] converted.append(new_datapoint) except Exception as e: data_error = e timeseries = converted except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error('error :: %s :: failed to parse data points from retrieved json data_error: %s - %s' % ( function_str, str(data_error), e)) if base_name: metrics_timeseries[base_name] = {} metrics_timeseries[base_name]['functions'] = get_metrics_with_functions[base_name]['functions'] metrics_timeseries[base_name]['timeseries'] = None if timeseries: metrics_timeseries[base_name]['timeseries'] = timeseries return metrics_timeseries
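# NOTE (illustrative sketch): get_metrics_timeseries above builds each
# Graphite target by wrapping the base_name in the requested functions in the
# order they are passed, so {'summarize': {'intervalString': '10min',
# 'func': 'sum'}, 'integral': None} becomes
# integral(summarize(metric.3,"10min","sum")).  A simplified version of that
# wrapping logic, reduced to the argument types handled above:
def build_graphite_target(base_name, functions=None):
    target = base_name
    for function, arguments in (functions or {}).items():
        if arguments is None:
            target = '%s(%s)' % (function, target)
        elif isinstance(arguments, int):
            target = '%s(%s,%s)' % (function, target, arguments)
        elif isinstance(arguments, str):
            target = '%s(%s,"%s")' % (function, target, arguments)
        elif isinstance(arguments, dict):
            # Append each parameter value, ints unquoted and strings quoted
            for value in arguments.values():
                if value is None:
                    continue
                if isinstance(value, int):
                    target = '%s,%s' % (target, value)
                else:
                    target = '%s,"%s"' % (target, value)
            target = '%s(%s)' % (function, target)
    return target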
def thunder_stale_metrics(current_skyline_app, log=True): """ Determine stale metrics in each top level namespace. :param current_skyline_app: the app calling the function :param log: whether to log or not, optional, defaults to True :type current_skyline_app: str :type log: boolean :return: (namespace_stale_metrics_dict, namespace_recovered_metrics_dict) :rtype: tuple """ if current_skyline_app == 'analyzer': function_str = 'metrics_manager :: functions.thunder.thunder_stale_metrics' if current_skyline_app == 'webapp': function_str = 'functions.thunder.thunder_stale_metrics' if log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) else: current_logger = None def get_sparsity(base_name): """ Determine the metric sparsity """ success = True sparsity = None timeseries = [] try: timeseries = get_metric_timeseries(current_skyline_app, base_name) except Exception as e: success = e sparsity = None if timeseries: try: sparsity = determine_data_sparsity(current_skyline_app, timeseries, None, False) except Exception as e: success = e sparsity = None else: success = 'no timeseries data' sparsity = None return success, sparsity now = int(time()) namespace_stale_metrics_dict = {} namespace_recovered_metrics_dict = {} alerted_on_stale_metrics_dict = {} metrics_last_timestamp_dict = {} hash_key = 'analyzer.metrics.last_timeseries_timestamp' try: redis_conn_decoded = get_redis_conn_decoded(current_skyline_app) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get Redis connection - %s' % (function_str, e)) return namespace_stale_metrics_dict try: metrics_last_timestamp_dict = redis_conn_decoded.hgetall(hash_key) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get Redis hash key %s - %s' % (function_str, hash_key, e)) if not metrics_last_timestamp_dict: return namespace_stale_metrics_dict # Do not send stale alerts for any identified sparsely populated metrics metrics_sparsity_dict = {} data_sparsity_hash_key = 'analyzer.metrics_manager.hash_key.metrics_data_sparsity' try: metrics_sparsity_dict = redis_conn_decoded.hgetall( data_sparsity_hash_key) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get Redis hash key %s - %s' % (function_str, data_sparsity_hash_key, e)) sparsely_populated_metrics = [] metrics_of_known_sparsity = [] base_names_of_known_sparsity = [] if metrics_sparsity_dict: metrics_of_known_sparsity = list(metrics_sparsity_dict.keys()) for metric_name in metrics_of_known_sparsity: metric_name = str(metric_name) if metric_name.startswith(settings.FULL_NAMESPACE): base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1) else: base_name = metric_name base_names_of_known_sparsity.append(base_name) sparsity = metrics_sparsity_dict[metric_name] if float(sparsity) < settings.SPARSELY_POPULATED_PERCENTAGE: sparsely_populated_metrics.append(base_name) del metrics_sparsity_dict # @added 20210617 - Feature #4144: webapp - stale_metrics API 
endpoint # On webapp report on sparsely populated metrics as well exclude_sparsely_populated = False if current_skyline_app == 'webapp': try: exclude_sparsely_populated = redis_conn_decoded.get( 'webapp.stale_metrics.exclude_sparsely_populated') if log: current_logger.info( '%s :: Redis key webapp.stale_metrics.exclude_sparsely_populated - %s' % (function_str, str(exclude_sparsely_populated))) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get Redis key webapp.stale_metrics.exclude_sparsely_populated - %s' % (function_str, e)) if not exclude_sparsely_populated: sparsely_populated_metrics = [] # Get all alerted on stale metrics alerted_on_stale_metrics_hash_key = 'thunder.alerted_on.stale_metrics' try: alerted_on_stale_metrics_dict = redis_conn_decoded.hgetall( alerted_on_stale_metrics_hash_key) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get Redis hash key %s - %s' % (function_str, alerted_on_stale_metrics_hash_key, e)) alerted_on_stale_metrics = [] if alerted_on_stale_metrics_dict: alerted_on_stale_metrics = list(alerted_on_stale_metrics_dict.keys()) # @added 20210617 - Feature #4144: webapp - stale_metrics API endpoint # On webapp report on alerted on metrics as well if current_skyline_app == 'webapp': alerted_on_stale_metrics = [] # Get all the known custom stale periods custom_stale_metrics_dict = {} custom_stale_metrics_hash_key = 'analyzer.metrics_manager.custom_stale_periods' try: custom_stale_metrics_dict = redis_conn_decoded.hgetall( custom_stale_metrics_hash_key) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to create custom_stale_metrics_dict from Redis hash key %s - %s' % (function_str, custom_stale_metrics_hash_key, e)) custom_stale_metrics = [] if custom_stale_metrics_dict: custom_stale_metrics = list(custom_stale_metrics_dict.keys()) metrics_last_timestamps = [] parent_namespaces = [] unique_base_names = list(metrics_last_timestamp_dict.keys()) last_traceback = None last_error = None error_count = 0 for base_name in unique_base_names: try: parent_namespace = base_name.split('.')[0] metrics_last_timestamps.append( [base_name, int(metrics_last_timestamp_dict[base_name])]) if len(parent_namespace) > 0: parent_namespaces.append(parent_namespace) except Exception as e: last_traceback = traceback.format_exc() last_error = e error_count += 1 if last_error: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error( 'error :: %s :: errors %s encounterd while creating metrics_last_timestamps, last reported error - %s' % (function_str, str(error_count), last_error)) current_logger.error('error :: %s :: last reported Traceback' % (function_str)) current_logger.error('%s' % (str(last_traceback))) total_stale_metrics_count = 0 total_recovered_metrics_count = 0 test_stale_metrics_namespaces = [] # @added 20220208 - Feature #4376: webapp - update_external_settings # If alert_on_stale_metrics is not 
enabled for an external_settings namespace # do not alert do_not_alert_on_namespaces = [] parent_namespaces = list(set(parent_namespaces)) # @added 20210620 - Branch #1444: thunder # Feature #4076: CUSTOM_STALE_PERIOD # Handle multi level namespaces external_settings = {} try: external_settings = get_external_settings(current_skyline_app) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: get_external_settings failed - %s' % (function_str, e)) external_parent_namespaces_stale_periods = {} if external_settings: for config_id in list(external_settings.keys()): alert_on_stale_metrics = False try: alert_on_stale_metrics = external_settings[config_id][ 'alert_on_stale_metrics']['enabled'] except KeyError: alert_on_stale_metrics = False stale_metrics_stale_period = settings.STALE_PERIOD if alert_on_stale_metrics: try: stale_metrics_stale_period = external_settings[config_id][ 'alert_on_stale_metrics']['stale_period'] except KeyError: stale_metrics_stale_period = settings.STALE_PERIOD namespace = None if stale_metrics_stale_period: try: namespace = external_settings[config_id]['namespace'] except KeyError: namespace = False # @added 20220208 - Feature #4376: webapp - update_external_settings # If alert_on_stale_metrics is not enabled do not alert if not alert_on_stale_metrics: do_not_alert_on_namespaces.append(namespace) namespace = None try: expiry = external_settings[config_id][ 'alert_on_stale_metrics']['expiry'] except KeyError: expiry = 1800 if namespace and alert_on_stale_metrics and expiry: external_parent_namespaces_stale_periods[parent_namespace] = {} external_parent_namespaces_stale_periods[parent_namespace][ 'stale_period'] = int(stale_metrics_stale_period) external_parent_namespaces_stale_periods[parent_namespace][ 'expiry'] = int(expiry) external_parent_namespaces = [] if external_parent_namespaces: # external_parent_namespaces = list(external_parent_namespaces.keys()) external_parent_namespaces = list( external_parent_namespaces_stale_periods.keys()) parent_namespace_metrics_processed = [] custom_stale_period_namespaces = [] # Sort the list by the namespaces with the most elements to the least as # first match wins if settings.CUSTOM_STALE_PERIOD: custom_stale_period_namespaces = list( settings.CUSTOM_STALE_PERIOD.keys()) custom_stale_period_namespaces_elements_list = [] for custom_stale_period_namespace in custom_stale_period_namespaces: namespace_elements = len(custom_stale_period_namespace.split('.')) custom_stale_period_namespaces_elements_list.append( [custom_stale_period_namespace, namespace_elements]) sorted_custom_stale_period_namespaces = sorted( custom_stale_period_namespaces_elements_list, key=lambda x: (x[1]), reverse=True) if sorted_custom_stale_period_namespaces: custom_stale_period_namespaces = [ x[0] for x in sorted_custom_stale_period_namespaces ] # Order by setting priority parent_namespaces = external_parent_namespaces + custom_stale_period_namespaces + parent_namespaces for parent_namespace in parent_namespaces: # @added 20220208 - Feature #4376: webapp - update_external_settings # If alert_on_stale_metrics is not enabled do not alert if parent_namespace in do_not_alert_on_namespaces: continue parent_namespace_stale_metrics_count = 0 namespace_stale_metrics_dict[parent_namespace] = {} namespace_stale_metrics_dict[parent_namespace]['metrics'] = {} 
namespace_recovered_metrics_dict[parent_namespace] = {} namespace_recovered_metrics_dict[parent_namespace]['metrics'] = {} # metrics that are in the parent namespace parent_namespace_metrics = [ item for item in metrics_last_timestamps if str(item[0]).startswith(parent_namespace) ] unfiltered_parent_namespace_metrics_count = len( parent_namespace_metrics) # @added 20210620 - Branch #1444: thunder # Feature #4076: CUSTOM_STALE_PERIOD # Handle multi level namespaces by filtering out metrics that have # already been processed in a longer parent_namespace parent_namespace_metrics = [ item for item in parent_namespace_metrics if str(item[0]) not in parent_namespace_metrics_processed ] if parent_namespace_metrics: parent_namespace_metric_names = [ item[0] for item in parent_namespace_metrics ] parent_namespace_metrics_processed = parent_namespace_metrics_processed + parent_namespace_metric_names if log: current_logger.info( '%s :: checking stale metrics in the \'%s.\' namespace on %s metrics (of %s filtered by processed)' % (function_str, parent_namespace, str(len(parent_namespace_metrics)), str(unfiltered_parent_namespace_metrics_count))) # Now check metrics that are default STALE_PERIOD metrics and are not # CUSTOM_STALE_PERIOD metrics last_error = None stale_period_parent_namespace_metrics = [ item for item in parent_namespace_metrics if item[0] not in custom_stale_metrics ] for base_name, timestamp in stale_period_parent_namespace_metrics: if base_name in sparsely_populated_metrics: continue try: # Only alert once on stale metrics and identify as recovered if base_name in alerted_on_stale_metrics: if int(timestamp) > (now - settings.STALE_PERIOD): namespace_recovered_metrics_dict[parent_namespace][ 'metrics'][base_name] = int(timestamp) total_recovered_metrics_count += 1 else: continue if int(timestamp) < (now - settings.STALE_PERIOD): # Determine the metric sparsity if it is not known if base_name not in base_names_of_known_sparsity: success = None sparsity = None try: success, sparsity = get_sparsity(base_name) if sparsity is not None: if float( sparsity ) < settings.SPARSELY_POPULATED_PERCENTAGE: if current_skyline_app == 'analyzer': sparsely_populated_metrics.append( base_name) continue if current_skyline_app == 'webapp' and exclude_sparsely_populated: sparsely_populated_metrics.append( base_name) continue else: if success is not True: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger( current_skyline_app_logger) current_logger.error( 'error :: %s :: get_sparsity failed for %s - %s' % (function_str, base_name, str(success))) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger( current_skyline_app_logger) current_logger.error( 'error :: %s :: get_sparsity failed for %s - %s' % (function_str, base_name, e)) namespace_stale_metrics_dict[parent_namespace]['metrics'][ base_name] = timestamp total_stale_metrics_count += 1 parent_namespace_stale_metrics_count += 1 except Exception as e: last_traceback = traceback.format_exc() last_error = e error_count += 1 if last_error: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error( 'error :: %s :: errors %s encounterd while determining stale_period_parent_namespace_metrics, last reported error - %s' % (function_str, str(error_count), last_error)) current_logger.error('error :: %s :: last reported Traceback' % (function_str)) 
current_logger.error('%s' % (str(last_traceback))) # Now check metrics that are CUSTOM_STALE_PERIOD metrics custom_stale_period_parent_namespace_metrics = [ item for item in parent_namespace_metrics if item[0] in custom_stale_metrics ] last_error = None for base_name, timestamp in custom_stale_period_parent_namespace_metrics: if base_name in sparsely_populated_metrics: continue try: # Only alert once on stale metrics and identify as recovered if base_name in alerted_on_stale_metrics: if int(timestamp) > ( now - int(custom_stale_metrics_dict[base_name])): namespace_recovered_metrics_dict[parent_namespace][ 'metrics'][base_name] = int(timestamp) total_recovered_metrics_count += 1 else: continue if int(timestamp) < ( now - int(custom_stale_metrics_dict[base_name])): # Determine the metric sparsity if it is not known if base_name not in base_names_of_known_sparsity: success = None sparsity = None try: success, sparsity = get_sparsity(base_name) if sparsity is not None: if float( sparsity ) < settings.SPARSELY_POPULATED_PERCENTAGE: # @modified 20210617 - Feature #4144: webapp - stale_metrics API endpoint # On webapp report on sparsely_populated_metrics on metrics as well if current_skyline_app == 'analyzer': sparsely_populated_metrics.append( base_name) continue if current_skyline_app == 'webapp' and exclude_sparsely_populated: sparsely_populated_metrics.append( base_name) continue else: if success is not True: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger( current_skyline_app_logger) current_logger.error( 'error :: %s :: get_sparsity failed for %s - %s' % (function_str, base_name, str(success))) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger( current_skyline_app_logger) current_logger.error( 'error :: %s :: get_sparsity failed for %s - %s' % (function_str, base_name, e)) namespace_stale_metrics_dict[parent_namespace]['metrics'][ base_name] = timestamp total_stale_metrics_count += 1 parent_namespace_stale_metrics_count += 1 except Exception as e: last_traceback = traceback.format_exc() last_error = e error_count += 1 if last_error: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) current_logger.error( 'error :: %s :: errors %s encounterd while determining custom_stale_period_parent_namespace_metrics, last reported error - %s' % (function_str, str(error_count), last_error)) current_logger.error('error :: %s :: last reported Traceback' % (function_str)) current_logger.error('%s' % (str(last_traceback))) if parent_namespace_stale_metrics_count: if log: current_logger.info( '%s :: %s stale metrics found for %s' % (function_str, str(parent_namespace_stale_metrics_count), parent_namespace)) # Allow to test if not parent_namespace_stale_metrics_count: # Allow to test thunder_test_alert_key_data = None thunder_test_alert_key = 'thunder.test.alert.stale_metrics.%s' % parent_namespace try: thunder_test_alert_key_data = redis_conn_decoded.get( thunder_test_alert_key) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger( current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get Redis key %s - %s' % (function_str, thunder_test_alert_key, e)) if thunder_test_alert_key_data: try: thunder_test_data = literal_eval( thunder_test_alert_key_data) stale_period 
= thunder_test_data['stale_period'] expiry = thunder_test_data['expiry'] stale_count = thunder_test_data['stale_count'] if log: current_logger.info( '%s :: THUNDER STALE_METRICS TEST REQUESTED FOR - \'%s.\' namespace using TEST stale_period of %s and expiry of %s for %s metrics' % (function_str, parent_namespace, str(stale_period), str(expiry), str(stale_count))) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger( current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get stale_period, expiry and stale_count for Redis key %s - %s' % (function_str, thunder_test_alert_key, e)) for base_name, timestamp in parent_namespace_metrics[ -stale_count:]: namespace_stale_metrics_dict[parent_namespace]['metrics'][ base_name] = timestamp total_stale_metrics_count += 1 parent_namespace_stale_metrics_count += 1 test_stale_metrics_count = len( list(namespace_stale_metrics_dict[parent_namespace] ['metrics'].keys())) test_stale_metrics_namespaces.append(parent_namespace) if log: current_logger.info( '%s :: THUNDER STALE_METRICS TEST REQUESTED FOR - \'%s.\' namespace sending %s TEST stale_metrics' % (function_str, parent_namespace, str(test_stale_metrics_count))) if log: current_logger.info('%s :: total stale metrics found - %s' % (function_str, str(total_stale_metrics_count))) current_logger.info('%s :: total recovered stale metrics - %s' % (function_str, str(total_recovered_metrics_count))) current_logger.info( '%s :: skipped checking %s sparsely_populated_metrics' % (function_str, str(len(sparsely_populated_metrics)))) # @modified 20210617 - Feature #4144: webapp - stale_metrics API endpoint # On webapp request do not send thunder events # if namespace_stale_metrics_dict: if namespace_stale_metrics_dict and current_skyline_app == 'analyzer': parent_namespaces = list(namespace_stale_metrics_dict.keys()) for parent_namespace in parent_namespaces: stale_metrics = list(namespace_stale_metrics_dict[parent_namespace] ['metrics'].keys()) if len(stale_metrics) > 0: # Check if there is a thunder.alert.no_data Redis key for the # namespace and skip if there is thunder_no_data_alert_key_exists = False thunder_no_data_alert_key = 'thunder.alert.no_data.%s' % parent_namespace try: thunder_no_data_alert_key_exists = redis_conn_decoded.get( thunder_no_data_alert_key) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger( current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get Redis key %s - %s' % (function_str, thunder_no_data_alert_key, e)) if thunder_no_data_alert_key_exists: if log: current_logger.info( '%s :: skipping sending thunder event for stale metrics on %s as thunder no_data alert key exists for the namespace' % (function_str, parent_namespace)) continue # Check if there is a thunder.alert.analyzer.up.alert Redis key for the # namespace and skip if there is thunder_analyzer_alert_key_exists = False thunder_analyzer_alert_key = 'thunder.alert.analyzer.up.alert' try: thunder_analyzer_alert_key_exists = redis_conn_decoded.get( thunder_analyzer_alert_key) except Exception as e: if not log: current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger( current_skyline_app_logger) current_logger.error(traceback.format_exc()) current_logger.error( 'error :: %s :: failed to get Redis key %s - %s' % 
    # @modified 20210617 - Feature #4144: webapp - stale_metrics API endpoint
    # On webapp request do not send thunder events
    # if namespace_stale_metrics_dict:
    if namespace_stale_metrics_dict and current_skyline_app == 'analyzer':
        parent_namespaces = list(namespace_stale_metrics_dict.keys())
        for parent_namespace in parent_namespaces:
            stale_metrics = list(
                namespace_stale_metrics_dict[parent_namespace]['metrics'].keys())
            if len(stale_metrics) > 0:
                # Check if there is a thunder.alert.no_data Redis key for the
                # namespace and skip if there is
                thunder_no_data_alert_key_exists = False
                thunder_no_data_alert_key = 'thunder.alert.no_data.%s' % parent_namespace
                try:
                    thunder_no_data_alert_key_exists = redis_conn_decoded.get(
                        thunder_no_data_alert_key)
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(traceback.format_exc())
                    current_logger.error(
                        'error :: %s :: failed to get Redis key %s - %s' % (
                            function_str, thunder_no_data_alert_key, e))
                if thunder_no_data_alert_key_exists:
                    if log:
                        current_logger.info(
                            '%s :: skipping sending thunder event for stale metrics on %s as thunder no_data alert key exists for the namespace' % (
                                function_str, parent_namespace))
                    continue
                # Check if there is a thunder.alert.analyzer.up.alert Redis key
                # and skip if there is
                thunder_analyzer_alert_key_exists = False
                thunder_analyzer_alert_key = 'thunder.alert.analyzer.up.alert'
                try:
                    thunder_analyzer_alert_key_exists = redis_conn_decoded.get(
                        thunder_analyzer_alert_key)
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(traceback.format_exc())
                    current_logger.error(
                        'error :: %s :: failed to get Redis key %s - %s' % (
                            function_str, thunder_analyzer_alert_key, e))
                if thunder_analyzer_alert_key_exists:
                    if log:
                        current_logger.info(
                            '%s :: skipping sending thunder event for stale metrics on %s as thunder analyzer alert key exists' % (
                                function_str, parent_namespace))
                    continue
                level = 'alert'
                event_type = 'stale_metrics'
                message = '%s - %s - no new data for %s metrics' % (
                    level, parent_namespace, str(len(stale_metrics)))
                status = 'not receiving data for some metrics'
                if parent_namespace in test_stale_metrics_namespaces:
                    message = '%s - %s - no new data for %s metrics - TEST' % (
                        level, parent_namespace, str(len(stale_metrics)))
                    status = 'not receiving data for some metrics - TEST'
                thunder_event = {
                    'level': level,
                    'event_type': event_type,
                    'message': message,
                    'app': current_skyline_app,
                    'metric': None,
                    'source': current_skyline_app,
                    'timestamp': time(),
                    'expiry': settings.STALE_PERIOD,
                    'data': {
                        'namespace': parent_namespace,
                        'stale_metrics': stale_metrics,
                        'status': status,
                    },
                }
                submitted = False
                try:
                    submitted = thunder_send_event(
                        current_skyline_app, thunder_event, log=True)
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(
                        'error :: %s :: error encountered with thunder_send_event - %s' % (
                            function_str, e))
                if submitted:
                    if log:
                        current_logger.info(
                            '%s :: sent thunder event for %s stale metrics on namespace %s' % (
                                function_str, str(len(stale_metrics)),
                                parent_namespace))

    # @modified 20210617 - Feature #4144: webapp - stale_metrics API endpoint
    # On webapp request do not send thunder events
    # if namespace_recovered_metrics_dict and total_recovered_metrics_count:
    if namespace_recovered_metrics_dict and total_recovered_metrics_count and current_skyline_app == 'analyzer':
        parent_namespaces = list(namespace_recovered_metrics_dict.keys())
        for parent_namespace in parent_namespaces:
            stale_metrics = list(
                namespace_recovered_metrics_dict[parent_namespace]['metrics'].keys())
            if len(stale_metrics) > 0:
                level = 'notice'
                event_type = 'stale_metrics'
                message = '%s - %s - new data for %s metrics' % (
                    level, parent_namespace, str(len(stale_metrics)))
                status = 'recovered'
                thunder_event = {
                    'level': level,
                    'event_type': event_type,
                    'message': message,
                    'app': current_skyline_app,
                    'metric': None,
                    'source': current_skyline_app,
                    'timestamp': time(),
                    'expiry': 59,
                    'data': {
                        'namespace': parent_namespace,
                        'stale_metrics': stale_metrics,
                        'status': status,
                    },
                }
                submitted = False
                try:
                    submitted = thunder_send_event(
                        current_skyline_app, thunder_event, log=True)
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(
                        'error :: %s :: error encountered with thunder_send_event - %s' % (
                            function_str, e))
                if submitted:
                    if log:
                        current_logger.info(
                            '%s :: sent thunder event for %s recovered metrics on namespace %s' % (
                                function_str, str(len(stale_metrics)),
                                parent_namespace))

    return namespace_stale_metrics_dict, namespace_recovered_metrics_dict
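# Illustrative aside (not part of the original module): the thunder events
# submitted above are plain dicts with a fixed set of keys.  The helper below
# is a hedged sketch of that payload shape only; its name and the default
# expiry of 3600 seconds are assumptions (the code above uses
# settings.STALE_PERIOD for alerts and 59 for recovery notices), and it is not
# used by the code above or below.
def _sketch_build_stale_metrics_thunder_event(
        app, parent_namespace, stale_metrics, level='alert',
        status='not receiving data for some metrics', expiry=3600):
    """
    Build a dict in the same shape as the stale_metrics thunder_event
    constructed above, suitable for passing to thunder_send_event.
    """
    from time import time
    return {
        'level': level,
        'event_type': 'stale_metrics',
        'message': '%s - %s - no new data for %s metrics' % (
            level, parent_namespace, str(len(stale_metrics))),
        'app': app,
        'metric': None,
        'source': app,
        'timestamp': time(),
        'expiry': expiry,
        'data': {
            'namespace': parent_namespace,
            'stale_metrics': stale_metrics,
            'status': status,
        },
    }
# Example call (values hypothetical):
# _sketch_build_stale_metrics_thunder_event('analyzer', 'telegraf', ['telegraf.host1.cpu'])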
# @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
# @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
#                      Branch #3262: py3
# Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the
# bytes types need to be decoded as utf-8 to str
# if settings.REDIS_PASSWORD:
#     redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
# else:
#     redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
# @added 20191030 - Bug #3266: py3 Redis binary objects not strings
#                   Branch #3262: py3
# Added single functions to deal with the Redis connection and the
# charset='utf-8', decode_responses=True arguments required in py3
redis_conn = get_redis_conn(skyline_app)
redis_conn_decoded = get_redis_conn_decoded(skyline_app)


def get_anomaly(request_type):
    """
    Query the database for the anomaly details
    """
    logger = logging.getLogger(skyline_app_logger)

    if isinstance(request_type, int):
        latest = False
    else:
        latest = True

    if latest: