def thunder_send_event(current_skyline_app, event, log=True):
    """
    Add an event to the thunder.events Redis set or the thunder check dir if
    Redis is not available.

    :param current_skyline_app: the app calling the function
    :param event: the event data
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type event: dict
    :type log: boolean
    :return: submitted
    :rtype: boolean

    """
    function_str = 'functions.thunder.thunder_send_event'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    submitted = 0
    try:
        redis_conn = get_redis_conn(current_skyline_app)
        submitted = redis_conn.sadd('thunder.events', str(event))
        if submitted:
            return True
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to add %s to thunder.events Redis set - %s' % (
                function_str, str(event), e))

    # If the thunder event was not added to the Redis set, create the
    # event_file
    if not current_logger:
        # A logger is needed from this point on, even when log=False
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    if not path.exists(THUNDER_EVENTS_DIR):
        mkdir_p(THUNDER_EVENTS_DIR)
        current_logger.info('created dir - %s' % THUNDER_EVENTS_DIR)
    event_file = '%s/%s.thunder.event.dict' % (THUNDER_EVENTS_DIR, str(time()))
    try:
        write_data_to_file(current_skyline_app, event_file, 'w', str(event))
        current_logger.info('added thunder event file - %s' % event_file)
        submitted = True
    except Exception as e:
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: failed to add thunder event file - %s - %s' % (
                event_file, e))
        submitted = False
    return submitted
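
# A minimal usage sketch of thunder_send_event (not part of the original
# module).  The event dict keys below are illustrative assumptions only, not
# a documented schema; thunder consumers define the keys they expect.
if __name__ == '__main__':
    from time import time
    example_event = {
        'level': 'alert',                   # hypothetical key
        'event_type': 'redis_unavailable',  # hypothetical key
        'app': 'analyzer',                  # hypothetical key
        'timestamp': time(),                # hypothetical key
    }
    if thunder_send_event('webapp', example_event, log=True):
        print('thunder event submitted')
    else:
        # Neither the thunder.events Redis set nor a check file in
        # THUNDER_EVENTS_DIR could be written
        print('thunder event could not be submitted')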
def run_algorithms(
        timeseries, timeseries_name, end_timestamp, full_duration,
        timeseries_file, skyline_app, algorithms):
    """
    Iteratively run algorithms.
    """

    results_dir = os.path.dirname(timeseries_file)

    if not os.path.exists(results_dir):
        os.makedirs(results_dir, mode=0o755)

    start_analysis = int(time.time())

    triggered_algorithms = []
    anomalous = False

    check_algorithms = []
    if str(algorithms) == "['all']":
        if skyline_app == 'analyzer':
            check_algorithms = ALGORITHMS
            logger.info('check_algorithms for analyzer - %s' % (str(check_algorithms)))
        if skyline_app == 'mirage':
            check_algorithms = MIRAGE_ALGORITHMS
            logger.info('check_algorithms for mirage - %s' % (str(check_algorithms)))
        if skyline_app == 'boundary':
            check_algorithms = algorithms
            logger.info('check_algorithms for boundary - %s' % (str(check_algorithms)))
        if skyline_app == 'crucible':
            ALGORITHMS.append('detect_drop_off_cliff')
            check_algorithms = ALGORITHMS
            logger.info('check_algorithms for crucible - %s' % (str(check_algorithms)))
    else:
        check_algorithms = algorithms
        logger.info('check_algorithms specified - %s' % (str(check_algorithms)))

    if not check_algorithms:
        logger.info('check_algorithms unknown - %s' % (str(check_algorithms)))
        ALGORITHMS.append('detect_drop_off_cliff')
        check_algorithms = ALGORITHMS
        logger.info('check_algorithms - %s' % (str(check_algorithms)))

    logger.info('checking algorithms - %s' % (str(check_algorithms)))

    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
    # Plot Skyline anomalies if CONSENSUS is achieved
    anomalies = []

    for algorithm in check_algorithms:
        detected = ''
        try:
            x_vals = np.arange(len(timeseries))
            y_vals = np.array([y[1] for y in timeseries])
            # Match default graphite graph size
            plt.figure(figsize=(5.86, 3.08), dpi=100)
            plt.plot(x_vals, y_vals)

            # Start a couple of datapoints in for the tail average
            for index in range(10, len(timeseries)):
                sliced = timeseries[:index]
                anomaly = globals()[algorithm](sliced, end_timestamp, full_duration)

                # Point out the datapoint if it's anomalous
                if anomaly:
                    plt.plot([index], [sliced[-1][1]], 'ro')
                    detected = "DETECTED"
                    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
                    # Add the anomaly to the anomalies list to plot Skyline
                    # anomalies if CONSENSUS is achieved
                    anomalies.append([sliced[-1][0], sliced[-1][1], algorithm])

            if detected == "DETECTED":
                results_filename = join(results_dir + "/" + algorithm + "." + detected + ".png")
                # logger.info('ANOMALY DETECTED :: %s' % (algorithm))
                anomalous = True
                triggered_algorithms.append(algorithm)
            else:
                results_filename = join(results_dir + "/" + algorithm + ".png")

            plt.savefig(results_filename, dpi=100)
            # logger.info('%s :: %s' % (algorithm, results_filename))
            if python_version == 2:
                os.chmod(results_filename, 0o644)
            if python_version == 3:
                os.chmod(results_filename, mode=0o644)
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.info(
                'info :: error thrown in algorithm running and plotting - %s' % (
                    str(algorithm)))

    end_analysis = int(time.time())
    # @modified 20160814 - pyflaked
    # seconds_to_run = end_analysis - start_analysis
    # logger.info(
    #     'analysis of %s at a full duration of %s took %s seconds' %
    #     (timeseries_name, str(full_duration), str(seconds_to_run)))

    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
    # Plot Skyline anomalies where CONSENSUS is achieved and create the file
    # resources skyline.anomalies_score.txt and skyline.anomalies.csv
    anomalies_score = []
    if anomalies:
        for ts, value, algo in anomalies:
            processed = False
            algorithms_triggered = []
            if anomalies_score:
                for i in anomalies_score:
                    if i[0] == ts:
                        processed = True
                        continue
            if processed:
                continue
            for w_ts, w_value, w_algo in anomalies:
                if w_ts == ts:
                    algorithms_triggered.append(w_algo)
            if algorithms_triggered:
                consensus = len(algorithms_triggered)
                anomalies_score.append([ts, value, consensus, algorithms_triggered])

        try:
            logger.info('info :: plotting skyline.consensus.anomalies.png')
            x_vals = np.arange(len(timeseries))
            y_vals = np.array([y[1] for y in timeseries])
            # Match default graphite graph size
            plt.figure(figsize=(5.86, 3.08), dpi=100)
            plt.plot(x_vals, y_vals)
            for index in range(10, len(timeseries)):
                anomaly = False
                sliced = timeseries[:index]
                for i in anomalies_score:
                    if sliced[-1][0] == i[0]:
                        if i[2] >= CONSENSUS:
                            anomaly = True
                # Point out the datapoint if it is anomalous according to
                # Skyline CONSENSUS
                if anomaly:
                    plt.plot([index], [sliced[-1][1]], 'ro')
            results_filename = join(results_dir + "/skyline.consensus.anomalies.png")
            plt.savefig(results_filename, dpi=100)
            if python_version == 2:
                os.chmod(results_filename, 0o644)
            if python_version == 3:
                os.chmod(results_filename, mode=0o644)
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error('error :: failed plotting skyline.consensus.anomalies.png')

        anomalies_filename = join(results_dir + "/skyline.anomalies_score.txt")
        write_data_to_file(skyline_app, anomalies_filename, 'w', str(anomalies_score))

        anomalies_csv = join(results_dir + "/skyline.anomalies.csv")
        try:
            with open(anomalies_csv, 'w') as fh:
                fh.write('timestamp,value,consensus_count,triggered_algorithms\n')
            for ts, value, consensus, algorithms_triggered in anomalies_score:
                try:
                    algos_str = str(algorithms_triggered)
                    triggered_algorithms = algos_str.replace(',', ' ')
                    line = '%s,%s,%s,%s\n' % (
                        str(ts), str(value), str(consensus),
                        str(triggered_algorithms))
                    with open(anomalies_csv, 'a') as fh:
                        fh.write(line)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: could not write to file %s' % (anomalies_csv))
            if python_version == 2:
                os.chmod(anomalies_csv, 0o644)
            if python_version == 3:
                os.chmod(anomalies_csv, mode=0o644)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: could not write to file %s' % (anomalies_csv))

    return anomalous, triggered_algorithms
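
# A standalone, runnable sketch (not part of the original module) of the
# consensus scoring step above: the per-algorithm anomalies list is grouped
# by timestamp into [ts, value, consensus, algorithms_triggered] entries,
# where consensus is simply the number of algorithms that triggered at that
# timestamp.  CONSENSUS below is an illustrative stand-in for
# settings.CONSENSUS.
def score_anomalies(anomalies):
    anomalies_score = []
    seen_timestamps = set()
    for ts, value, algo in anomalies:
        if ts in seen_timestamps:
            continue
        seen_timestamps.add(ts)
        algorithms_triggered = [
            w_algo for w_ts, w_value, w_algo in anomalies if w_ts == ts]
        anomalies_score.append(
            [ts, value, len(algorithms_triggered), algorithms_triggered])
    return anomalies_score

if __name__ == '__main__':
    CONSENSUS = 2  # illustrative stand-in for settings.CONSENSUS
    anomalies = [
        [1000, 1.0, 'histogram_bins'],
        [1000, 1.0, 'mean_subtraction_cumulation'],
        [1060, 2.0, 'histogram_bins'],
    ]
    for ts, value, consensus, algos in score_anomalies(anomalies):
        achieved = 'CONSENSUS achieved' if consensus >= CONSENSUS else ''
        print(ts, value, consensus, algos, achieved)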
def alert_slack(datapoint, metric_name, expiration_time, metric_trigger, algorithm):

    if not settings.SLACK_ENABLED:
        return False

    from slackclient import SlackClient

    metric = metric_name
    logger.info('alert_slack - anomalous metric :: metric: %s - %s' % (metric, algorithm))
    base_name = metric
    alert_algo = str(algorithm)
    alert_context = alert_algo.upper()

    # The known_derivative_metric state is determined in case we need to
    # surface the png image from Graphite if the Ionosphere image is not
    # available for some reason.  This will result in Skyline at least still
    # sending an alert to slack, even if some gear fails in Ionosphere or
    # slack alerting is used without Ionosphere enabled.  Not DRY but
    # multiprocessing and spawn safe.
    known_derivative_metric = False
    try:
        if settings.REDIS_PASSWORD:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                password=settings.REDIS_PASSWORD,
                unix_socket_path=settings.REDIS_SOCKET_PATH)
        else:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                unix_socket_path=settings.REDIS_SOCKET_PATH)
    except:
        logger.error('error :: alert_slack - redis connection failed')
    try:
        derivative_metrics = list(REDIS_ALERTER_CONN.smembers('derivative_metrics'))
    except:
        derivative_metrics = []
    redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(base_name))
    if redis_metric_name in derivative_metrics:
        known_derivative_metric = True
    if known_derivative_metric:
        try:
            non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS
        except:
            non_derivative_monotonic_metrics = []
        skip_derivative = in_list(redis_metric_name, non_derivative_monotonic_metrics)
        if skip_derivative:
            known_derivative_metric = False

    if known_derivative_metric:
        unencoded_graph_title = 'Skyline Boundary - ALERT %s at %s hours - derivative graph - %s' % (
            alert_context, str(graphite_previous_hours), metric)
        slack_title = '*Skyline Boundary - ALERT* %s on %s at %s hours - derivative graph - %s' % (
            alert_context, metric, str(graphite_previous_hours), datapoint)
    else:
        unencoded_graph_title = 'Skyline Boundary - ALERT %s at %s hours - %s' % (
            alert_context, str(graphite_previous_hours), metric)
        slack_title = '*Skyline Boundary - ALERT* %s on %s at %s hours - %s' % (
            alert_context, metric, str(graphite_previous_hours), datapoint)

    graph_title_string = quote(unencoded_graph_title, safe='')
    graph_title = '&title=%s' % graph_title_string

    until_timestamp = int(time())
    target_seconds = int((graphite_previous_hours * 60) * 60)
    from_timestamp = str(until_timestamp - target_seconds)

    graphite_from = dt.datetime.fromtimestamp(
        int(from_timestamp)).strftime('%H:%M_%Y%m%d')
    logger.info('graphite_from - %s' % str(graphite_from))
    graphite_until = dt.datetime.fromtimestamp(
        int(until_timestamp)).strftime('%H:%M_%Y%m%d')
    logger.info('graphite_until - %s' % str(graphite_until))

    # @added 20181025 - Feature #2618: alert_slack
    # Added date and time info so you do not have to mouseover the slack
    # message to determine the time at which the alert came in
    timezone = strftime("%Z", gmtime())
    # @modified 20181029 - Feature #2618: alert_slack
    # Use the standard UNIX date format
    # human_anomaly_time = dt.datetime.fromtimestamp(int(until_timestamp)).strftime('%Y-%m-%d %H:%M:%S')
    human_anomaly_time = dt.datetime.fromtimestamp(
        int(until_timestamp)).strftime('%c')
    slack_time_string = '%s %s' % (human_anomaly_time, timezone)

    if settings.GRAPHITE_PORT != '':
        if known_derivative_metric:
            link = '%s://%s:%s/render/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                settings.GRAPHITE_PORT, str(graphite_from),
                str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
        else:
            link = '%s://%s:%s/render/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                settings.GRAPHITE_PORT, str(graphite_from),
                str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
    else:
        if known_derivative_metric:
            link = '%s://%s/render/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                str(graphite_from), str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
        else:
            link = '%s://%s/render/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                str(graphite_from), str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)

    # slack does not allow embedded images, nor will it fetch links behind
    # authentication, so Skyline uploads a png graphite image with the message
    image_file = None

    # Fetch the png from Graphite
    try:
        image_data = urllib2.urlopen(link).read()  # nosec
    except urllib2.URLError:
        logger.error(traceback.format_exc())
        logger.error('error :: alert_slack - failed to get image graph')
        logger.error('error :: alert_slack - %s' % str(link))
        image_data = None
    if image_data:
        image_file = '%s/%s.%s.graphite.%sh.png' % (
            settings.SKYLINE_TMP_DIR, base_name, skyline_app,
            str(int(graphite_previous_hours)))
        try:
            write_data_to_file(skyline_app, image_file, 'w', image_data)
            logger.info('alert_slack - added Graphite image :: %s' % (image_file))
        except:
            logger.info(traceback.format_exc())
            logger.error(
                'error :: alert_slack - failed to add %s Graphite image' % (
                    image_file))
            image_file = None
    try:
        filename = os.path.basename(image_file)
    except:
        filename = None

    try:
        bot_user_oauth_access_token = settings.BOUNDARY_SLACK_OPTS['bot_user_oauth_access_token']
    except:
        logger.error('error :: alert_slack - could not determine bot_user_oauth_access_token')
        return False

    # Allow for absolute path metric namespaces but also allow for and match
    # wildcard namespaces if there is not an absolute path metric namespace
    channels = 'unknown'
    notify_channels = []
    matched_channels = []
    try:
        channels = settings.BOUNDARY_SLACK_OPTS['channels'][metric_name]
        notify_channels.append(channels)
    except:
        for channel in settings.BOUNDARY_SLACK_OPTS['channels']:
            CHECK_MATCH_PATTERN = channel
            check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
            pattern_match = check_match_pattern.match(metric_name)
            if pattern_match:
                matched_channels.append(channel)
    if matched_channels != []:
        for i_metric_name in matched_channels:
            channels = settings.BOUNDARY_SLACK_OPTS['channels'][i_metric_name]
            notify_channels.append(channels)

    if not notify_channels:
        logger.error('error :: alert_slack - could not determine channel')
        return False
    else:
        channels = notify_channels

    try:
        icon_emoji = settings.BOUNDARY_SLACK_OPTS['icon_emoji']
    except:
        icon_emoji = ':chart_with_upwards_trend:'

    try:
        sc = SlackClient(bot_user_oauth_access_token)
    except:
        logger.info(traceback.format_exc())
        logger.error('error :: alert_slack - could not initiate SlackClient')
        return False

    for channel in channels:
        initial_comment = slack_title + ' :: <' + link + '|graphite image link>\nFor anomaly at ' + slack_time_string
        try:
            # slack does not allow embedded images, nor links behind
            # authentication, nor coloured text, so we have to jump through
            # all the API hoops and end up having to upload an image with a
            # very basic message.
            if os.path.isfile(image_file):
                slack_file_upload = sc.api_call(
                    'files.upload', filename=filename, channels=channel,
                    initial_comment=initial_comment,
                    file=open(image_file, 'rb'))
                if not slack_file_upload['ok']:
                    logger.error('error :: alert_slack - failed to send slack message')
            else:
                send_text = initial_comment + ' :: error :: there was no graph image to upload'
                send_message = sc.api_call(
                    'chat.postMessage', channel=channel,
                    icon_emoji=icon_emoji, text=send_text)
                if not send_message['ok']:
                    logger.error('error :: alert_slack - failed to send slack message')
                else:
                    logger.info('alert_slack - sent slack message')
        except:
            logger.info(traceback.format_exc())
            logger.error('error :: alert_slack - could not upload file')
            return False
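
# A standalone, runnable sketch (not part of the original module) of the
# channel resolution logic used above: try an exact metric name key in the
# channels dict first, then fall back to treating each key as a regex pattern
# matched against the metric name.  The dict below is an illustrative
# stand-in for settings.BOUNDARY_SLACK_OPTS['channels'].
import re

def resolve_channels(channels_opts, metric_name):
    notify_channels = []
    try:
        # Absolute path metric namespace
        notify_channels.append(channels_opts[metric_name])
    except KeyError:
        # Wildcard namespaces, each key treated as a regex pattern
        for pattern in channels_opts:
            if re.compile(pattern).match(metric_name):
                notify_channels.append(channels_opts[pattern])
    return notify_channels

if __name__ == '__main__':
    channels_opts = {
        'skyline.test.alert': ['#skyline'],
        'stats\\..*': ['#ops'],
    }
    print(resolve_channels(channels_opts, 'stats.statsd.bad_lines_seen'))  # [['#ops']]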
def calculate_features_profile(current_skyline_app, timestamp, metric, context):
    """
    Calculates a tsfresh features profile from a training data set

    :param current_skyline_app: the Skyline app calling the function
    :param timestamp: the timestamp of metric anomaly with training data
    :param metric: the base_name of the metric
    :param context: the context in which the features profile is being
        calculated, e.g. training_data, features_profiles, ionosphere or
        ionosphere_learn
    :type current_skyline_app: str
    :type timestamp: str
    :type metric: str
    :type context: str
    :return: (features_profile_csv_file_path, successful, fp_created, fp_id,
        fail_msg, traceback_format_exc, calc_time)
    :rtype: (str, boolean, boolean, int, str, str, str)

    """
    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    base_name = str(metric)

    if context == 'training_data':
        log_context = 'training data'
    if context == 'features_profiles':
        log_context = 'features profile data'
    if context == 'ionosphere':
        log_context = 'ionosphere'
    # @added 20170114 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        log_context = 'ionosphere :: learn'

    current_logger.info('%s feature profile creation requested for %s at %s' % (
        log_context, base_name, timestamp))

    timeseries_dir = base_name.replace('.', '/')
    if context == 'training_data' or context == 'ionosphere':
        metric_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_DATA_FOLDER, timestamp, timeseries_dir)
    if context == 'features_profiles':
        metric_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_PROFILES_FOLDER, timeseries_dir, timestamp)
    # @added 20170113 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        metric_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_LEARN_FOLDER, timestamp, timeseries_dir)

    features_profile_created_file = '%s/%s.%s.fp.created.txt' % (
        metric_data_dir, str(timestamp), base_name)

    features_profile_details_file = '%s/%s.%s.fp.details.txt' % (
        metric_data_dir, str(timestamp), base_name)

    # @added 20170108 - Feature #1842: Ionosphere - Graphite now graphs
    # Added metric_check_file, and ts_full_duration needs to be determined and
    # added to the features_profile_details_file, as it was not added here on
    # 20170104 when it was added to the webapp and ionosphere
    metric_var_filename = '%s.txt' % str(base_name)
    anomaly_check_file = '%s/%s' % (metric_data_dir, metric_var_filename)
    ts_full_duration = int(settings.FULL_DURATION)
    if os.path.isfile(anomaly_check_file):
        # Read the details file
        with open(anomaly_check_file, 'r') as f:
            anomaly_details = f.readlines()
        for i, line in enumerate(anomaly_details):
            if 'full_duration' in line:
                _ts_full_duration = '%s' % str(line).split("'", 2)
                full_duration_array = literal_eval(_ts_full_duration)
                ts_full_duration = str(int(full_duration_array[1]))

    anomaly_json = '%s/%s.json' % (metric_data_dir, base_name)
    ts_csv = '%s/%s.tsfresh.input.csv' % (metric_data_dir, base_name)
    # anomaly_json = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.json'
    # ts_csv = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.tsfresh.input.csv'

    # This is simply to stay in line with tsfresh naming conventions in their
    # docs and examples
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'

    fp_id = None
    f_calc = 'unknown'
    if os.path.isfile(features_profile_details_file):
        current_logger.info('features profile details file exists - %s' % (
            features_profile_details_file))
        try:
            with open(features_profile_details_file, 'r') as f:
                fp_details_str = f.read()
            fp_details_array = literal_eval(fp_details_str)
            f_calc = ' (previously calculated by Ionosphere) - %s' % str(
                fp_details_array[2])
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: failed to read from %s' % (
                features_profile_details_file))
    else:
        current_logger.info('OK no features profile details file exists - %s' % (
            features_profile_details_file))

    fp_created = None
    if os.path.isfile(features_profile_created_file):
        current_logger.info('features profile created file exists - %s' % (
            features_profile_created_file))
        try:
            with open(features_profile_created_file, 'r') as f:
                fp_created_str = f.read()
            fp_created_array = literal_eval(fp_created_str)
            fp_id = fp_created_array[0]
            fp_created = True
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: failed to read fp_id from %s' % (
                features_profile_created_file))
    else:
        current_logger.info('OK no features profile created file exists - %s' % (
            features_profile_created_file))

    if os.path.isfile(t_fname_out):
        current_logger.info('transposed features already exist - %s' % (t_fname_out))
        return str(t_fname_out), True, fp_created, fp_id, 'none', 'none', f_calc

    start = timer()
    if os.path.isfile(anomaly_json):
        try:
            # Read the timeseries json file
            with open(anomaly_json, 'r') as f:
                raw_timeseries = f.read()
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error(
                'error: failed to read timeseries data from %s' % (anomaly_json))
            fail_msg = 'error: failed to read timeseries data from %s' % anomaly_json
            end = timer()
            return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc
    else:
        trace = 'none'
        fail_msg = 'error: file not found - %s' % (anomaly_json)
        current_logger.error(fail_msg)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
    timeseries = literal_eval(timeseries_array_str)

    datapoints = timeseries
    converted = []
    for datapoint in datapoints:
        try:
            new_datapoint = [float(datapoint[0]), float(datapoint[1])]
            converted.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)

    for ts, value in converted:
        # print('%s,%s' % (str(int(ts)), str(value)))
        utc_ts_line = '%s,%s,%s\n' % (metric, str(int(ts)), str(value))
        with open(ts_csv, 'a') as fh:
            fh.write(utc_ts_line)

    try:
        df = pd.read_csv(
            ts_csv, delimiter=',', header=None,
            names=['metric', 'timestamp', 'value'])
        current_logger.info('DataFrame created with %s' % ts_csv)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error: failed to create a pandas DataFrame with %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # @added 20161207 - Task #1658: Patterning Skyline Ionosphere
    # Converting the DataFrame types to suit MySQL data types
    # For anyone doing a code review of Skyline, a number of questions arise
    # from the decision to deviate from json or storing msgpack as BLOB etc.
    # tsfresh uses csv and we can get csv from Graphite etc, so Skyline should
    # be able to handle csv.  As for how data is stored in MySQL, this was
    # given considerable review and thought, given that Ionosphere and Skyline
    # in general should not be limited to the domain of analyzing Graphite
    # machine metrics, but other timeseries data sources too.
    # df['feature_name'] = df['feature_name'].astype(string)
    # df['value'] = df['value'].astype(float)

    # Test the DataFrame
    try:
        df_created = df.head()
        del df_created
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: failed to read the pandas DataFrame created with %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    df.columns = ['metric', 'timestamp', 'value']

    start_feature_extraction = timer()
    current_logger.info('starting extract_features with %s' % str(TSFRESH_VERSION))
    try:
        # @modified 20161226 - Bug #1822: tsfresh extract_features process stalling
        # Changed to use the new ReasonableFeatureExtractionSettings that was
        # introduced in tsfresh-0.4.0 to exclude the computationally high cost
        # of extracting features from very static timeseries that have little
        # to no variation in their values, which can result in features taking
        # up to almost 600 seconds to calculate on a timeseries of length
        # 10075 (168h - 1 datapoint per 60s).
        # In terms of inline feature calculation, always exclude
        # high_comp_cost features.
        # df_features = extract_features(df, column_id='metric', column_sort='timestamp', column_kind=None, column_value=None)
        tsf_settings = ReasonableFeatureExtractionSettings()
        # Disable tqdm progress bar
        tsf_settings.disable_progressbar = True
        df_features = extract_features(
            df, column_id='metric', column_sort='timestamp', column_kind=None,
            column_value=None, feature_extraction_settings=tsf_settings)
        current_logger.info('features extracted from %s data' % ts_csv)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: extracting features with tsfresh from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        end_feature_extraction = timer()
        current_logger.info('feature extraction failed in %.6f seconds' % (
            end_feature_extraction - start_feature_extraction))
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    end_feature_extraction = timer()
    feature_extraction_time = end_feature_extraction - start_feature_extraction
    current_logger.info('feature extraction took %.6f seconds' % (feature_extraction_time))

    # write to disk
    fname_out = fname_in + '.features.csv'
    # df_features.to_csv(fname_out)

    # Transpose
    try:
        df_t = df_features.transpose()
        current_logger.info('features transposed')
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error :: transposing tsfresh features from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Create transposed features csv
    t_fname_out = fname_in + '.features.transposed.csv'
    try:
        df_t.to_csv(t_fname_out)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: saving transposed tsfresh features from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Calculate the count and sum of the features values
    try:
        df_sum = pd.read_csv(
            t_fname_out, delimiter=',', header=0,
            names=['feature_name', 'value'])
        df_sum.columns = ['feature_name', 'value']
        df_sum['feature_name'] = df_sum['feature_name'].astype(str)
        df_sum['value'] = df_sum['value'].astype(float)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        current_logger.error('error :: failed to create DataFrame to sum')

    try:
        features_count = len(df_sum['value'])
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        current_logger.error('error :: failed to count number of features, set to 0')
        features_count = 0

    try:
        features_sum = df_sum['value'].sum()
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        current_logger.error('error :: failed to sum feature values, set to 0')
        features_sum = 0

    end = timer()

    current_logger.info('features saved to %s' % (fname_out))
    current_logger.info('transposed features saved to %s' % (t_fname_out))
    total_calc_time = '%.6f' % (end - start)
    calc_time = '%.6f' % (feature_extraction_time)
    current_logger.info('total feature profile completed in %s seconds' % str(total_calc_time))

    # Create a features profile details file
    try:
        # @modified 20170108 - Feature #1842: Ionosphere - Graphite now graphs
        # Added the ts_full_duration here as it was not added here on 20170104
        # when it was added to the webapp and ionosphere
        data = '[%s, \'%s\', %s, %s, %s, %s]' % (
            str(int(time.time())), str(tsfresh_version), str(calc_time),
            str(features_count), str(features_sum), str(ts_full_duration))
        write_data_to_file(
            current_skyline_app, features_profile_details_file, 'w', data)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: failed to write %s' % features_profile_details_file
        current_logger.error('%s' % fail_msg)

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)
        current_logger.info('removed the created csv - %s' % ts_csv)

    # @added 20170112 - Feature #1854: Ionosphere learn - Redis ionosphere.learn.work namespace
    # Ionosphere learn needs Redis work sets, but this was moved to
    # ionosphere_backend.py and learn.py, not done here
    return str(t_fname_out), True, fp_created, fp_id, 'none', 'none', str(calc_time)
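
# A standalone, runnable sketch (not part of the original module) of the
# conversion step above: a Skyline [(timestamp, value), ...] timeseries is
# flattened into metric,timestamp,value rows, the three-column format this
# module feeds to tsfresh extract_features via a pandas DataFrame.  The
# metric name below is purely illustrative.
import pandas as pd

def timeseries_to_tsfresh_df(metric, timeseries):
    rows = []
    for datapoint in timeseries:
        try:
            rows.append([metric, int(float(datapoint[0])), float(datapoint[1])])
        except (TypeError, ValueError, IndexError):
            # Skip unparseable datapoints, as the function above does
            continue
    return pd.DataFrame(rows, columns=['metric', 'timestamp', 'value'])

if __name__ == '__main__':
    timeseries = [(1480104000, 1.0), (1480104060, 2.0), (1480104120, 1.5)]
    df = timeseries_to_tsfresh_df('stats.statsd.graphiteStats.calculationtime', timeseries)
    print(df)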
def run_algorithms(
        timeseries, timeseries_name, end_timestamp, full_duration,
        timeseries_file, skyline_app, algorithms, alert_interval,
        add_to_panorama, padded_timeseries, from_timestamp):
    """
    Iteratively run algorithms.
    """
    results_dir = os.path.dirname(timeseries_file)

    if not os.path.exists(results_dir):
        os.makedirs(results_dir, mode=0o755)

    start_analysis = int(time.time())

    triggered_algorithms = []
    anomalous = False

    # @added 20200427 - Feature #3500: webapp - crucible_process_metrics
    #                   Feature #1448: Crucible web UI
    # Added default alert_interval_discarded_anomalies_count so run_algorithms
    # does not return as failed
    alert_interval_discarded_anomalies_count = 0

    check_algorithms = []
    if str(algorithms) == "['all']":
        if skyline_app == 'analyzer':
            check_algorithms = ALGORITHMS
            logger.info('check_algorithms for analyzer - %s' % (str(check_algorithms)))
        if skyline_app == 'mirage':
            check_algorithms = MIRAGE_ALGORITHMS
            logger.info('check_algorithms for mirage - %s' % (str(check_algorithms)))
        if skyline_app == 'boundary':
            check_algorithms = algorithms
            logger.info('check_algorithms for boundary - %s' % (str(check_algorithms)))
        if skyline_app == 'crucible':
            ALGORITHMS.append('detect_drop_off_cliff')
            check_algorithms = ALGORITHMS
            logger.info('check_algorithms for crucible - %s' % (str(check_algorithms)))
    else:
        check_algorithms = algorithms
        logger.info('check_algorithms specified - %s' % (str(check_algorithms)))

    if not check_algorithms:
        logger.info('check_algorithms unknown - %s' % (str(check_algorithms)))
        ALGORITHMS.append('detect_drop_off_cliff')
        check_algorithms = ALGORITHMS
        logger.info('check_algorithms - %s' % (str(check_algorithms)))

    logger.info('checking algorithms - %s on %s' % (
        str(check_algorithms), str(timeseries_file)))

    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
    # Plot Skyline anomalies if CONSENSUS is achieved
    anomalies = []

    # @added 20200422 - Feature #3500: webapp - crucible_process_metrics
    #                   Feature #1448: Crucible web UI
    # Added padded_timeseries.  If the time series is padded then set the
    # range appropriately so that the padded period data points are not
    # analysed for anomalies
    default_range = 10
    if padded_timeseries:
        default_range = 0
        for ts, value in timeseries:
            if int(ts) < from_timestamp:
                default_range += 1
            else:
                break
        logger.info('padded_timeseries - default range set to %s to %s' % (
            str(default_range), str(timeseries_file)))

    for algorithm in check_algorithms:
        detected = ''
        try:
            x_vals = np.arange(len(timeseries))
            y_vals = np.array([y[1] for y in timeseries])
            # Match default graphite graph size
            plt.figure(figsize=(5.86, 3.08), dpi=100)
            plt.plot(x_vals, y_vals)

            # Start a couple of datapoints in for the tail average
            # @modified 20200422 - Feature #3500: webapp - crucible_process_metrics
            #                      Feature #1448: Crucible web UI
            # If the time series is padded then use the appropriate range so
            # that the padded period data points are not analysed for anomalies
            # for index in range(10, len(timeseries)):
            for index in range(default_range, len(timeseries)):
                sliced = timeseries[:index]
                anomaly = globals()[algorithm](sliced, end_timestamp, full_duration)

                # Point out the datapoint if it's anomalous
                if anomaly:
                    plt.plot([index], [sliced[-1][1]], 'ro')
                    detected = "DETECTED"
                    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
                    # Add the anomaly to the anomalies list to plot Skyline
                    # anomalies if CONSENSUS is achieved
                    anomalies.append([sliced[-1][0], sliced[-1][1], algorithm])

            if detected == "DETECTED":
                results_filename = join(results_dir + "/" + algorithm + "." + detected + ".png")
                logger.info('ANOMALY DETECTED :: with %s on %s' % (
                    algorithm, str(timeseries_file)))
                anomalous = True
                triggered_algorithms.append(algorithm)
            else:
                results_filename = join(results_dir + "/" + algorithm + ".png")

            try:
                plt.savefig(results_filename, dpi=100)
                logger.info('saved %s plot :: %s' % (algorithm, results_filename))
                if python_version == 2:
                    # @modified 20200327 - Branch #3262: py3
                    # os.chmod(results_filename, 0644)
                    os.chmod(results_filename, 0o644)
                if python_version == 3:
                    os.chmod(results_filename, mode=0o644)
            except:
                logger.error('error :: %s' % (traceback.format_exc()))
                logger.error('error :: failed to save %s for %s' % (
                    str(results_filename), str(timeseries_file)))
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error(
                'error :: error thrown in algorithm running and plotting - %s on %s' % (
                    str(algorithm), str(timeseries_file)))

    end_analysis = int(time.time())
    # @modified 20160814 - pyflaked
    # seconds_to_run = end_analysis - start_analysis
    # logger.info(
    #     'analysis of %s at a full duration of %s took %s seconds' %
    #     (timeseries_name, str(full_duration), str(seconds_to_run)))

    # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
    #                   Feature #1448: Crucible web UI
    # Added last_anomaly_timestamp to apply alert_interval against and
    # alert_interval_discarded_anomalies.  If alert_interval is passed,
    # Crucible will only report a Skyline CONSENSUS anomaly if the time since
    # the last reported anomaly is not less than the specified alert_interval
    # period.  This enables Crucible to mimic Analyzer and Mirage and apply an
    # EXPIRATION_TIME type methodology to identifying anomalies, like Analyzer
    # would.  This makes Crucible work SOMEWHAT like Analyzer, however it is
    # still a bit different because with Crucible the time series grows, like
    # a new metric would.
    # Set the last_anomaly_timestamp to the appropriate timestamp before the
    # alert_interval if alert_interval is set; if it is not set it does not
    # matter, as alert_interval and alert_interval_discarded_anomalies will
    # not be applied.
    # @modified 20200427 - Feature #3500: webapp - crucible_process_metrics
    #                      Feature #1448: Crucible web UI
    # Wrap the timeseries_start_timestamp variable in try so that on failure
    # the process does not hang
    try:
        timeseries_start_timestamp = int(timeseries[0][0])
    except:
        logger.error('error :: %s' % (traceback.format_exc()))
        logger.error(
            'error :: failed to determine timeseries_start_timestamp from %s' % (
                str(timeseries_file)))
        timeseries_start_timestamp = 0

    # @modified 20200427 - Feature #3500: webapp - crucible_process_metrics
    #                      Feature #1448: Crucible web UI
    # if alert_interval:
    #     last_anomaly_timestamp = timeseries_start_timestamp
    if alert_interval and timeseries_start_timestamp:
        last_anomaly_timestamp = timeseries_start_timestamp - (alert_interval + 1)
    else:
        last_anomaly_timestamp = timeseries_start_timestamp

    alert_interval_discarded_anomalies = []
    # To apply alert_interval the anomalies object needs to be sorted by
    # timestamp, as the anomalies are added per algorithm so they are not
    # timestamp ordered, but timestamp ordered per algorithm
    if anomalies and alert_interval:
        try:
            logger.info(
                'info :: last_anomaly_timestamp set to %s for alert_interval check on %s' % (
                    str(last_anomaly_timestamp), str(timeseries_file)))
            logger.info(
                'info :: sorting anomalies %s to apply alert_interval check on %s' % (
                    str(len(anomalies)), str(timeseries_file)))
            sorted_anomalies = sorted(anomalies, key=lambda x: x[0])
            anomalies = sorted_anomalies
            del sorted_anomalies
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error('error :: failed to create sorted_anomalies on %s' % (
                str(timeseries_file)))

    # @added 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
    # Allow the user to pass the algorithms for run_algorithms to run
    use_consensus = 6
    try:
        try:
            from settings import CONSENSUS as use_consensus
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to set use_consensus')
            use_consensus = 6
        if len(check_algorithms) <= use_consensus:
            use_consensus = len(check_algorithms)
            logger.info(
                'check_algorithms passed with the number of algorithms less than CONSENSUS, use_consensus set to %s' % (
                    str(use_consensus)))
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: failed to set CONSENSUS')

    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
    # Plot Skyline anomalies where CONSENSUS is achieved and create the file
    # resources skyline.anomalies_score.txt and skyline.anomalies.csv
    anomalies_score = []
    if anomalies:
        for ts, value, algo in anomalies:
            try:
                processed = False
                algorithms_triggered = []
                if anomalies_score:
                    for i in anomalies_score:
                        if i[0] == ts:
                            processed = True
                            continue
                if processed:
                    continue
                for w_ts, w_value, w_algo in anomalies:
                    if w_ts == ts:
                        algorithms_triggered.append(w_algo)
                # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
                #                   Feature #1448: Crucible web UI
                # Added last_anomaly_timestamp to apply alert_interval against
                # and alert_interval_discarded_anomalies
                append_anomaly = True
                if algorithms_triggered:
                    consensus = len(algorithms_triggered)
                    # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
                    #                   Feature #1448: Crucible web UI
                    # If the anomaly is within alert_interval of the last
                    # reported anomaly, discard it
                    # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
                    # if consensus >= CONSENSUS:
                    if consensus >= use_consensus:
                        current_anomaly_timestamp = int(ts)
                        if alert_interval and last_anomaly_timestamp:
                            time_between_anomalies = current_anomaly_timestamp - last_anomaly_timestamp
                            if time_between_anomalies < alert_interval:
                                try:
                                    discard_anomaly = [ts, value, consensus, algorithms_triggered]
                                    # This logs a lot if enabled
                                    # logger.info('debug :: time_between_anomalies %s is less than alert_interval %s, last_anomaly_timestamp set to %s and current_anomaly_timestamp is %s - discarding %s' % (
                                    #     str(time_between_anomalies), str(alert_interval),
                                    #     str(last_anomaly_timestamp),
                                    #     str(current_anomaly_timestamp), str(discard_anomaly)))
                                    alert_interval_discarded_anomalies.append(discard_anomaly)
                                    append_anomaly = False
                                except:
                                    logger.error(traceback.format_exc())
                                    logger.error(
                                        'error :: failed to append to alert_interval_discarded_anomalies on %s' % (
                                            str(timeseries_file)))
                    # @modified 20200421 - Feature #3500: webapp - crucible_process_metrics
                    #                      Feature #1448: Crucible web UI
                    # Only append if append_anomaly
                    # anomalies_score.append([ts, value, consensus, algorithms_triggered])
                    if append_anomaly:
                        anomalies_score.append([ts, value, consensus, algorithms_triggered])
                        # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
                        # if consensus >= CONSENSUS:
                        if consensus >= use_consensus:
                            last_anomaly_timestamp = int(ts)
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: failed to process anomalies entry on %s' % (
                    str(timeseries_file)))

        # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
        #                   Feature #1448: Crucible web UI
        # Added alert_interval_discarded_anomalies
        if alert_interval:
            if alert_interval_discarded_anomalies:
                logger.info(
                    'info :: discarded %s anomalies due to them being within the alert_interval period on %s' % (
                        str(len(alert_interval_discarded_anomalies)),
                        str(timeseries_file)))
            else:
                logger.info(
                    'info :: no anomalies were discarded due to them being within the alert_interval period on %s' % (
                        str(timeseries_file)))

        try:
            logger.info('info :: plotting skyline.consensus.anomalies.png for %s' % (
                str(timeseries_file)))
            x_vals = np.arange(len(timeseries))
            y_vals = np.array([y[1] for y in timeseries])
            # Match default graphite graph size
            plt.figure(figsize=(5.86, 3.08), dpi=100)
            plt.plot(x_vals, y_vals)
            for index in range(10, len(timeseries)):
                anomaly = False
                sliced = timeseries[:index]
                for i in anomalies_score:
                    if sliced[-1][0] == i[0]:
                        # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
                        # if i[2] >= CONSENSUS:
                        if i[2] >= use_consensus:
                            anomaly = True
                # Point out the datapoint if it is anomalous according to
                # Skyline CONSENSUS
                if anomaly:
                    plt.plot([index], [sliced[-1][1]], 'ro')
            results_filename = join(results_dir + "/skyline.consensus.anomalies.png")
            plt.savefig(results_filename, dpi=100)
            if python_version == 2:
                # @modified 20200327 - Branch #3262: py3
                # os.chmod(results_filename, 0644)
                os.chmod(results_filename, 0o644)
            if python_version == 3:
                os.chmod(results_filename, mode=0o644)
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error(
                'error :: failed plotting skyline.consensus.anomalies.png for %s' % (
                    str(timeseries_file)))

        anomalies_filename = join(results_dir + "/skyline.anomalies_score.txt")
        try:
            logger.info('info :: creating anomalies_filename - %s for %s' % (
                anomalies_filename, str(timeseries_file)))
            write_data_to_file(skyline_app, anomalies_filename, 'w', str(anomalies_score))
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error('error :: failed creating anomalies_filename - %s for %s' % (
                anomalies_filename, str(timeseries_file)))

        anomalies_csv = join(results_dir + "/skyline.anomalies.csv")
        logger.info('info :: creating anomalies_csv - %s for %s' % (
            anomalies_csv, str(timeseries_file)))
        try:
            with open(anomalies_csv, 'w') as fh:
                fh.write('timestamp,value,consensus_count,triggered_algorithms\n')
            for ts, value, consensus, algorithms_triggered in anomalies_score:
                try:
                    algos_str = str(algorithms_triggered)
                    triggered_algorithms = algos_str.replace(',', ' ')
                    line = '%s,%s,%s,%s\n' % (
                        str(ts), str(value), str(consensus),
                        str(triggered_algorithms))
                    with open(anomalies_csv, 'a') as fh:
                        fh.write(line)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: could not write to file %s for %s' % (
                        anomalies_csv, str(timeseries_file)))
            if python_version == 2:
                # @modified 20200327 - Branch #3262: py3
                # os.chmod(anomalies_csv, 0644)
                os.chmod(anomalies_csv, 0o644)
            if python_version == 3:
                os.chmod(anomalies_csv, mode=0o644)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: could not write to file %s for %s' % (
                anomalies_csv, str(timeseries_file)))
        logger.info('info :: created anomalies_csv OK for %s' % (
            str(timeseries_file)))

        # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
        #                   Feature #1448: Crucible web UI
        # Added alert_interval_discarded_anomalies
        alert_interval_discarded_anomalies_count = len(alert_interval_discarded_anomalies)
        if alert_interval_discarded_anomalies:
            alert_interval_discarded_anomalies_csv = join(
                results_dir + '/skyline.alert_interval_discarded_anomalies.csv')
            logger.info(
                'info :: writing %s alert_interval discarded anomalies to %s for %s' % (
                    str(len(alert_interval_discarded_anomalies)),
                    alert_interval_discarded_anomalies_csv,
                    str(timeseries_file)))
            try:
                with open(alert_interval_discarded_anomalies_csv, 'w') as fh:
                    fh.write('timestamp,value,consensus,triggered_algorithms\n')
                for ts, value, consensus, algorithms_triggered in alert_interval_discarded_anomalies:
                    try:
                        line = '%s,%s,%s,%s\n' % (
                            str(ts), str(value), str(consensus),
                            str(algorithms_triggered))
                        with open(alert_interval_discarded_anomalies_csv, 'a') as fh:
                            fh.write(line)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: could not write to file %s for %s' % (
                            alert_interval_discarded_anomalies_csv,
                            str(timeseries_file)))
                if python_version == 2:
                    os.chmod(alert_interval_discarded_anomalies_csv, 0o644)
                if python_version == 3:
                    os.chmod(alert_interval_discarded_anomalies_csv, mode=0o644)
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: could not write to file %s for %s' % (
                    alert_interval_discarded_anomalies_csv,
                    str(timeseries_file)))
    else:
        logger.info('0 anomalies found for %s' % str(timeseries_file))

    return anomalous, triggered_algorithms, alert_interval_discarded_anomalies_count
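
# A standalone, runnable sketch (not part of the original module) of the
# alert_interval windowing applied above: anomalies are sorted by timestamp
# and a CONSENSUS anomaly is only reported if it occurs at least
# alert_interval seconds after the last reported one, mimicking Analyzer's
# EXPIRATION_TIME behaviour.  Seeding last_anomaly_timestamp just before the
# start of the data matches the function above.
def apply_alert_interval(anomalies_score, timeseries_start_timestamp, alert_interval):
    last_anomaly_timestamp = timeseries_start_timestamp - (alert_interval + 1)
    reported = []
    discarded = []
    for ts, value, consensus, algorithms_triggered in sorted(
            anomalies_score, key=lambda x: x[0]):
        if (int(ts) - last_anomaly_timestamp) < alert_interval:
            discarded.append([ts, value, consensus, algorithms_triggered])
            continue
        reported.append([ts, value, consensus, algorithms_triggered])
        last_anomaly_timestamp = int(ts)
    return reported, discarded

if __name__ == '__main__':
    anomalies_score = [
        [1000, 1.0, 6, ['histogram_bins', 'median_absolute_deviation']],
        [1030, 1.1, 6, ['histogram_bins']],
        [2000, 3.0, 6, ['histogram_bins', 'median_absolute_deviation']],
    ]
    reported, discarded = apply_alert_interval(anomalies_score, 1000, 300)
    print('%s reported, %s discarded' % (len(reported), len(discarded)))  # 2 reported, 1 discarded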
def alert_smtp(alert, metric, context): """ Called by :func:`~trigger_alert` and sends an alert via smtp to the recipients that are configured for the metric. """ LOCAL_DEBUG = False logger = logging.getLogger(skyline_app_logger) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - sending smtp alert') logger.info('debug :: alert_smtp - Memory usage at start: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # FULL_DURATION to hours so that analyzer surfaces the relevant timeseries data # in the graph full_duration_in_hours = int(settings.FULL_DURATION) / 3600 # @added 20161229 - Feature #1830: Ionosphere alerts # Added Ionosphere variables base_name = str(metric[1]).replace(settings.FULL_NAMESPACE, '', 1) if settings.IONOSPHERE_ENABLED: timeseries_dir = base_name.replace('.', '/') training_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER, str(int(metric[2])), timeseries_dir) graphite_image_file = '%s/%s.%s.graphite.%sh.png' % ( training_data_dir, base_name, skyline_app, str(int(full_duration_in_hours))) json_file = '%s/%s.%s.redis.%sh.json' % ( training_data_dir, base_name, skyline_app, str(int(full_duration_in_hours))) training_data_redis_image = '%s/%s.%s.redis.plot.%sh.png' % ( training_data_dir, base_name, skyline_app, str(int(full_duration_in_hours))) # For backwards compatibility if '@' in alert[1]: sender = settings.ALERT_SENDER recipient = alert[1] else: sender = settings.SMTP_OPTS['sender'] # @modified 20160806 - Added default_recipient try: recipients = settings.SMTP_OPTS['recipients'][alert[0]] use_default_recipient = False except: use_default_recipient = True if use_default_recipient: try: recipients = settings.SMTP_OPTS['default_recipient'] logger.info( 'alert_smtp - using default_recipient as no recipients are configured for %s' % str(alert[0])) except: logger.error( 'error :: alert_smtp - no known recipient for %s' % str(alert[0])) return False # Backwards compatibility if type(recipients) is str: recipients = [recipients] # @added 20180524 - Task #2384: Change alerters to cc other recipients # The alerters did send an individual email to each recipient. This would be # more useful if one email was sent with the first smtp recipient being the # to recipient and the subsequent recipients were add in cc. 
if recipients: primary_recipient = False cc_recipients = False for i_recipient in recipients: if not primary_recipient: primary_recipient = str(i_recipient) if primary_recipient != i_recipient: if not cc_recipients: cc_recipients = str(i_recipient) else: new_cc_recipients = '%s,%s' % (str(cc_recipients), str(i_recipient)) cc_recipients = str(new_cc_recipients) logger.info( 'alert_smtp - will send to primary_recipient :: %s, cc_recipients :: %s' % (str(primary_recipient), str(cc_recipients))) # @modified 20161229 - Feature #1830: Ionosphere alerts # Ionosphere alerts unencoded_graph_title = 'Skyline %s - ALERT at %s hours - %s' % ( context, str(int(full_duration_in_hours)), str(metric[0])) # @modified 20170603 - Feature #2034: analyse_derivatives # Added deriative functions to convert the values of metrics strictly # increasing monotonically to their deriative products in alert graphs and # specify it in the graph_title known_derivative_metric = False try: # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow if settings.REDIS_PASSWORD: REDIS_ALERTER_CONN = redis.StrictRedis( password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) else: REDIS_ALERTER_CONN = redis.StrictRedis( unix_socket_path=settings.REDIS_SOCKET_PATH) except: logger.error(traceback.format_exc()) logger.error('error :: alert_smtp - redis connection failed') try: derivative_metrics = list( REDIS_ALERTER_CONN.smembers('derivative_metrics')) except: derivative_metrics = [] redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(base_name)) if redis_metric_name in derivative_metrics: known_derivative_metric = True if known_derivative_metric: try: non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS except: non_derivative_monotonic_metrics = [] skip_derivative = in_list(redis_metric_name, non_derivative_monotonic_metrics) if skip_derivative: known_derivative_metric = False if known_derivative_metric: unencoded_graph_title = 'Skyline %s - ALERT at %s hours - derivative graph - %s' % ( context, str(int(full_duration_in_hours)), str(metric[0])) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - unencoded_graph_title: %s' % unencoded_graph_title) graph_title_string = quote(unencoded_graph_title, safe='') graph_title = '&title=%s' % graph_title_string graphite_port = '80' if settings.GRAPHITE_PORT != '': graphite_port = str(settings.GRAPHITE_PORT) link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(%s)%s%s&colorList=orange' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port, str(int(full_duration_in_hours)), metric[1], settings.GRAPHITE_GRAPH_SETTINGS, graph_title) # @added 20170603 - Feature #2034: analyse_derivatives if known_derivative_metric: link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port, str(int(full_duration_in_hours)), metric[1], settings.GRAPHITE_GRAPH_SETTINGS, graph_title) content_id = metric[1] image_data = None if settings.SMTP_OPTS.get('embed-images'): # @added 20161229 - Feature #1830: Ionosphere alerts # Use existing data if files exist if os.path.isfile(graphite_image_file): try: with open(graphite_image_file, 'r') as f: image_data = f.read() logger.info('alert_smtp - using existing png - %s' % graphite_image_file) except: logger.error(traceback.format_exc()) logger.error( 'error :: alert_smtp - failed to read image data from existing png - %s' % graphite_image_file) 
logger.error('error :: alert_smtp - %s' % str(link)) image_data = None if image_data is None: try: # @modified 20170913 - Task #2160: Test skyline with bandit # Added nosec to exclude from bandit tests image_data = urllib2.urlopen(link).read() # nosec if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - image data OK') except urllib2.URLError: logger.error(traceback.format_exc()) logger.error('error :: alert_smtp - failed to get image graph') logger.error('error :: alert_smtp - %s' % str(link)) image_data = None if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - image data None') if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after image_data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # If we failed to get the image or if it was explicitly disabled, # use the image URL instead of the content. if image_data is None: img_tag = '<img src="%s"/>' % link else: img_tag = '<img src="cid:%s"/>' % content_id if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - img_tag: %s' % img_tag) if settings.IONOSPHERE_ENABLED: # Create Ionosphere Graphite image # @modified 20161229 - Feature #1830: Ionosphere alerts # Only write the data to the file if it does not exist if not os.path.isfile(graphite_image_file): try: write_data_to_file(skyline_app, graphite_image_file, 'w', image_data) logger.info('added %s Ionosphere Graphite image :: %s' % (skyline_app, graphite_image_file)) except: logger.info(traceback.format_exc()) logger.error( 'error :: failed to add %s Ionosphere Graphite image' % (skyline_app, graphite_image_file)) else: logger.info( '%s Ionosphere Graphite image already exists :: %s' % (skyline_app, graphite_image_file)) redis_image_data = None try: plot_redis_data = settings.PLOT_REDIS_DATA except: plot_redis_data = False if settings.SMTP_OPTS.get('embed-images') and plot_redis_data: # Create graph from Redis data redis_metric_key = '%s%s' % (settings.FULL_NAMESPACE, metric[1]) try: raw_series = REDIS_ALERTER_CONN.get(redis_metric_key) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - raw_series: %s' % 'OK') except: if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - raw_series: %s' % 'FAIL') try: if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before get Redis timeseries data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) unpacker = Unpacker(use_list=True) unpacker.feed(raw_series) timeseries_x = [float(item[0]) for item in unpacker] unpacker = Unpacker(use_list=True) unpacker.feed(raw_series) timeseries_y = [item[1] for item in unpacker] unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after get Redis timeseries data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) except: logger.error('error :: alert_smtp - unpack timeseries failed') timeseries = None if settings.IONOSPHERE_ENABLED and timeseries: ''' .. todo: this is possibly to be used to allow the user to submit the FULL_DURATION duration data set for the features profile to be created against IF it is a Mirage metric. This would allow for additional granularity in Mirage metrics, thereby maintaining their seasonality, but allow user and Skyline to analyze the anomaly at a FULL_DURATION resolution as well. Not sure how to code that in Ionosphere context yet but could just be additonal flag in the Ionosphere record. 
In the Ionosphere frontend, the user would be given an option to either create the features profile on the Mirage timeseries or the redis FULL_DURATION timeseries. It is a little complicated, but doable. # @modified 20161229 - Feature #1828: ionosphere - mirage Redis data features However that ^^ is UNDESIRABLE in the Mirage/Ionosphere context at the moment. Ionosphere must only profile SECOND_ORDER_RESOLUTION_HOURS currently so as to not pollute the seasonality aspect of Mirage ''' # Create Ionosphere redis timeseries json if is does not exist # @modified 20161229 - Feature #1830: Ionosphere alerts # Only write the data to the file if it does not exist and replace # the timeseries object if a json file exists # @added 20170920 - Bug #2168: Strange Redis derivative graph using_original_redis_json = False if not os.path.isfile(json_file): timeseries_json = str(timeseries).replace('[', '(').replace( ']', ')') try: write_data_to_file(skyline_app, json_file, 'w', timeseries_json) logger.info( 'added %s Ionosphere Redis data timeseries json file :: %s' % (skyline_app, json_file)) except: logger.info(traceback.format_exc()) logger.error( 'error :: failed to add %s Ionosphere Redis data timeseries json file' % (skyline_app, json_file)) else: # Replace the timeseries object logger.info( '%s Ionosphere Redis data timeseries json file already exists, using :: %s' % (skyline_app, json_file)) anomaly_json = json_file try: # Read the timeseries json file with open(anomaly_json, 'r') as f: raw_timeseries = f.read() timeseries_array_str = str(raw_timeseries).replace( '(', '[').replace(')', ']') timeseries = literal_eval(timeseries_array_str) logger.info( '%s Redis timeseries replaced with timeseries from :: %s' % (skyline_app, anomaly_json)) timeseries_x = [float(item[0]) for item in timeseries] timeseries_y = [item[1] for item in timeseries] # @added 20170920 - Bug #2168: Strange Redis derivative graph # This already has nonNegativeDerivative applied to it using_original_redis_json = True except: logger.error(traceback.format_exc()) logger.error( 'error :: %s failed to read timeseries data from %s' % (skyline_app, anomaly_json)) timeseries = None # @added 20170603 - Feature #2034: analyse_derivatives if known_derivative_metric: # @added 20170920 - Bug #2168: Strange Redis derivative graph # If this is the Mirage Redis json it already has # nonNegativeDerivative applied to it if not using_original_redis_json: logger.info('alert_smtp - nonNegativeDerivative being applied') try: derivative_timeseries = nonNegativeDerivative(timeseries) timeseries = derivative_timeseries # @added 20170920 - Bug #2168: Strange Redis derivative graph logger.info('alert_smtp - nonNegativeDerivative applied') except: logger.error( 'error :: alert_smtp - nonNegativeDerivative failed') else: logger.info( 'alert_smtp - nonNegativeDerivative not being applied, as it will have been applied in the original json' ) # @added 21070726 - Bug #2068: Analyzer smtp alert error on Redis plot with derivative metrics # If the nonNegativeDerivative has been calculated we need to reset the # x and y as nonNegativeDerivative has to discard the first value as it # has no delta for it so the timeseries is 1 item less. 
timeseries_x = [float(item[0]) for item in timeseries] timeseries_y = [item[1] for item in timeseries] pd_series_values = None if timeseries: try: if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before pd.Series: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) values = pd.Series([x[1] for x in timeseries]) # Because the truth value of a Series is ambiguous pd_series_values = True if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after pd.Series: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) except: logger.error( 'error :: alert_smtp - pandas value series on timeseries failed' ) if pd_series_values: try: array_median = np.median(values) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - values median: %s' % str(array_median)) array_amax = np.amax(values) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - array_amax: %s' % str(array_amax)) array_amin = np.amin(values) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - array_amin: %s' % str(array_amin)) mean = values.mean() if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - mean: %s' % str(mean)) stdDev = values.std() if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - stdDev: %s' % str(stdDev)) sigma3 = 3 * stdDev if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - sigma3: %s' % str(sigma3)) # sigma3_series = [sigma3] * len(values) sigma3_upper_bound = mean + sigma3 try: sigma3_lower_bound = mean - sigma3 except: sigma3_lower_bound = 0 sigma3_upper_series = [sigma3_upper_bound] * len(values) sigma3_lower_series = [sigma3_lower_bound] * len(values) amax_series = [array_amax] * len(values) amin_series = [array_amin] * len(values) mean_series = [mean] * len(values) except: logger.error( 'error :: alert_smtp - numpy ops on series failed') mean_series = None if mean_series: graph_title = 'Skyline %s - ALERT - at %s hours - Redis data\n%s - anomalous value: %s' % ( context, str( int(full_duration_in_hours)), metric[1], str(metric[0])) # @added 20170603 - Feature #2034: analyse_derivatives if known_derivative_metric: graph_title = 'Skyline %s - ALERT - at %s hours - Redis data (derivative graph)\n%s - anomalous value: %s' % ( context, str(int(full_duration_in_hours)), metric[1], str(metric[0])) # @modified 20160814 - Bug #1558: Memory leak in Analyzer # I think the buf is causing a memory leak, trying a file # if python_version == 3: # buf = io.StringIO() # else: # buf = io.BytesIO() buf = '%s/%s.%s.%s.png' % (settings.SKYLINE_TMP_DIR, skyline_app, str(int(metric[2])), metric[1]) if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before plot Redis data: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # Too big # rcParams['figure.figsize'] = 12, 6 rcParams['figure.figsize'] = 8, 4 try: # fig = plt.figure() fig = plt.figure(frameon=False) ax = fig.add_subplot(111) ax.set_title(graph_title, fontsize='small') # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - Luminosity # IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor' # ax.set_axis_bgcolor('black') if hasattr(ax, 'set_facecolor'): ax.set_facecolor('black') else: ax.set_axis_bgcolor('black') try: datetimes = [ dt.datetime.utcfromtimestamp(ts) for ts in timeseries_x ] if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - datetimes: %s' % 'OK') except: logger.error('error :: alert_smtp - 
datetimes: %s' % 'FAIL') plt.xticks(rotation=0, horizontalalignment='center') xfmt = DateFormatter('%a %H:%M') plt.gca().xaxis.set_major_formatter(xfmt) ax.xaxis.set_major_formatter(xfmt) ax.plot(datetimes, timeseries_y, color='orange', lw=0.6, zorder=3) ax.tick_params(axis='both', labelsize='xx-small') max_value_label = 'max - %s' % str(array_amax) ax.plot(datetimes, amax_series, lw=1, label=max_value_label, color='m', ls='--', zorder=4) min_value_label = 'min - %s' % str(array_amin) ax.plot(datetimes, amin_series, lw=1, label=min_value_label, color='b', ls='--', zorder=4) mean_value_label = 'mean - %s' % str(mean) ax.plot(datetimes, mean_series, lw=1.5, label=mean_value_label, color='g', ls='--', zorder=4) sigma3_text = (r'3$\sigma$') # sigma3_label = '%s - %s' % (str(sigma3_text), str(sigma3)) sigma3_upper_label = '%s upper - %s' % ( str(sigma3_text), str(sigma3_upper_bound)) ax.plot(datetimes, sigma3_upper_series, lw=1, label=sigma3_upper_label, color='r', ls='solid', zorder=4) if sigma3_lower_bound > 0: sigma3_lower_label = '%s lower - %s' % ( str(sigma3_text), str(sigma3_lower_bound)) ax.plot(datetimes, sigma3_lower_series, lw=1, label=sigma3_lower_label, color='r', ls='solid', zorder=4) ax.get_yaxis().get_major_formatter().set_useOffset(False) ax.get_yaxis().get_major_formatter().set_scientific(False) # Shrink current axis's height by 10% on the bottom box = ax.get_position() ax.set_position([ box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9 ]) # Put a legend below current axis ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=4, fontsize='x-small') plt.rc('lines', lw=2, color='w') plt.grid(True) ax.grid(b=True, which='both', axis='both', color='lightgray', linestyle='solid', alpha=0.5, linewidth=0.6) # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - Luminosity # IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor' # ax.set_axis_bgcolor('black') if hasattr(ax, 'set_facecolor'): ax.set_facecolor('black') else: ax.set_axis_bgcolor('black') rcParams['xtick.direction'] = 'out' rcParams['ytick.direction'] = 'out' ax.margins(y=.02, x=.03) # tight_layout removes the legend box # fig.tight_layout() try: if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before plt.savefig: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) plt.savefig(buf, format='png') if settings.IONOSPHERE_ENABLED: if not os.path.exists(training_data_dir): mkdir_p(training_data_dir) logger.info('created dir - %s' % training_data_dir) if not os.path.isfile(training_data_redis_image): try: plt.savefig(training_data_redis_image, format='png') logger.info( 'alert_smtp - save Redis training data image - %s' % (training_data_redis_image)) except: logger.info(traceback.format_exc()) logger.error( 'error :: alert_smtp - could not save - %s' % (training_data_redis_image)) else: logger.info( 'alert_smtp - Redis training data image already exists - %s' % (training_data_redis_image)) # @added 20160814 - Bug #1558: Memory leak in Analyzer # As per http://www.mail-archive.com/[email protected]/msg13222.html # savefig in the parent process was causing the memory leak # the below fig.clf() and plt.close() did not resolve this # however spawning a multiprocessing process for alert_smtp # does solve this issue as all memory is freed when the # process terminates.
fig.clf() plt.close(fig) redis_graph_content_id = 'redis.%s' % metric[1] redis_image_data = True if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - savefig: %s' % 'OK') logger.info( 'debug :: alert_smtp - Memory usage after plt.savefig: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) except: logger.info(traceback.format_exc()) logger.error('error :: alert_smtp - plt.savefig: %s' % 'FAIL') except: logger.error(traceback.format_exc()) logger.error('error :: alert_smtp - could not build plot') if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before email: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) if redis_image_data: redis_img_tag = '<img src="cid:%s"/>' % redis_graph_content_id if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info('debug :: alert_smtp - redis_img_tag: %s' % str(redis_img_tag)) else: # @modified 20161229 - Feature #1830: Ionosphere alerts # @modified 20170108 - Feature #1852: Ionosphere - features_profile matched graphite graphs # Restored the previous redis_img_tag method as some smtp alerts were # coming without a Redis graph, not all but some and for some reason, # I am pretty certain retrospectively that it was done that way from # testing I just wanted to try and be cleaner. # The redis_img_tag was changed at # https://github.com/earthgecko/skyline/commit/31bcacf3f90f0953ebed0d57260cb937e01f887c#diff-520bf2a218f65074ffead4d8184c138dR489 redis_img_tag = '<img src="%s"/>' % 'none' # redis_img_tag = '<img src="none"/>' # @added 20170806 - Feature #1830: Ionosphere alerts # Show a human date in alerts alerted_at = str(dt.datetime.utcfromtimestamp(int(metric[2]))) try: body = '<h3><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> %s alert</font></h3><br>' % context body += '<font color="black">metric: <b>%s</b></font><br>' % metric[1] body += '<font color="black">Anomalous value: %s</font><br>' % str( metric[0]) body += '<font color="black">Anomaly timestamp: %s</font><br>' % str( int(metric[2])) # @added 20170806 - Feature #1830: Ionosphere alerts # Show a human date in alerts body += '<font color="black">Anomalous at: %s</font><br>' % alerted_at body += '<font color="black">At hours: %s</font><br>' % str( int(full_duration_in_hours)) body += '<font color="black">Next alert in: %s seconds</font><br>' % str( alert[2]) # @added 20170603 - Feature #2034: analyse_derivatives if known_derivative_metric: body += '<font color="black">Derivative graph: True</font><br>' more_body = '' if settings.IONOSPHERE_ENABLED: # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls # Broke body into body and more_body to workaround the 990 character # limit per line for SMTP more_body += '<h3><font color="#dd3023">Ionosphere :: </font><font color="#6698FF">training data</font><font color="black"></font></h3>' ionosphere_link = '%s/ionosphere?timestamp=%s&metric=%s' % ( settings.SKYLINE_URL, str(int(metric[2])), str(metric[1])) more_body += '<font color="black">To use this timeseries to train Skyline that this is not anomalous manage this training data at:<br>' more_body += '<a href="%s">%s</a></font>' % (ionosphere_link, ionosphere_link) if redis_image_data: more_body += '<font color="black">min: %s | max: %s | mean: %s <br>' % ( str(array_amin), str(array_amax), str(mean)) more_body += '3-sigma: %s <br>' % str(sigma3) more_body += '3-sigma upper bound: %s | 3-sigma lower bound: %s <br></font>' % ( str(sigma3_upper_bound), str(sigma3_lower_bound)) more_body += 
'<h3><font color="black">Redis data at FULL_DURATION</font></h3><br>' more_body += '<div dir="ltr">:%s<br></div>' % redis_img_tag if image_data: more_body += '<h3><font color="black">Graphite data at FULL_DURATION (may be aggregated)</font></h3>' more_body += '<div dir="ltr"><a href="%s">%s</a><br></div><br>' % ( link, img_tag) more_body += '<font color="black">Clicking on the above graph will open to the Graphite graph with current data</font><br>' if redis_image_data: more_body += '<font color="black">To disable the Redis data graph view, set PLOT_REDIS_DATA to False in your settings.py, if the Graphite graph is sufficient for you,<br>' more_body += 'however do note that this will remove the 3-sigma and mean value too.</font>' more_body += '<br>' more_body += '<div dir="ltr" align="right"><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> version :: %s</font></div><br>' % str( skyline_version) except: logger.error('error :: alert_smtp - could not build body') logger.info(traceback.format_exc()) # @modified 20180524 - Task #2384: Change alerters to cc other recipients # Do not send to each recipient, send to primary_recipient and cc the other # recipients, thereby sending only one email # for recipient in recipients: if primary_recipient: try: # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls # Broke body into body and more_body to workaround the 990 character # limit per line for SMTP, using mixed because alternative indicates that # the client should select one of the parts for display and ignore # the rest (tripleee - https://stackoverflow.com/a/35115938) # msg = MIMEMultipart('alternative') msg = MIMEMultipart('mixed') # @added 20170812 - Bug #2142: 7bit SMTP encoding breaking long urls # set email charset and email encodings cs_ = charset.Charset('utf-8') cs_.header_encoding = charset.QP cs_.body_encoding = charset.QP msg.set_charset(cs_) msg['Subject'] = '[Skyline alert] - %s ALERT - %s' % (context, metric[1]) msg['From'] = sender # @modified 20180524 - Task #2384: Change alerters to cc other recipients # msg['To'] = recipient msg['To'] = primary_recipient # @added 20180524 - Task #2384: Change alerters to cc other recipients # Added Cc if cc_recipients: msg['Cc'] = cc_recipients msg.attach(MIMEText(body, 'html')) # @added 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls # Broke body into body and more_body to workaround the 990 character # limit per line for SMTP msg.replace_header('content-transfer-encoding', 'quoted-printable') msg.attach(MIMEText(more_body, 'html')) if redis_image_data: try: # @modified 20160814 - Bug #1558: Memory leak in Analyzer # I think the buf is causing a memory leak, trying a file # buf.seek(0) # msg_plot_attachment = MIMEImage(buf.read()) # msg_plot_attachment = MIMEImage(buf.read()) try: with open(buf, 'rb') as f: plot_image_data = f.read() try: os.remove(buf) except OSError: logger.error( 'error :: alert_smtp - failed to remove file - %s' % buf) logger.info(traceback.format_exc()) pass except: logger.error('error :: failed to read plot file - %s' % buf) plot_image_data = None # @added 20161124 - Branch #922: ionosphere msg_plot_attachment = MIMEImage(plot_image_data) msg_plot_attachment.add_header( 'Content-ID', '<%s>' % redis_graph_content_id) msg.attach(msg_plot_attachment) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - msg_plot_attachment - redis data done' ) except: logger.error('error :: alert_smtp - msg_plot_attachment') logger.info(traceback.format_exc()) if
image_data is not None: try: msg_attachment = MIMEImage(image_data) msg_attachment.add_header('Content-ID', '<%s>' % content_id) msg.attach(msg_attachment) if settings.ENABLE_DEBUG or LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - msg_attachment - Graphite img source done' ) except: logger.error('error :: alert_smtp - msg_attachment') logger.info(traceback.format_exc()) except: logger.error('error :: alert_smtp - could not attach') logger.info(traceback.format_exc()) s = SMTP('127.0.0.1') try: # @modified 20180524 - Task #2384: Change alerters to cc other recipients # Send to primary_recipient and cc_recipients # s.sendmail(sender, recipient, msg.as_string()) if cc_recipients: s.sendmail(sender, [primary_recipient, cc_recipients], msg.as_string()) else: s.sendmail(sender, primary_recipient, msg.as_string()) if settings.ENABLE_DEBUG or LOCAL_DEBUG: # logger.info('debug :: alert_smtp - message sent to %s OK' % str(recipient)) logger.info( 'debug :: alert_smtp - message sent OK to primary_recipient :: %s, cc_recipients :: %s' % (str(primary_recipient), str(cc_recipients))) except: logger.info(traceback.format_exc()) # logger.error('error :: alert_smtp - could not send email to %s' % str(recipient)) logger.error( 'error :: alert_smtp - could not send email to primary_recipient :: %s, cc_recipients :: %s' % (str(primary_recipient), str(cc_recipients))) s.quit() if LOCAL_DEBUG: logger.info('debug :: alert_smtp - Memory usage after email: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) if redis_image_data: # buf.seek(0) # buf.write('none') if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before del redis_image_data objects: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) del raw_series del unpacker del timeseries[:] del timeseries_x[:] del timeseries_y[:] del values del datetimes[:] del msg_plot_attachment del redis_image_data # We del all variables that are floats as they become unique objects and # can result in what appears to be a memory leak, but is not, it is # just the way Python handles floats del mean del array_amin del array_amax del stdDev del sigma3 if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after del redis_image_data objects: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before del fig object: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) # @added 20160814 - Bug #1558: Memory leak in Analyzer # Issue #21 Memory leak in Analyzer - https://github.com/earthgecko/skyline/issues/21 # As per http://www.mail-archive.com/[email protected]/msg13222.html fig.clf() plt.close(fig) del fig if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after del fig object: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage before del other objects: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) del recipients[:] del body del msg del image_data del msg_attachment if LOCAL_DEBUG: logger.info( 'debug :: alert_smtp - Memory usage after del other objects: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss) return
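# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of Skyline: the essence of the 7bit SMTP
# encoding workaround used in alert_smtp above. A 'mixed' multipart with a
# quoted-printable utf-8 charset soft-wraps long HTML lines (such as Graphite
# render URLs) so they stay under the 990 characters per line SMTP limit.
# All names and addresses below are hypothetical.
from email import charset
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText


def build_alert_message(sender, recipient, subject, body_html, more_body_html):
    # 'mixed' rather than 'alternative' - 'alternative' tells the client to
    # pick one part for display and ignore the rest, 'mixed' shows both parts
    msg = MIMEMultipart('mixed')
    cs_ = charset.Charset('utf-8')
    cs_.header_encoding = charset.QP
    # quoted-printable body encoding soft-wraps long lines
    cs_.body_encoding = charset.QP
    msg.set_charset(cs_)
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = recipient
    msg.attach(MIMEText(body_html, 'html'))
    msg.attach(MIMEText(more_body_html, 'html'))
    return msg
# ---------------------------------------------------------------------------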
def spin_process(self, i, unique_metrics): """ Assign a bunch of metrics for a process to analyze. Multi-get the assigned_metrics for the process from Redis. For each metric: - unpack the `raw_timeseries` for the metric. - Analyse each timeseries against `ALGORITHMS` to determine if it is anomalous. - If anomalous add it to the :obj:`self.anomalous_metrics` list - Add what algorithms triggered to the :obj:`self.anomaly_breakdown_q` queue - If :mod:`settings.ENABLE_CRUCIBLE` is ``True``: - Add a crucible data file with the details about the timeseries and anomaly. - Write the timeseries to a json file for crucible. Add keys and values to the queue so the parent process can collate for:\n * :py:obj:`self.anomaly_breakdown_q` * :py:obj:`self.exceptions_q` """ spin_start = time() logger.info('spin_process started') # Discover assigned metrics keys_per_processor = int(ceil(float(len(unique_metrics)) / float(settings.ANALYZER_PROCESSES))) if i == settings.ANALYZER_PROCESSES: assigned_max = len(unique_metrics) else: assigned_max = min(len(unique_metrics), i * keys_per_processor) # Fix analyzer worker metric assignment #94 # https://github.com/etsy/skyline/pull/94 @languitar:worker-fix assigned_min = (i - 1) * keys_per_processor assigned_keys = range(assigned_min, assigned_max) # Compile assigned metrics assigned_metrics = [unique_metrics[index] for index in assigned_keys] # Check if this process is unnecessary if len(assigned_metrics) == 0: return # Multi get series raw_assigned = self.redis_conn.mget(assigned_metrics) # Make process-specific dicts exceptions = defaultdict(int) anomaly_breakdown = defaultdict(int) # Distill timeseries strings into lists for i, metric_name in enumerate(assigned_metrics): self.check_if_parent_is_alive() try: raw_series = raw_assigned[i] unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name) # If it's anomalous, add it to list if anomalous: base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1) metric = [datapoint, base_name] self.anomalous_metrics.append(metric) # Get the anomaly breakdown - who returned True? triggered_algorithms = [] for index, value in enumerate(ensemble): if value: algorithm = settings.ALGORITHMS[index] anomaly_breakdown[algorithm] += 1 triggered_algorithms.append(algorithm) # If Crucible or Panorama are enabled determine details determine_anomaly_details = False if settings.ENABLE_CRUCIBLE and settings.ANALYZER_CRUCIBLE_ENABLED: determine_anomaly_details = True if settings.PANORAMA_ENABLED: determine_anomaly_details = True if determine_anomaly_details: metric_timestamp = str(int(timeseries[-1][0])) from_timestamp = str(int(timeseries[1][0])) timeseries_dir = base_name.replace('.', '/') # If Panorama is enabled - create a Panorama check if settings.PANORAMA_ENABLED: if not os.path.exists(settings.PANORAMA_CHECK_PATH): if python_version == 2: mode_arg = int('0755') if python_version == 3: mode_arg = 0o755 os.makedirs(settings.PANORAMA_CHECK_PATH, mode_arg) # Note: # The values are enclosed in single quotes intentionally # as the imp.load_source used results in a shift in the # decimal position when double quoted, e.g.
# value = "5622.0" gets imported as # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2 # single quoting results in the desired, # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0 added_at = str(int(time())) source = 'graphite' panaroma_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'app = \'%s\'\n' \ 'source = \'%s\'\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ % (base_name, str(datapoint), from_timestamp, metric_timestamp, str(settings.ALGORITHMS), triggered_algorithms, skyline_app, source, this_host, added_at) # Create an anomaly file with details about the anomaly panaroma_anomaly_file = '%s/%s.%s.txt' % ( settings.PANORAMA_CHECK_PATH, added_at, base_name) try: write_data_to_file( skyline_app, panaroma_anomaly_file, 'w', panaroma_anomaly_data) logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file)) except: logger.error('error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file)) logger.info(traceback.format_exc()) # If Crucible is enabled - save timeseries and create a # Crucible check if settings.ENABLE_CRUCIBLE and settings.ANALYZER_CRUCIBLE_ENABLED: crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp if not os.path.exists(crucible_anomaly_dir): if python_version == 2: mode_arg = int('0755') if python_version == 3: mode_arg = mode=0o755 os.makedirs(crucible_anomaly_dir, mode_arg) # Note: # The values are enclosed is single quoted intentionally # as the imp.load_source used in crucible results in a # shift in the decimal position when double quoted, e.g. # value = "5622.0" gets imported as # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2 # single quoting results in the desired, # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0 crucible_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'anomaly_dir = \'%s\'\n' \ 'graphite_metric = True\n' \ 'run_crucible_tests = False\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ % (base_name, str(datapoint), from_timestamp, metric_timestamp, str(settings.ALGORITHMS), triggered_algorithms, crucible_anomaly_dir, skyline_app, metric_timestamp) # Create an anomaly file with details about the anomaly crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, base_name) try: write_data_to_file( skyline_app, crucible_anomaly_file, 'w', crucible_anomaly_data) logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file)) except: logger.error('error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file)) logger.info(traceback.format_exc()) # Create timeseries json file with the timeseries json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name) timeseries_json = str(timeseries).replace('[', '(').replace(']', ')') try: write_data_to_file(skyline_app, json_file, 'w', timeseries_json) logger.info('added crucible timeseries file :: %s' % (json_file)) except: logger.error('error :: failed to add crucible timeseries file :: %s' % (json_file)) logger.info(traceback.format_exc()) # Create a crucible check file crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name) try: write_data_to_file( skyline_app, crucible_check_file, 'w', crucible_anomaly_data) logger.info('added crucible check 
:: %s,%s' % (base_name, metric_timestamp)) except: logger.error('error :: failed to add crucible check file :: %s' % (crucible_check_file)) logger.info(traceback.format_exc()) # It could have been deleted by the Roomba except TypeError: exceptions['DeletedByRoomba'] += 1 except TooShort: exceptions['TooShort'] += 1 except Stale: exceptions['Stale'] += 1 except Boring: exceptions['Boring'] += 1 except: exceptions['Other'] += 1 logger.info(traceback.format_exc()) # Add values to the queue so the parent process can collate for key, value in anomaly_breakdown.items(): self.anomaly_breakdown_q.put((key, value)) for key, value in exceptions.items(): self.exceptions_q.put((key, value)) spin_end = time() - spin_start logger.info('spin_process took %.2f seconds' % spin_end)
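# ---------------------------------------------------------------------------
# A minimal standalone sketch (not Skyline code) of the metric assignment
# arithmetic used in spin_process above. Processes are numbered from 1, so
# process i takes the slice
# [(i - 1) * keys_per_processor : min(len(unique_metrics), i * keys_per_processor)]
# and the last process (i == ANALYZER_PROCESSES) takes any remainder. The
# function and argument names below are illustrative only.
from math import ceil


def assigned_slice(i, unique_metrics, analyzer_processes):
    """Return the metrics assigned to process number i (1-indexed)."""
    keys_per_processor = int(ceil(float(len(unique_metrics)) / float(analyzer_processes)))
    if i == analyzer_processes:
        assigned_max = len(unique_metrics)
    else:
        assigned_max = min(len(unique_metrics), i * keys_per_processor)
    assigned_min = (i - 1) * keys_per_processor
    return unique_metrics[assigned_min:assigned_max]

# e.g. 10 metrics across 3 processes -> slices of 4, 4 and 2:
# assigned_slice(3, list(range(10)), 3) == [8, 9]
# ---------------------------------------------------------------------------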
def alert_slack(datapoint, metric_name, expiration_time, metric_trigger, algorithm): if not settings.SLACK_ENABLED: return False from slackclient import SlackClient metric = metric_name logger.info('alert_slack - anomalous metric :: metric: %s - %s' % (metric, algorithm)) base_name = metric alert_algo = str(algorithm) alert_context = alert_algo.upper() # The known_derivative_metric state is determined in case we need to surface # the png image from Graphite if the Ionosphere image is not available for # some reason. This will result in Skyline at least still sending an alert # to slack, even if some gear fails in Ionosphere or slack alerting is used # without Ionosphere enabled. Yes, not DRY, but multiprocessing and spawn # safe. known_derivative_metric = False # try: # if settings.REDIS_PASSWORD: # # @modified 20191022 - Bug #3266: py3 Redis binary objects not strings # # Branch #3262: py3 # # REDIS_ALERTER_CONN = redis.StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # REDIS_ALERTER_CONN = redis.StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True) # else: # # REDIS_ALERTER_CONN = redis.StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) # REDIS_ALERTER_CONN = redis.StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True) # except: # logger.error('error :: alert_slack - redis connection failed') # try: # derivative_metrics = list(REDIS_ALERTER_CONN.smembers('derivative_metrics')) # except: # derivative_metrics = [] redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(base_name)) # if redis_metric_name in derivative_metrics: # known_derivative_metric = True known_derivative_metric = is_derivative_metric(skyline_app, str(base_name)) # if known_derivative_metric: # try: # non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS # except: # non_derivative_monotonic_metrics = [] # skip_derivative = in_list(redis_metric_name, non_derivative_monotonic_metrics) # if skip_derivative: # known_derivative_metric = False # @added 20191008 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings try: main_alert_title = settings.CUSTOM_ALERT_OPTS['main_alert_title'] except: main_alert_title = 'Skyline' try: app_alert_context = settings.CUSTOM_ALERT_OPTS[ 'boundary_alert_heading'] except: app_alert_context = 'Boundary' if known_derivative_metric: # @modified 20191008 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings # unencoded_graph_title = 'Skyline Boundary - ALERT %s at %s hours - derivative graph - %s' % ( # alert_context, str(graphite_previous_hours), metric) # slack_title = '*Skyline Boundary - ALERT* %s on %s at %s hours - derivative graph - %s' % ( # alert_context, metric, str(graphite_previous_hours), datapoint) unencoded_graph_title = '%s %s - ALERT %s at %s hours - derivative graph - %s' % ( main_alert_title, app_alert_context, alert_context, str(graphite_previous_hours), metric) slack_title = '*%s %s - ALERT* %s on %s at %s hours - derivative graph - %s' % ( main_alert_title, app_alert_context, alert_context, metric, str(graphite_previous_hours), datapoint) else: # unencoded_graph_title = 'Skyline Boundary - ALERT %s at %s hours - %s' % ( # alert_context, str(graphite_previous_hours), metric) # slack_title = '*Skyline Boundary - ALERT* %s on %s at %s hours - %s' % ( # alert_context, metric, str(graphite_previous_hours), datapoint) unencoded_graph_title = '%s %s - ALERT %s at %s hours - %s' % ( main_alert_title,
app_alert_context, alert_context, str(graphite_previous_hours), metric) slack_title = '*%s %s - ALERT* %s on %s at %s hours - %s' % ( main_alert_title, app_alert_context, alert_context, metric, str(graphite_previous_hours), datapoint) graph_title_string = quote(unencoded_graph_title, safe='') graph_title = '&title=%s' % graph_title_string until_timestamp = int(time()) target_seconds = int((graphite_previous_hours * 60) * 60) from_timestamp = str(until_timestamp - target_seconds) graphite_from = dt.datetime.fromtimestamp( int(from_timestamp)).strftime('%H:%M_%Y%m%d') logger.info('graphite_from - %s' % str(graphite_from)) graphite_until = dt.datetime.fromtimestamp( int(until_timestamp)).strftime('%H:%M_%Y%m%d') logger.info('graphite_until - %s' % str(graphite_until)) # @added 20181025 - Feature #2618: alert_slack # Added date and time info so you do not have to mouseover the slack # message to determine the time at which the alert came in timezone = strftime("%Z", gmtime()) # @modified 20181029 - Feature #2618: alert_slack # Use the standard UNIX data format # human_anomaly_time = dt.datetime.fromtimestamp(int(until_timestamp)).strftime('%Y-%m-%d %H:%M:%S') human_anomaly_time = dt.datetime.fromtimestamp( int(until_timestamp)).strftime('%c') slack_time_string = '%s %s' % (human_anomaly_time, timezone) # @added 20191106 - Branch #3262: py3 # Branch #3002: docker graphite_port = get_graphite_port(skyline_app) graphite_render_uri = get_graphite_render_uri(skyline_app) graphite_custom_headers = get_graphite_custom_headers(skyline_app) if settings.GRAPHITE_PORT != '': if known_derivative_metric: # @modified 20190520 - Branch #3002: docker # Use GRAPHITE_RENDER_URI # link = '%s://%s:%s/render/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % ( # settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, # settings.GRAPHITE_PORT, str(graphite_from), str(graphite_until), # metric, settings.GRAPHITE_GRAPH_SETTINGS, graph_title) # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle # link = '%s://%s:%s/%s/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % ( link = '%s://%s:%s/%s/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s),%%27si%%27)%s%s&colorList=orange' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, settings.GRAPHITE_PORT, settings.GRAPHITE_RENDER_URI, str(graphite_from), str(graphite_until), metric, settings.GRAPHITE_GRAPH_SETTINGS, graph_title) else: # @modified 20190520 - Branch #3002: docker # Use GRAPHITE_RENDER_URI # link = '%s://%s:%s/render/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % ( # settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, # settings.GRAPHITE_PORT, str(graphite_from), str(graphite_until), # metric, settings.GRAPHITE_GRAPH_SETTINGS, graph_title) # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle # link = '%s://%s:%s/%s/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % ( link = '%s://%s:%s/%s/?from=%s&until=%s&target=cactiStyle(%s,%%27si%%27)%s%s&colorList=orange' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, settings.GRAPHITE_PORT, settings.GRAPHITE_RENDER_URI, str(graphite_from), str(graphite_until), metric, settings.GRAPHITE_GRAPH_SETTINGS, graph_title) else: if known_derivative_metric: # @modified 20190520 - Branch #3002: docker # Use GRAPHITE_RENDER_URI # link = '%s://%s/render/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % ( # 
settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, # str(graphite_from), str(graphite_until), metric, # settings.GRAPHITE_GRAPH_SETTINGS, graph_title) # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle # link = '%s://%s/%s/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % ( link = '%s://%s/%s/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s),%%27si%%27)%s%s&colorList=orange' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, settings.GRAPHITE_RENDER_URI, str(graphite_from), str(graphite_until), metric, settings.GRAPHITE_GRAPH_SETTINGS, graph_title) else: # @modified 20190520 - Branch #3002: docker # Use GRAPHITE_RENDER_URI # link = '%s://%s/render/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % ( # settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, # str(graphite_from), str(graphite_until), metric, # settings.GRAPHITE_GRAPH_SETTINGS, graph_title) # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle # link = '%s://%s/%s/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % ( link = '%s://%s/%s/?from=%s&until=%s&target=cactiStyle(%s,%%27si%%27)%s%s&colorList=orange' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, settings.GRAPHITE_RENDER_URI, str(graphite_from), str(graphite_until), metric, settings.GRAPHITE_GRAPH_SETTINGS, graph_title) # slack does not allow embedded images, nor will it fetch links behind # authentication so Skyline uploads a png graphite image with the message image_file = None # Fetch the png from Graphite # @modified 20191021 - Task #3290: Handle urllib2 in py3 # Branch #3262: py3 image_file = '%s/%s.%s.graphite.%sh.png' % ( settings.SKYLINE_TMP_DIR, base_name, skyline_app, str(int(graphite_previous_hours))) if python_version == 22: try: # image_data = urllib2.urlopen(link).read() # nosec image_data = None # except urllib2.URLError: except: logger.error(traceback.format_exc()) logger.error('error :: alert_slack - failed to get image graph') logger.error('error :: alert_slack - %s' % str(link)) image_data = None if python_version == 33: try: image_file = '%s/%s.%s.graphite.%sh.png' % ( settings.SKYLINE_TMP_DIR, base_name, skyline_app, str(int(graphite_previous_hours))) # urllib.request.urlretrieve(link, image_file) image_data = 'retrieved' image_data = None except: try: # @added 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle image_data = None original_traceback = traceback.format_exc() original_link = link if 'cactiStyle' in link: metric_replace = '%s,%%27si%%27' % metric link = link.replace(metric, metric_replace) logger.info( 'link replaced with cactiStyle system parameter added - %s' % str(link)) urllib.request.urlretrieve(link, image_file) image_data = 'retrieved' except: new_traceback = traceback.format_exc() logger.error(original_traceback) logger.error( 'error :: boundary_alerters :: alert_slack :: failed to urlopen %s' % str(original_link)) logger.error(new_traceback) logger.error( 'error :: boundary_alerters :: alert_slack :: failed to urlopen with system parameter added %s' % str(link)) image_data = None # @added 20191025 - image_data = get_graphite_graph_image(skyline_app, link, image_file) if image_data == 'disabled_for_testing': image_file = '%s/%s.%s.graphite.%sh.png' % ( settings.SKYLINE_TMP_DIR, base_name, skyline_app, str(int(graphite_previous_hours))) if image_data != 'retrieved': try: write_data_to_file(skyline_app, image_file, 'w', image_data)
logger.info('alert_slack - added Graphite image :: %s' % (image_file)) except: logger.info(traceback.format_exc()) logger.error( 'error :: alert_slack - failed to add %s Graphite image' % (image_file)) image_file = None try: filename = os.path.basename(image_file) except: filename = None try: bot_user_oauth_access_token = settings.BOUNDARY_SLACK_OPTS[ 'bot_user_oauth_access_token'] except: logger.error( 'error :: alert_slack - could not determine bot_user_oauth_access_token' ) return False # Allow for absolute path metric namespaces but also allow for and match # wildcard namespaces if there is not an absolute path metric namespace channels = 'unknown' notify_channels = [] matched_channels = [] try: channels = settings.BOUNDARY_SLACK_OPTS['channels'][metric_name] notify_channels.append(channels) except: for channel in settings.BOUNDARY_SLACK_OPTS['channels']: CHECK_MATCH_PATTERN = channel check_match_pattern = re.compile(CHECK_MATCH_PATTERN) pattern_match = check_match_pattern.match(metric_name) if pattern_match: matched_channels.append(channel) if matched_channels != []: for i_metric_name in matched_channels: channels = settings.BOUNDARY_SLACK_OPTS['channels'][i_metric_name] notify_channels.append(channels) if not notify_channels: logger.error('error :: alert_slack - could not determine channel') return False else: channels = notify_channels try: icon_emoji = settings.BOUNDARY_SLACK_OPTS['icon_emoji'] except: icon_emoji = ':chart_with_upwards_trend:' try: sc = SlackClient(bot_user_oauth_access_token) except: logger.info(traceback.format_exc()) logger.error('error :: alert_slack - could not initiate SlackClient') return False for channel in channels: initial_comment = slack_title + ' :: <' + link + '|graphite image link>\nFor anomaly at ' + slack_time_string try: # slack does not allow embedded images, nor links behind authentication # or color text, so we have to jump through all the API hoops to end up # having to upload an image with a very basic message. if os.path.isfile(image_file): slack_file_upload = sc.api_call( 'files.upload', filename=filename, channels=channel, initial_comment=initial_comment, file=open(image_file, 'rb')) if not slack_file_upload['ok']: logger.error( 'error :: alert_slack - failed to send slack message with file upload' ) logger.error( 'error :: alert_slack - slack_file_upload - %s' % str(slack_file_upload)) try: os.remove(image_file) except OSError: logger.error('error - failed to remove %s, continuing' % image_file) pass else: send_text = initial_comment + ' :: error :: there was no graph image to upload' send_message = sc.api_call('chat.postMessage', channel=channel, icon_emoji=icon_emoji, text=send_text) if not send_message['ok']: logger.error( 'error :: alert_slack - failed to send slack message') else: logger.info('alert_slack - sent slack message') except: logger.info(traceback.format_exc()) logger.error('error :: alert_slack - could not upload file') return False
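# ---------------------------------------------------------------------------
# A small standalone sketch of the channel resolution logic in alert_slack
# above: an exact metric name lookup in the channels config first, then a
# fallback that treats each configured key as a regex pattern matched against
# the metric name. The function name and example config are hypothetical.
import re


def resolve_channels(metric_name, channels_config):
    """Return the list of channel entries to notify for metric_name."""
    try:
        # Absolute path metric namespace - exact key match
        return [channels_config[metric_name]]
    except KeyError:
        pass
    # Wildcard namespace - each config key is treated as a regex pattern
    notify_channels = []
    for pattern in channels_config:
        if re.compile(pattern).match(metric_name):
            notify_channels.append(channels_config[pattern])
    return notify_channels

# resolve_channels('stats.web01.cpu', {'stats\\..*\\.cpu': ['#alerts']})
# -> [['#alerts']]
# ---------------------------------------------------------------------------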
def create_features_profile(current_skyline_app, requested_timestamp, data_for_metric, context, ionosphere_job, fp_parent_id, fp_generation, fp_learn): """ Add a features_profile to the Skyline ionosphere database table. :param current_skyline_app: Skyline app name :param requested_timestamp: The timestamp of the dir that the features profile data is in :param data_for_metric: The base_name of the metric :param context: The context of the caller :param ionosphere_job: The ionosphere_job name related to the creation request, valid jobs are ``learn_fp_human``, ``learn_fp_generation``, ``learn_fp_learnt`` and ``learn_fp_automatic``. :param fp_parent_id: The id of the parent features profile that this was learnt from, 0 being an original human generated features profile :param fp_generation: The number of generations away from the original human generated features profile, 0 being an original human generated features profile. :param fp_learn: Whether Ionosphere should learn at use_full_duration_days :type current_skyline_app: str :type requested_timestamp: int :type data_for_metric: str :type context: str :type ionosphere_job: str :type fp_parent_id: int :type fp_generation: int :type fp_learn: boolean :return: fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc :rtype: str, boolean, boolean, str, str """ current_skyline_app_logger = current_skyline_app + 'Log' current_logger = logging.getLogger(current_skyline_app_logger) base_name = data_for_metric.replace(settings.FULL_NAMESPACE, '', 1) if context == 'training_data': log_context = 'training data' ionosphere_learn_job = 'learn_fp_human' if context == 'features_profiles': log_context = 'features profile data' # @added 20170113 - Feature #1854: Ionosphere learn if context == 'ionosphere_learn': log_context = 'learn' current_logger.info('create_features_profile :: %s :: requested for %s at %s' % ( context, str(base_name), str(requested_timestamp))) metric_timeseries_dir = base_name.replace('.', '/') if context == 'training_data': metric_training_data_dir = '%s/%s/%s' % ( settings.IONOSPHERE_DATA_FOLDER, str(requested_timestamp), metric_timeseries_dir) if context == 'features_profiles': metric_training_data_dir = '%s/%s/%s' % ( settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir, str(requested_timestamp)) # @added 20170113 - Feature #1854: Ionosphere learn if context == 'ionosphere_learn': # @modified 20170116 - Feature #1854: Ionosphere learn # Allowing ionosphere_learn to create a features profile for a training # data set that it has learnt is not anomalous if ionosphere_job != 'learn_fp_automatic': metric_training_data_dir = '%s/%s/%s' % ( settings.IONOSPHERE_LEARN_FOLDER, str(requested_timestamp), metric_timeseries_dir) else: metric_training_data_dir = '%s/%s/%s' % ( settings.IONOSPHERE_DATA_FOLDER, str(requested_timestamp), metric_timeseries_dir) features_file = '%s/%s.tsfresh.input.csv.features.transposed.csv' % ( metric_training_data_dir, base_name) features_profile_dir = '%s/%s' % ( settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir) ts_features_profile_dir = '%s/%s/%s' % ( settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir, str(requested_timestamp)) features_profile_created_file = '%s/%s.%s.fp.created.txt' % ( metric_training_data_dir, str(requested_timestamp), base_name) features_profile_details_file = '%s/%s.%s.fp.details.txt' % ( metric_training_data_dir, str(requested_timestamp), base_name) anomaly_check_file = '%s/%s.txt' % (metric_training_data_dir, base_name) trace = 'none' fail_msg = 'none'
new_fp_id = False calculated_with_tsfresh = False calculated_time = False fcount = None fsum = None # @added 20170104 - Feature #1842: Ionosphere - Graphite now graphs # Added the ts_full_duration parameter so that the appropriate graphs can be # embedded for the user in the training data page ts_full_duration = '0' if context == 'ionosphere_learn': if not path.isfile(features_profile_details_file): current_logger.error('error :: create_features_profile :: no features_profile_details_file - %s' % features_profile_details_file) return 'none', False, False, fail_msg, trace if path.isfile(features_profile_details_file): current_logger.info('create_features_profile :: getting features profile details from - %s' % features_profile_details_file) # Read the details file with open(features_profile_details_file, 'r') as f: fp_details_str = f.read() fp_details = literal_eval(fp_details_str) calculated_with_tsfresh = fp_details[1] calculated_time = str(fp_details[2]) fcount = str(fp_details[3]) fsum = str(fp_details[4]) try: ts_full_duration = str(fp_details[5]) except: current_logger.error('error :: create_features_profile :: could not determine the full duration from - %s' % features_profile_details_file) ts_full_duration = '0' if context != 'ionosphere_learn': if ts_full_duration == '0': if path.isfile(anomaly_check_file): current_logger.info('create_features_profile :: determining the full duration from anomaly_check_file - %s' % anomaly_check_file) # Read the details file with open(anomaly_check_file, 'r') as f: anomaly_details = f.readlines() for i, line in enumerate(anomaly_details): if 'full_duration' in line: _ts_full_duration = '%s' % str(line).split("'", 2) full_duration_array = literal_eval(_ts_full_duration) ts_full_duration = str(int(full_duration_array[1])) current_logger.info('create_features_profile :: determined the full duration as - %s' % str(ts_full_duration)) if path.isfile(features_profile_created_file): # Read the created file with open(features_profile_created_file, 'r') as f: fp_created_str = f.read() fp_created = literal_eval(fp_created_str) new_fp_id = fp_created[0] return str(new_fp_id), True, True, fail_msg, trace # Have data if path.isfile(features_file): current_logger.info('create_features_profile :: features_file exists: %s' % features_file) else: trace = traceback.format_exc() current_logger.error(trace) fail_msg = 'error :: create_features_profile :: features_file does not exist: %s' % features_file current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # Raise to webapp I believe to provide traceback to user in UI raise else: return False, False, False, fail_msg, trace features_data = [] with open(features_file, 'rb') as fr: reader = csv.reader(fr, delimiter=',') for i, line in enumerate(reader): feature_name_item = False fname_id = False f_value = False feature_name = str(line[0]) feature_name_item = list(filter( lambda x: x[1] == feature_name, TSFRESH_FEATURES)) if feature_name_item: feature_name_id = feature_name_item[0] if feature_name_item: feature_name_list = feature_name_item[0] fname_id = int(feature_name_list[0]) f_value = str(line[1]) if fname_id and f_value: features_data.append([fname_id, f_value]) # @added 20170113 - Feature #1854: Ionosphere learn - generations # Set the learn generations variables with the IONOSPHERE_LEARN_DEFAULT_ and any # settings.IONOSPHERE_LEARN_NAMESPACE_CONFIG values. These will later be # overridden by any database values determined for the specific metric if # they exist.
# Set defaults use_full_duration_days = int(settings.IONOSPHERE_LEARN_DEFAULT_FULL_DURATION_DAYS) valid_learning_duration = int(settings.IONOSPHERE_LEARN_DEFAULT_VALID_TIMESERIES_OLDER_THAN_SECONDS) max_generations = int(settings.IONOSPHERE_LEARN_DEFAULT_MAX_GENERATIONS) max_percent_diff_from_origin = float(settings.IONOSPHERE_LEARN_DEFAULT_MAX_PERCENT_DIFF_FROM_ORIGIN) # Default learn_full_duration_days so it is always defined for the logging # below, even if get_ionosphere_learn_details fails learn_full_duration_days = use_full_duration_days try: use_full_duration, valid_learning_duration, use_full_duration_days, max_generations, max_percent_diff_from_origin = get_ionosphere_learn_details(current_skyline_app, base_name) learn_full_duration_days = use_full_duration_days except: current_logger.error(traceback.format_exc()) current_logger.error('error :: create_features_profile :: failed to get_ionosphere_learn_details') current_logger.info('create_features_profile :: learn_full_duration_days :: %s days' % (str(learn_full_duration_days))) current_logger.info('create_features_profile :: valid_learning_duration :: %s seconds' % (str(valid_learning_duration))) current_logger.info('create_features_profile :: max_generations :: %s' % (str(max_generations))) current_logger.info('create_features_profile :: max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin))) current_logger.info('create_features_profile :: getting MySQL engine') try: engine, fail_msg, trace = fp_create_get_an_engine(current_skyline_app) current_logger.info(fail_msg) except: trace = traceback.format_exc() current_logger.error(trace) fail_msg = 'error :: create_features_profile :: could not get a MySQL engine' current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # Raise to webapp I believe to provide traceback to user in UI raise else: return False, False, False, fail_msg, trace if not engine: trace = 'none' fail_msg = 'error :: create_features_profile :: engine not obtained' current_logger.error(fail_msg) if context == 'training' or context == 'features_profile': # Raise to webapp I believe to provide traceback to user in UI raise else: return False, False, False, fail_msg, trace # Get metric details from the database metrics_id = False # Use the learn details as per config metric_learn_full_duration_days = int(use_full_duration_days) metric_learn_valid_ts_older_than = int(valid_learning_duration) metric_max_generations = int(max_generations) metric_max_percent_diff_from_origin = int(max_percent_diff_from_origin) metrics_table = None try: metrics_table, fail_msg, trace = metrics_table_meta(current_skyline_app, engine) current_logger.info(fail_msg) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to get metrics_table meta for %s' % base_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: disposing of any engine') fp_create_engine_disposal(current_skyline_app, engine) return False, False, False, fail_msg, trace current_logger.info('create_features_profile :: metrics_table OK') metric_db_object = None try: connection = engine.connect() # @modified 20161209 - Branch #922: ionosphere # Task #1658: Patterning Skyline Ionosphere # result = connection.execute('select id from metrics where metric=\'%s\'' % base_name) # for row in result: # while not
metrics_id: # metrics_id = row['id'] stmt = select([metrics_table]).where(metrics_table.c.metric == base_name) result = connection.execute(stmt) for row in result: metrics_id = row['id'] # @added 20170113 - Feature #1854: Ionosphere learn - generations # Added Ionosphere LEARN generation related variables try: metric_learn_full_duration_days = int(row['learn_full_duration_days']) metric_learn_valid_ts_older_than = int(row['learn_valid_ts_older_than']) metric_max_generations = int(row['max_generations']) metric_max_percent_diff_from_origin = float(row['max_percent_diff_from_origin']) except: current_logger.error('error :: create_features_profile :: failed to determine learn related values from DB for %s' % base_name) row = result.fetchone() # metric_db_object = row connection.close() current_logger.info('create_features_profile :: determined db metric id: %s' % str(metrics_id)) current_logger.info('create_features_profile :: determined db metric learn_full_duration_days: %s' % str(metric_learn_full_duration_days)) current_logger.info('create_features_profile :: determined db metric learn_valid_ts_older_than: %s' % str(metric_learn_valid_ts_older_than)) current_logger.info('create_features_profile :: determined db metric max_generations: %s' % str(metric_max_generations)) current_logger.info('create_features_profile :: determined db metric max_percent_diff_from_origin: %s' % str(metric_max_percent_diff_from_origin)) except: trace = traceback.format_exc() current_logger.error(trace) fail_msg = 'error :: create_features_profile :: could not determine id of metric from DB: %s' % base_name current_logger.error('%s' % fail_msg) if metric_learn_full_duration_days: learn_full_duration_days = metric_learn_full_duration_days # learn_full_duration = int(learn_full_duration_days) * 86400 if metric_learn_valid_ts_older_than: learn_valid_ts_older_than = metric_learn_valid_ts_older_than if metric_max_generations: max_generations = metric_max_generations if metric_max_percent_diff_from_origin: max_percent_diff_from_origin = metric_max_percent_diff_from_origin current_logger.info('create_features_profile :: generation info - learn_full_duration_days :: %s' % (str(learn_full_duration_days))) current_logger.info('create_features_profile :: generation info - learn_valid_ts_older_than :: %s' % (str(learn_valid_ts_older_than))) current_logger.info('create_features_profile :: generation info - max_generations :: %s' % (str(max_generations))) current_logger.info('create_features_profile :: generation info - max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin))) # @added 20170120 - Feature #1854: Ionosphere learn # Always use the timestamp from the anomaly file use_anomaly_timestamp = int(requested_timestamp) if context == 'ionosphere_learn': if path.isfile(anomaly_check_file): current_logger.info('create_features_profile :: determining the full duration from anomaly_check_file - %s' % anomaly_check_file) # Read the details file with open(anomaly_check_file, 'r') as f: anomaly_details = f.readlines() for i, line in enumerate(anomaly_details): if 'metric_timestamp' in line: _metric_timestamp = '%s' % str(line).split("'", 2) metric_timestamp_array = literal_eval(_metric_timestamp) use_anomaly_timestamp = (int(metric_timestamp_array[1])) current_logger.info('create_features_profile :: determined the anomaly metric_timestamp as - %s' % str(use_anomaly_timestamp)) ionosphere_table = None try: ionosphere_table, fail_msg, trace = ionosphere_table_meta(current_skyline_app, engine) 
current_logger.info(fail_msg) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to get ionosphere_table meta for %s' % base_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # Raise to webbapp I believe to provide traceback to user in UI # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) raise else: current_logger.info('create_features_profile :: disposing of any engine') fp_create_engine_disposal(current_skyline_app, engine) return False, False, False, fail_msg, trace current_logger.info('create_features_profile :: ionosphere_table OK') # @added 20170403 - Feature #2000: Ionosphere - validated # Set all learn_fp_human features profiles to validated. fp_validated = 0 if ionosphere_job == 'learn_fp_human': fp_validated = 1 # @added 20170424 - Feature #2000: Ionosphere - validated # Set all generation 0 and 1 as validated if int(fp_generation) <= 1: fp_validated = 1 new_fp_id = False try: connection = engine.connect() # @added 20170113 - Feature #1854: Ionosphere learn # Added learn values parent_id, generation # @modified 20170120 - Feature #1854: Ionosphere learn # Added anomaly_timestamp # @modified 20170403 - Feature #2000: Ionosphere - validated ins = ionosphere_table.insert().values( metric_id=int(metrics_id), full_duration=int(ts_full_duration), anomaly_timestamp=int(use_anomaly_timestamp), enabled=1, tsfresh_version=str(tsfresh_version), calc_time=calculated_time, features_count=fcount, features_sum=fsum, parent_id=fp_parent_id, generation=fp_generation, validated=fp_validated) result = connection.execute(ins) connection.close() new_fp_id = result.inserted_primary_key[0] current_logger.info('create_features_profile :: new ionosphere fp_id: %s' % str(new_fp_id)) except: trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: failed to insert a new record into the ionosphere table for %s' % base_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webbapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: disposing of any engine') fp_create_engine_disposal(current_skyline_app, engine) return False, False, False, fail_msg, trace if not RepresentsInt(new_fp_id): trace = traceback.format_exc() current_logger.error('%s' % trace) fail_msg = 'error :: create_features_profile :: unknown new ionosphere new_fp_id for %s' % base_name current_logger.error('%s' % fail_msg) if context == 'training' or context == 'features_profile': # @added 20170806 - Bug #2130: MySQL - Aborted_clients # Added missing disposal if engine: fp_create_engine_disposal(current_skyline_app, engine) # Raise to webbapp I believe to provide traceback to user in UI raise else: current_logger.info('create_features_profile :: disposing of any engine') fp_create_engine_disposal(current_skyline_app, engine) return False, False, False, fail_msg, trace # Create z_fp_<metric_id> table fp_table_created = False fp_table_name = 'z_fp_%s' % str(metrics_id) try: fp_meta = MetaData() # @modified 20161222 - Task #1812: z_fp table type # Changed to InnoDB from MyISAM as no files open issues and MyISAM clean 
        # up, there can be LOTS of file_per_table z_fp_ tables/files without
        # the MyISAM issues. z_fp_ tables are mostly read and will be shuffled
        # in the table cache as required.
        fp_metric_table = Table(
            fp_table_name, fp_meta,
            Column('id', Integer, primary_key=True),
            Column('fp_id', Integer, nullable=False, key='fp_id'),
            Column('feature_id', Integer, nullable=False),
            Column('value', DOUBLE(), nullable=True),
            mysql_charset='utf8',
            mysql_key_block_size='255',
            mysql_engine='InnoDB')
        fp_metric_table.create(engine, checkfirst=True)
        fp_table_created = True
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to create table - %s' % fp_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide a traceback to the user in the UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    if not fp_table_created:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to determine True for create table - %s' % fp_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide a traceback to the user in the UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    # Insert features and values
    insert_statement = []
    for fname_id, f_value in features_data:
        insert_statement.append({'fp_id': new_fp_id, 'feature_id': fname_id, 'value': f_value})
    if insert_statement == []:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: empty insert_statement for %s inserts' % fp_table_name
        current_logger.error('%s' % fail_msg)
        # raise
    # else:
    #     feature_count = sum(1 for x in a if isinstance(x, insert_statement))
    #     current_logger.info(
    #         'fp_id - %s - %s feature values in insert_statement for %s ' %
    #         (str(feature_count), str(new_fp_id), fp_table_name))
    # feature_count = sum(1 for x in a if isinstance(x, insert_statement))
    # current_logger.info(
    #     'fp_id - %s - feature values in insert_statement for %s ' %
    #     (str(new_fp_id), fp_table_name))

    try:
        connection = engine.connect()
        connection.execute(fp_metric_table.insert(), insert_statement)
        connection.close()
        current_logger.info('create_features_profile :: fp_id - %s - feature values inserted into %s' % (str(new_fp_id), fp_table_name))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to insert feature values into %s' % fp_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide a traceback to the user in the UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    # Create metric ts table if not exists - z_ts_<metric_id>
    # @modified 20170121 - Feature #1854: Ionosphere learn - generations
    # TODO Adding the option to not save timeseries to DB, as default?
    # ts_table_created = False
    ts_table_name = 'z_ts_%s' % str(metrics_id)
    try:
        ts_meta = MetaData()
        # @modified 20161222 - Task #1812: z_fp table type
        # Changed to InnoDB from MyISAM as no files open issues and MyISAM clean
        # up, there can be LOTS of file_per_table z_fp_ tables/files without
        # the MyISAM issues. z_fp_ tables are mostly read and will be shuffled
        # in the table cache as required.
        ts_metric_table = Table(
            ts_table_name, ts_meta,
            Column('id', Integer, primary_key=True),
            Column('fp_id', Integer, nullable=False, key='fp_id'),
            Column('timestamp', Integer, nullable=False),
            Column('value', DOUBLE(), nullable=True),
            mysql_charset='utf8',
            mysql_key_block_size='255',
            mysql_engine='InnoDB')
        ts_metric_table.create(engine, checkfirst=True)
        # ts_table_created = True
        current_logger.info('create_features_profile :: metric ts table created OK - %s' % (ts_table_name))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to create table - %s' % ts_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide a traceback to the user in the UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    # Insert timeseries that the features profile was created from
    raw_timeseries = []
    anomaly_json = '%s/%s.json' % (metric_training_data_dir, base_name)
    if path.isfile(anomaly_json):
        current_logger.info('create_features_profile :: metric anomaly json found OK - %s' % (anomaly_json))
        try:
            # Read the timeseries json file
            with open(anomaly_json, 'r') as f:
                raw_timeseries = f.read()
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            fail_msg = 'error :: create_features_profile :: failed to read timeseries data from %s' % anomaly_json
            current_logger.error('%s' % (fail_msg))
            fail_msg = 'error: failed to read timeseries data from %s' % anomaly_json
            # end = timer()
            if context == 'training' or context == 'features_profile':
                # @added 20170806 - Bug #2130: MySQL - Aborted_clients
                # Added missing disposal
                if engine:
                    fp_create_engine_disposal(current_skyline_app, engine)
                # Raise to webapp to provide a traceback to the user in the UI
                raise
    else:
        trace = 'none'
        fail_msg = 'error: file not found - %s' % (anomaly_json)
        current_logger.error(fail_msg)
        # raise

    # Convert the timeseries string to a list
    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
    timeseries = literal_eval(timeseries_array_str)
    datapoints = timeseries
    validated_timeseries = []
    for datapoint in datapoints:
        try:
            new_datapoint = [str(int(datapoint[0])), float(datapoint[1])]
            validated_timeseries.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    insert_statement = []
    for ts, value in validated_timeseries:
        insert_statement.append({'fp_id': new_fp_id, 'timestamp': ts, 'value': value})
    try:
        connection = engine.connect()
        connection.execute(ts_metric_table.insert(), insert_statement)
        connection.close()
        current_logger.info('create_features_profile :: fp_id - %s - timeseries inserted into %s' % (str(new_fp_id), ts_table_name))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to insert the timeseries into %s' % ts_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    # Create a created features profile file
    try:
        # data = '[%s, %s, ]' % (new_fp_id, str(int(time.time())))
        # write_data_to_file(skyline_app, features_profile_created_file, 'w', data)
        # @modified 20170115 - Feature #1854: Ionosphere learn - generations
        # Added parent_id and generation
        data = '[%s, %s, \'%s\', %s, %s, %s, %s, %s, %s]' % (
            new_fp_id, str(int(time.time())), str(tsfresh_version),
            str(calculated_time), str(fcount), str(fsum),
            str(ts_full_duration), str(fp_parent_id), str(fp_generation))
        write_data_to_file(current_skyline_app, features_profile_created_file, 'w', data)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to write fp.created file'
        current_logger.error('%s' % fail_msg)

    # Set ionosphere_enabled for the metric
    try:
        # update_statement = 'UPDATE metrics SET ionosphere_enabled=1 WHERE id=%s' % str(metrics_id)
        connection = engine.connect()
        # result = connection.execute('UPDATE metrics SET ionosphere_enabled=1 WHERE id=%s' % str(metrics_id))
        # connection.execute(ts_metric_table.insert(), insert_statement)
        connection.execute(
            metrics_table.update(
                metrics_table.c.id == metrics_id).values(ionosphere_enabled=1))
        connection.close()
        current_logger.info('create_features_profile :: ionosphere_enabled set on metric id: %s' % str(metrics_id))
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error :: create_features_profile :: could not update metrics table and set ionosphere_enabled on id %s' % str(metrics_id)
        current_logger.error('%s' % fail_msg)
        # raise

    # Copy data from training data dir to features_profiles dir
    if not path.isdir(ts_features_profile_dir):
        mkdir_p(ts_features_profile_dir)

    if path.isdir(ts_features_profile_dir):
        current_logger.info('create_features_profile :: fp_id - %s - features profile dir created - %s' % (str(new_fp_id), ts_features_profile_dir))
        # src_files = os.listdir(src)
        # for file_name in src_files:
        #     full_file_name = path.join(src, file_name)
        #     if (path.isfile(full_file_name)):
        #         shutil.copy(full_file_name, dest)
        data_files = []
        try:
            glob_path = '%s/*.*' % metric_training_data_dir
            data_files = glob.glob(glob_path)
        except:
            trace = traceback.format_exc()
            current_logger.error('%s' % trace)
            current_logger.error('error :: create_features_profile :: glob - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))

        for i_file in data_files:
            try:
                shutil.copy(i_file, ts_features_profile_dir)
                current_logger.info('create_features_profile :: fp_id - %s - training data copied - %s' % (str(new_fp_id), i_file))
            except shutil.Error as e:
                trace = traceback.format_exc()
                current_logger.error('%s' % trace)
                current_logger.error('error :: create_features_profile :: shutil error - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))
                current_logger.error('error :: create_features_profile :: %s' % (e))
            # Any error saying that the directory doesn't exist
            except OSError as e:
                trace = traceback.format_exc()
                current_logger.error('%s' % trace)
                current_logger.error('error :: create_features_profile :: OSError error - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))
                current_logger.error('error :: create_features_profile :: %s' % (e))
        current_logger.info('create_features_profile :: fp_id - %s - training data copied to %s' % (str(new_fp_id), ts_features_profile_dir))
    else:
        current_logger.error('error :: create_features_profile :: fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))

    current_logger.info('create_features_profile :: disposing of any engine')
    try:
        if engine:
            fp_create_engine_disposal(current_skyline_app, engine)
        else:
            current_logger.info('create_features_profile :: no engine to dispose of')
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        current_logger.error('error :: create_features_profile :: fp_id - %s - failed to dispose of the engine' % (str(new_fp_id)))

    # @added 20170113 - Feature #1854: Ionosphere learn - Redis ionosphere.learn.work namespace
    # Ionosphere learn needs Redis work sets
    # When a features profile is created there needs to be work added to a Redis
    # set. When a human makes a features profile, we want Ionosphere to make a
    # use_full_duration_days features profile valid_learning_duration (e.g.
    # 3361) later.
    if settings.IONOSPHERE_LEARN and new_fp_id:
        create_redis_work_item = False
        if context == 'training_data' and ionosphere_job == 'learn_fp_human':
            create_redis_work_item = True
            # @modified 20170120 - Feature #1854: Ionosphere learn - generations
            # Added fp_learn parameter to allow the user to not learn the
            # use_full_duration_days
            if not fp_learn:
                create_redis_work_item = False
                current_logger.info('fp_learn is False, not adding an item to the Redis ionosphere.learn.work set')
        if ionosphere_job == 'learn_fp_automatic':
            create_redis_work_item = True
        # @added 20170131 - Feature #1886 Ionosphere learn - child like parent with evolutionary maturity
        # TODO: here a check may be required to evaluate whether the origin_fp_id
        # had a use_full_duration features profile created, however
        # due to the fact that it is in learn, suggests that it did
        # have, not 100% sure.
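        # Note: an editorial sketch, not part of the Skyline codebase. It
        # illustrates one possible implementation of the COUNT check the
        # TODO comments around this block describe, assuming a SQLAlchemy
        # 1.x style engine as used elsewhere in this module and the
        # ionosphere table named in the TODO query. It is not wired into
        # the flow.
        def example_child_fp_count(engine, origin_fp_id, use_full_duration):
            from sqlalchemy import text
            count_query = text(
                'SELECT COUNT(id) FROM ionosphere '
                'WHERE parent_id = :parent_id AND full_duration = :full_duration')
            connection = engine.connect()
            result = connection.execute(
                count_query, parent_id=origin_fp_id,
                full_duration=use_full_duration)
            row = result.fetchone()
            connection.close()
            return int(row[0])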
        origin_fp_id_was_allowed_to_learn = False
        child_use_full_duration_count_of_origin_fp_id = 1
        # TODO: Determine the state
        # child_use_full_duration_count_of_origin_fp_id = SELECT COUNT(id) FROM ionosphere WHERE parent_id=origin_fp_id AND full_duration=use_full_duration
        if child_use_full_duration_count_of_origin_fp_id == 0:
            current_logger.info('the origin parent was not allowed to learn, not adding to the Redis ionosphere.learn.work set')
            create_redis_work_item = False

        if create_redis_work_item:
            try:
                current_logger.info(
                    'adding work to Redis ionosphere.learn.work set - [\'Soft\', \'%s\', %s, \'%s\', %s, %s] to make a learn features profile later' % (
                        str(ionosphere_job), str(requested_timestamp), base_name,
                        str(new_fp_id), str(fp_generation)))
                redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                # A Redis set member must be a string or bytes, so the work
                # item list is added as its str representation
                redis_conn.sadd('ionosphere.learn.work', str(['Soft', str(ionosphere_job), int(requested_timestamp), base_name, int(new_fp_id), int(fp_generation)]))
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: failed adding work to Redis ionosphere.learn.work set - [\'Soft\', \'%s\', %s, \'%s\', %s, %s] to make a learn features profile later' % (
                        str(ionosphere_job), str(requested_timestamp), base_name,
                        str(new_fp_id), str(fp_generation)))

    # @added 20170806 - Bug #2130: MySQL - Aborted_clients
    # Added missing disposal
    if engine:
        fp_create_engine_disposal(current_skyline_app, engine)

    return str(new_fp_id), True, False, fail_msg, trace
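# Note: an editorial sketch, not part of the Skyline codebase. It is a
# minimal, self-contained illustration of the bulk insert pattern
# create_features_profile uses for the z_fp_<metric_id> and
# z_ts_<metric_id> tables: SQLAlchemy executes table.insert() with a
# list of dicts as a single executemany, which is why insert_statement
# is built as a list of dicts rather than inserted row by row. The
# engine URL, table name and values are hypothetical, a generic Float
# stands in for the MySQL DOUBLE, and SQLAlchemy 1.x style execution is
# assumed, as in the surrounding code.
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, Float


def example_bulk_insert():
    engine = create_engine('sqlite://')  # hypothetical in-memory engine
    meta = MetaData()
    example_table = Table(
        'z_fp_example', meta,
        Column('id', Integer, primary_key=True),
        Column('fp_id', Integer, nullable=False),
        Column('feature_id', Integer, nullable=False),
        Column('value', Float, nullable=True))
    example_table.create(engine, checkfirst=True)
    # One dict per row, executed as a single executemany
    insert_statement = [
        {'fp_id': 1, 'feature_id': fid, 'value': float(fid) * 0.1}
        for fid in range(1, 4)]
    connection = engine.connect()
    connection.execute(example_table.insert(), insert_statement)
    connection.close()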
def spin_process(self, i, run_timestamp): """ Assign a metric for a process to analyze. """ # Discover metric to analyze metric_var_files = [f for f in listdir(settings.MIRAGE_CHECK_PATH) if isfile(join(settings.MIRAGE_CHECK_PATH, f))] # Check if this process is unnecessary if len(metric_var_files) == 0: return metric_var_files_sorted = sorted(metric_var_files) metric_check_file = '%s/%s' % ( settings.MIRAGE_CHECK_PATH, str(metric_var_files_sorted[0])) # Load metric variables self.load_metric_vars(metric_check_file) # Test metric variables if len(metric_vars.metric) == 0: return else: metric = metric_vars.metric metric_name = ['metric_name', metric_vars.metric] self.metric_variables.append(metric_name) if len(metric_vars.value) == 0: return else: metric_value = ['metric_value', metric_vars.value] self.metric_variables.append(metric_value) if len(metric_vars.hours_to_resolve) == 0: return else: hours_to_resolve = ['hours_to_resolve', metric_vars.hours_to_resolve] self.metric_variables.append(hours_to_resolve) if len(metric_vars.metric_timestamp) == 0: return else: metric_timestamp = ['metric_timestamp', metric_vars.metric_timestamp] self.metric_variables.append(metric_timestamp) # Ignore any metric check with a timestamp greater than 10 minutes ago int_metric_timestamp = int(metric_vars.metric_timestamp) int_run_timestamp = int(run_timestamp) metric_timestamp_age = int_run_timestamp - int_metric_timestamp if metric_timestamp_age > settings.MIRAGE_STALE_SECONDS: logger.info('stale check :: %s check request is %s seconds old - discarding' % (metric_vars.metric, metric_timestamp_age)) # Remove metric check file # try: # os.remove(metric_check_file) # except OSError: # pass # return if os.path.exists(metric_check_file): os.remove(metric_check_file) logger.info('removed %s' % (metric_check_file)) else: logger.info('could not remove %s' % (metric_check_file)) # Calculate hours second order resolution to seconds second_order_resolution_seconds = int(metric_vars.hours_to_resolve) * 3600 # Calculate graphite from and until parameters from the metric timestamp graphite_until = datetime.datetime.fromtimestamp(int(metric_vars.metric_timestamp)).strftime('%H:%M_%Y%m%d') int_second_order_resolution_seconds = int(second_order_resolution_seconds) second_resolution_timestamp = int_metric_timestamp - int_second_order_resolution_seconds graphite_from = datetime.datetime.fromtimestamp(int(second_resolution_timestamp)).strftime('%H:%M_%Y%m%d') # Remove any old json file related to the metric metric_json_file = '%s/%s/%s.json' % ( settings.MIRAGE_DATA_FOLDER, str(metric_vars.metric), str(metric_vars.metric)) try: os.remove(metric_json_file) except OSError: pass # Get data from graphite logger.info( 'retrieve data :: surfacing %s timeseries from graphite for %s seconds' % ( metric_vars.metric, second_order_resolution_seconds)) self.surface_graphite_metric_data(metric_vars.metric, graphite_from, graphite_until) # Check there is a json timeseries file to test if not os.path.isfile(metric_json_file): logger.error( 'error :: retrieve failed - failed to surface %s timeseries from graphite' % ( metric_vars.metric)) # Remove metric check file try: os.remove(metric_check_file) except OSError: pass return else: logger.info('retrieved data :: for %s at %s seconds' % ( metric_vars.metric, second_order_resolution_seconds)) # Make process-specific dicts exceptions = defaultdict(int) anomaly_breakdown = defaultdict(int) self.check_if_parent_is_alive() with open((metric_json_file), 'r') as f: timeseries = json.loads(f.read()) 
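    # Note: an editorial sketch, not part of the Skyline codebase. It
    # restates the stale check performed earlier in this function as a
    # standalone helper: a check older than settings.MIRAGE_STALE_SECONDS
    # relative to the run timestamp is discarded rather than analysed.
    # The values in the usage comment are hypothetical.
    def example_check_is_stale(run_timestamp, metric_timestamp, stale_seconds):
        metric_timestamp_age = int(run_timestamp) - int(metric_timestamp)
        return metric_timestamp_age > stale_seconds

    # example_check_is_stale(1583235000, 1583234400, 120) returns True and
    # such a check file would be removed and the check discarded.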
logger.info('data points surfaced :: %s' % (len(timeseries))) try: logger.info('analyzing :: %s at %s seconds' % (metric_vars.metric, second_order_resolution_seconds)) anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_vars.metric, second_order_resolution_seconds) # If it's anomalous, add it to list if anomalous: base_name = metric.replace(settings.FULL_NAMESPACE, '', 1) anomalous_metric = [datapoint, base_name] self.anomalous_metrics.append(anomalous_metric) logger.info('anomaly detected :: %s with %s' % (metric_vars.metric, metric_vars.value)) # It runs so fast, this allows us to process 30 anomalies/min sleep(2) # Get the anomaly breakdown - who returned True? triggered_algorithms = [] for index, value in enumerate(ensemble): if value: algorithm = settings.MIRAGE_ALGORITHMS[index] anomaly_breakdown[algorithm] += 1 triggered_algorithms.append(algorithm) # If Crucible or Panorama are enabled determine details determine_anomaly_details = False if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED: determine_anomaly_details = True if settings.PANORAMA_ENABLED: determine_anomaly_details = True if determine_anomaly_details: metric_timestamp = str(int(timeseries[-1][0])) from_timestamp = str(int(timeseries[1][0])) timeseries_dir = base_name.replace('.', '/') # If Panorama is enabled - create a Panorama check if settings.PANORAMA_ENABLED: if not os.path.exists(settings.PANORAMA_CHECK_PATH): if python_version == 2: mode_arg = int('0755') if python_version == 3: mode_arg = mode=0o755 os.makedirs(settings.PANORAMA_CHECK_PATH, mode_arg) # Note: # The values are enclosed is single quoted intentionally # as the imp.load_source used results in a shift in the # decimal position when double quoted, e.g. # value = "5622.0" gets imported as # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2 # single quoting results in the desired, # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0 added_at = str(int(time())) source = 'graphite' panaroma_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'app = \'%s\'\n' \ 'source = \'%s\'\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ % (base_name, str(datapoint), from_timestamp, metric_timestamp, str(settings.MIRAGE_ALGORITHMS), triggered_algorithms, skyline_app, source, this_host, added_at) # Create an anomaly file with details about the anomaly panaroma_anomaly_file = '%s/%s.%s.txt' % ( settings.PANORAMA_CHECK_PATH, added_at, base_name) try: write_data_to_file( skyline_app, panaroma_anomaly_file, 'w', panaroma_anomaly_data) logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file)) except: logger.error('error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file)) logger.info(traceback.format_exc()) # If crucible is enabled - save timeseries and create a # crucible check if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED: metric_timestamp = str(int(timeseries[-1][0])) from_timestamp = str(int(timeseries[1][0])) timeseries_dir = base_name.replace('.', '/') crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp if not os.path.exists(crucible_anomaly_dir): if python_version == 2: mode_arg = int('0755') if python_version == 3: mode_arg = mode=0o755 os.makedirs(crucible_anomaly_dir, mode_arg) # Note: # The value is enclosed is single quoted intentionally # as the imp.load_source used in 
crucible results in a # shift in the decimal position when double quoted, e.g. # value = "5622.0" gets imported as # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2 # single quoting results in the desired, # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0 crucible_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'anomaly_dir = \'%s\'\n' \ 'graphite_metric = True\n' \ 'run_crucible_tests = False\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ % (base_name, str(datapoint), from_timestamp, metric_timestamp, str(settings.MIRAGE_ALGORITHMS), triggered_algorithms, crucible_anomaly_dir, skyline_app, metric_timestamp) # Create an anomaly file with details about the anomaly crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, base_name) try: write_data_to_file( skyline_app, crucible_anomaly_file, 'w', crucible_anomaly_data) logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file)) except: logger.error('error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file)) logger.info(traceback.format_exc()) # Create timeseries json file with the timeseries json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name) timeseries_json = str(timeseries).replace('[', '(').replace(']', ')') try: write_data_to_file(skyline_app, json_file, 'w', timeseries_json) logger.info('added crucible timeseries file :: %s' % (json_file)) except: logger.error('error :: failed to add crucible timeseries file :: %s' % (json_file)) logger.info(traceback.format_exc()) # Create a crucible check file crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name) try: write_data_to_file( skyline_app, crucible_check_file, 'w', crucible_anomaly_data) logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp)) except: logger.error('error :: failed to add crucible check file :: %s' % (crucible_check_file)) logger.info(traceback.format_exc()) else: base_name = metric.replace(settings.FULL_NAMESPACE, '', 1) not_anomalous_metric = [datapoint, base_name] self.not_anomalous_metrics.append(not_anomalous_metric) logger.info('not anomalous :: %s with %s' % (metric_vars.metric, metric_vars.value)) # It could have been deleted by the Roomba except TypeError: exceptions['DeletedByRoomba'] += 1 logger.info('exceptions :: DeletedByRoomba') except TooShort: exceptions['TooShort'] += 1 logger.info('exceptions :: TooShort') except Stale: exceptions['Stale'] += 1 logger.info('exceptions :: Stale') except Boring: exceptions['Boring'] += 1 logger.info('exceptions :: Boring') except: exceptions['Other'] += 1 logger.info('exceptions :: Other') logger.info(traceback.format_exc()) # Add values to the queue so the parent process can collate for key, value in anomaly_breakdown.items(): self.mirage_anomaly_breakdown_q.put((key, value)) for key, value in exceptions.items(): self.mirage_exceptions_q.put((key, value)) # Remove metric check file try: os.remove(metric_check_file) except OSError: pass
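# Note: an editorial sketch, not part of the Skyline codebase. It shows
# how the Graphite from and until parameters above are derived: the
# window is hours_to_resolve * 3600 seconds ending at the metric
# timestamp, rendered in Graphite's HH:MM_YYYYMMDD time format. The
# argument values in the usage comment are hypothetical.
import datetime


def example_graphite_window(metric_timestamp, hours_to_resolve):
    second_order_resolution_seconds = int(hours_to_resolve) * 3600
    graphite_until = datetime.datetime.fromtimestamp(
        int(metric_timestamp)).strftime('%H:%M_%Y%m%d')
    second_resolution_timestamp = int(metric_timestamp) - second_order_resolution_seconds
    graphite_from = datetime.datetime.fromtimestamp(
        second_resolution_timestamp).strftime('%H:%M_%Y%m%d')
    return graphite_from, graphite_until

# example_graphite_window(1583234400, 168) returns the from and until
# strings for the 7 day (168 hour) window ending at the metric timestamp.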
def submit_crucible_job(from_timestamp, until_timestamp, metrics_list, namespaces_list, source, alert_interval, user_id, user, add_to_panorama, pad_timeseries, training_data_json, run_algorithms): """ Get a list of all the metrics passed and generate Crucible check files for each :param from_timestamp: the timestamp at which to start the time series :param until_timestamp: the timestamp at which to end the time series :param metrics_list: a list of metric names to analyse :param namespaces_list: a list of metric namespaces to analyse :param source: the source webapp making the request :param alert_interval: the alert_interval at which Crucible should trigger anomalies :param user_id: the user id of the user making the request :param user: the username making the request :param add_to_panorama: whether Crucible should add Skyline CONSENSUS anomalies to Panorama :param pad_timeseries: the amount of data to pad the time series with :param training_data_json: the full path to the training_data json file if source is training_data :param run_algorithms: list of algorithms to run :type from_timestamp: int :type until_timestamp: int :type metrics_list: list :type namespaces_list: list :type source: str :type alert_interval: int :type user_id: int :type user: str :type add_to_panorama: boolean :type pad_timeseries: str :type training_data_json: str :type run_algorithms: list :return: tuple of lists :rtype: (list, list, list, list) Returns (crucible_job_id, metrics_submitted_to_process, fail_msg, trace) """ fail_msg = None trace = None crucible_job_id = None metrics_submitted_to_process = 0 # Generate a job id based on the YMDHMS.user_id and a job directory try: jobid_timestamp = int(time()) jobid_datetimestamp = dt.datetime.fromtimestamp( jobid_timestamp).strftime('%Y%m%d%H%M%S') crucible_job_id = '%s.%s' % (str(jobid_datetimestamp), str(user_id)) except: logger.error(traceback.format_exc()) logger.error('error :: failed to determine a crucible_job_id') raise # to webapp to return in the UI # Generate a job id based on the YMDHMS.user_id and a job directory try: crucible_path = os.path.dirname(settings.CRUCIBLE_DATA_FOLDER) crucible_job_dir = '%s/jobs/%s' % (crucible_path, crucible_job_id) if not path.exists(crucible_job_dir): logger.info('creating crucible job directory - %s' % (str(crucible_job_dir))) mkdir_p(crucible_job_dir) except: trace = traceback.format_exc() fail_msg = 'error :: failed to create the crucible job directory' logger.error(trace) logger.error(fail_msg) raise # to webapp to return in the UI # TODO added checks of metric names metric_names = [] if metrics_list: logger.info('submit_crucible_job :: %s metrics passed' % str(len(metrics_list))) for metric in metrics_list: metric_names.append(metric) # TODO added checks of metric namespaces, harder to do, but so that the UI # errors to the usr rather than sending a bad or non-existent metric to # Crucible if namespaces_list: logger.info('submit_crucible_job :: %s namespaces passed' % str(len(namespaces_list))) logger.info( 'submit_crucible_job :: determine metrics for submit_crucible_job between %s and %s' % (str(from_timestamp), str(until_timestamp))) logger.info('getting MySQL engine') try: engine, fail_msg, trace = get_an_engine() logger.info(fail_msg) except: trace = traceback.format_exc() logger.error(trace) logger.error('%s' % fail_msg) logger.error( 'error :: could not get a MySQL engine to get metric names') raise # to webapp to return in the UI if not engine: trace = 'none' fail_msg = 'error :: engine not obtained' 
logger.error(fail_msg) raise try: metrics_table, log_msg, trace = metrics_table_meta( skyline_app, engine) logger.info(log_msg) logger.info('metrics_table OK') except: logger.error(traceback.format_exc()) logger.error('error :: failed to get metrics_table meta') if engine: engine_disposal(engine) raise # to webapp to return in the UI metrics_like_query = text( """SELECT metric FROM metrics WHERE metric LIKE :like_string""") for namespace in namespaces_list: try: connection = engine.connect() results = connection.execute(metrics_like_query, like_string=str(namespace)) connection.close() for row in results: metric_name = str(row[0]) metric_names.append(metric_name) except: trace = traceback.format_exc() logger.error(trace) logger.error( 'error :: could not determine metrics from metrics table') if engine: engine_disposal(engine) raise logger.info( 'submit_crucible_job :: %s metrics determined from passed namespaces' % str(len(metric_names))) logger.info('submit_crucible_job :: %s metrics to process' % str(len(metric_names))) metrics_submitted_to_process = [] datapoint = 0 triggered_algorithms = [ 'histogram_bins', 'first_hour_average', 'stddev_from_average', 'grubbs', 'ks_test', 'mean_subtraction_cumulation', 'median_absolute_deviation', 'stddev_from_moving_average', 'least_squares' ] added_at = int(time()) for base_name in metric_names: sane_metricname = filesafe_metricname(str(base_name)) derivative_metric = is_derivative_metric(skyline_app, base_name) if derivative_metric: target = 'nonNegativeDerivative(%s)' % base_name else: target = base_name # Generate a metric job directory crucible_anomaly_dir = '%s/%s' % (crucible_job_dir, sane_metricname) try: if not path.exists(crucible_anomaly_dir): logger.info('creating crucible metric job directory - %s' % (str(crucible_anomaly_dir))) mkdir_p(crucible_anomaly_dir) except: trace = traceback.format_exc() fail_msg = 'error :: failed to create the crucible metric job directory' logger.error(trace) logger.error(fail_msg) raise # to webapp to return in the UI if source == 'graphite': graphite_metric = True else: graphite_metric = False # @added 20200422 - Feature #3500: webapp - crucible_process_metrics # Feature #1448: Crucible web UI # In order for metrics to be analysed in Crucible like the Analyzer or # Mirage analysis, the time series data needs to be padded # Added pad_timeseries graphite_override_uri_parameters = 'from=%s&until=%s&target=%s' % ( str(from_timestamp), str(until_timestamp), target) timeseries_full_duration = int(until_timestamp) - int(from_timestamp) pad_timeseries_with = 0 if pad_timeseries == 'auto': if timeseries_full_duration > 3600: pad_timeseries_with = 3600 if timeseries_full_duration > 86400: pad_timeseries_with = 86400 if pad_timeseries == '86400': pad_timeseries_with = 86400 if pad_timeseries == '604800': pad_timeseries_with = 604800 if pad_timeseries == '0': pad_timeseries_with = 0 if pad_timeseries_with: try: padded_from_timestamp = int( from_timestamp) - pad_timeseries_with graphite_override_uri_parameters = 'from=%s&until=%s&target=%s' % ( str(padded_from_timestamp), str(until_timestamp), target) logger.info('padding time series with %s seconds - %s' % (str(pad_timeseries_with), str(graphite_override_uri_parameters))) except: logger.error(traceback.format_exc()) logger.error( 'error :: failed to construct graphite_override_uri_parameters with pad_timeseries_with %s' % str(pad_timeseries_with)) # @added 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms # Allow the user to pass algorithms to 
run algorithms = settings.ALGORITHMS if run_algorithms: algorithms = run_algorithms # @modified 20200421 - Feature #3500: webapp - crucible_process_metrics # Feature #1448: Crucible web UI # Added add_to_panorama # @added 20200607 - Feature #3630: webapp - crucible_process_training_data # Added training_data_json crucible_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'anomaly_dir = \'%s\'\n' \ 'graphite_metric = %s\n' \ 'run_crucible_tests = True\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ 'graphite_override_uri_parameters = \'%s\'\n' \ 'alert_interval = \'%s\'\n' \ 'add_to_panorama = %s\n' \ 'training_data_json = %s\n' \ % (base_name, str(datapoint), str(from_timestamp), # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms # str(until_timestamp), str(settings.ALGORITHMS), str(until_timestamp), str(algorithms), triggered_algorithms, crucible_anomaly_dir, str(graphite_metric), skyline_app, str(added_at), str(graphite_override_uri_parameters), str(alert_interval), str(add_to_panorama), str(training_data_json)) # Create an anomaly file with details about the anomaly crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, sane_metricname) try: write_data_to_file(skyline_app, crucible_anomaly_file, 'w', crucible_anomaly_data) logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file)) except: logger.error(traceback.format_exc()) logger.error('error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file)) # Create a crucible check file crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, str(added_at), sane_metricname) try: write_data_to_file(skyline_app, crucible_check_file, 'w', crucible_anomaly_data) logger.info('added crucible check :: %s,%s' % (base_name, str(added_at))) metrics_submitted_to_process.append(base_name) except: logger.error(traceback.format_exc()) logger.error('error :: failed to add crucible check file :: %s' % (crucible_check_file)) return (crucible_job_id, metrics_submitted_to_process, fail_msg, trace)
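# Note: an editorial sketch, not part of the Skyline codebase. It
# extracts the pad_timeseries handling above into a standalone helper
# for clarity: 'auto' pads by an hour for windows longer than an hour
# and by a day for windows longer than a day, while the string values
# mirror the webapp form options.
def example_pad_seconds(pad_timeseries, from_timestamp, until_timestamp):
    timeseries_full_duration = int(until_timestamp) - int(from_timestamp)
    pad_timeseries_with = 0
    if pad_timeseries == 'auto':
        if timeseries_full_duration > 3600:
            pad_timeseries_with = 3600
        if timeseries_full_duration > 86400:
            pad_timeseries_with = 86400
    if pad_timeseries == '86400':
        pad_timeseries_with = 86400
    if pad_timeseries == '604800':
        pad_timeseries_with = 604800
    return pad_timeseries_with

# example_pad_seconds('auto', 1582629600, 1583234400) returns 86400 for
# a 7 day window, so the surfaced data starts a day before from_timestamp.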
def send_crucible_job_metric_to_panorama(crucible_job_id, base_name, user_id, user, skyline_consensus_anomalies): """ Send the Crucible Skyline CONSENSUS anomalies for a crucible_job and metric to Panorama to insert into the anomalies database. :param crucible_job_id: the crucible_job_id :param base_name: the metric name :param user_id: the user_id :param user: the username :param skyline_consensus_anomalies: the Crucible Skyline CONSENSUS anomalies :type crucible_job_id: str :type base_name: str :type user_id: int :type user: str :type skyline_consensus_anomalies: list :return: tuple of lists :rtype: (int, str, str) Returns (len(skyline_consensus_anomalies), fail_msg, trace) """ fail_msg = None trace = None added_at = int(time()) crucible_path = os.path.dirname(settings.CRUCIBLE_DATA_FOLDER) jobs_data_dir = '%s/jobs' % crucible_path data_dir = '%s/%s/%s' % (jobs_data_dir, crucible_job_id, base_name) crucible_job_details_filename = '%s.txt' % base_name crucible_job_details_file = '%s/%s' % (data_dir, crucible_job_details_filename) crucible_job_details = [] try: logger.info( 'send_crucible_job_metric_to_panorama :: getting crucible_job_details from file - %s' % (crucible_job_details_file)) with open(crucible_job_details_file) as f: for line in f: no_new_line = line.replace('\n', '') no_equal_line = no_new_line.replace(' = ', ',') array = str(no_equal_line.split(',', 1)) add_line = literal_eval(array) crucible_job_details.append(add_line) except: trace = traceback.format_exc() fail_msg = 'error :: send_crucible_job_metric_to_panorama - failed to get crucible_job_details from file - %s' % crucible_job_details_file logger.error(trace) logger.error(fail_msg) raise # to webapp to return in the UI try: timestamp_str = str(crucible_job_details[2][1]) new_timestamp_str = timestamp_str.replace("'", "") from_timestamp = int(new_timestamp_str) except: trace = traceback.format_exc() fail_msg = 'error :: send_crucible_job_metric_to_panorama - failed to determine from_timestamp from get crucible_job_details' logger.error(trace) logger.error(fail_msg) raise # to webapp to return in the UI label = 'Crucible job %s' % str(crucible_job_id) sane_metricname = filesafe_metricname(str(base_name)) # skyline_consensus_anomalies format # [timestamp, value, anomaly_score, triggered_algorithms] # [skyline_anomaly[0], skyline_anomaly[1], skyline_anomaly[2], skyline_anomaly[3]] # [1583234400, 44.39999999990687, 2, ['histogram_bins', 'median_absolute_deviation']], for timestamp, datapoint, anomaly_score, triggered_algorithms in skyline_consensus_anomalies: # To allow multiple Panorama anomaly files to added quickly just # increment the added_at by 1 seconds so that all the files have a # unique name added_at += 1 # Note: # The values are enclosed is single quoted intentionally # as the imp.load_source used results in a shift in the # decimal position when double quoted, e.g. 
# value = "5622.0" gets imported as # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2 # single quoting results in the desired, # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0 source = 'graphite' panaroma_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'app = \'%s\'\n' \ 'source = \'%s\'\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ 'label = \'%s\'\n' \ 'user_id = \'%s\'\n' \ % (base_name, str(datapoint), from_timestamp, str(timestamp), str(settings.ALGORITHMS), triggered_algorithms, skyline_app, source, this_host, str(added_at), label, str(user_id)) # Create an anomaly file with details about the anomaly panaroma_anomaly_file = '%s/%s.%s.txt' % (settings.PANORAMA_CHECK_PATH, added_at, sane_metricname) try: write_data_to_file(skyline_app, panaroma_anomaly_file, 'w', panaroma_anomaly_data) logger.info( 'send_crucible_job_metric_to_panorama - added panorama anomaly file :: %s' % (panaroma_anomaly_file)) except: logger.error(traceback.format_exc()) logger.error( 'error :: send_crucible_job_metric_to_panorama - failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file)) crucible_job_sent_to_panorama_file = '%s/%s.%s.%s.sent_to_panorama.txt' % ( data_dir, str(added_at), crucible_job_id, base_name) panorama_done_data = [added_at, int(user_id), skyline_consensus_anomalies] try: write_data_to_file(skyline_app, crucible_job_sent_to_panorama_file, 'w', str(panorama_done_data)) logger.info( 'send_crucible_job_metric_to_panorama - added set to panorama crucible job file :: %s' % (crucible_job_sent_to_panorama_file)) logger.info( 'send_crucible_job_metric_to_panorama - with contents :: %s' % (panorama_done_data)) except: logger.error(traceback.format_exc()) logger.error( 'error :: send_crucible_job_metric_to_panorama - failed to add panorama crucible job file :: %s' % (crucible_job_sent_to_panorama_file)) return (len(skyline_consensus_anomalies), fail_msg, trace)
    def spin_process(self, i, boundary_metrics):
        """
        Assign a bunch of metrics for a process to analyze.
        """
        # Determine assigned metrics
        bp = settings.BOUNDARY_PROCESSES
        bm_range = len(boundary_metrics)
        keys_per_processor = int(ceil(float(bm_range) / float(bp)))
        if i == settings.BOUNDARY_PROCESSES:
            assigned_max = len(boundary_metrics)
        else:
            # This was a skyline bug: the original skyline code used 1 as the
            # beginning position of the index, but python indices begin with 0
            # assigned_max = len(boundary_metrics)
            # This closes the etsy/skyline pull request opened by @languitar on 17 Jun 2014
            # https://github.com/etsy/skyline/pull/94 Fix analyzer worker metric assignment
            assigned_max = min(len(boundary_metrics), i * keys_per_processor)
        assigned_min = (i - 1) * keys_per_processor
        assigned_keys = range(assigned_min, assigned_max)

        # Compile assigned metrics
        assigned_metrics_and_algos = [boundary_metrics[index] for index in assigned_keys]
        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: printing assigned_metrics_and_algos')
            for assigned_metric_and_algo in assigned_metrics_and_algos:
                logger.info('debug :: assigned_metric_and_algo - %s' % str(assigned_metric_and_algo))

        # Compile assigned metrics
        assigned_metrics = []
        for metric_and_algo in assigned_metrics_and_algos:
            assigned_metrics.append(metric_and_algo[0])

        # unique unhashed things
        def unique_noHash(seq):
            seen = set()
            return [x for x in seq if str(x) not in seen and not seen.add(str(x))]

        unique_assigned_metrics = unique_noHash(assigned_metrics)

        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: unique_assigned_metrics - %s' % str(unique_assigned_metrics))
            logger.info('debug :: printing unique_assigned_metrics:')
            for unique_assigned_metric in unique_assigned_metrics:
                logger.info('debug :: unique_assigned_metric - %s' % str(unique_assigned_metric))

        # Check if this process is unnecessary
        if len(unique_assigned_metrics) == 0:
            return

        # Multi get series
        try:
            raw_assigned = self.redis_conn.mget(unique_assigned_metrics)
        except:
            logger.error('error :: failed to mget assigned_metrics from redis')
            return

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Reset boundary_algorithms
        all_boundary_algorithms = []
        for metric in BOUNDARY_METRICS:
            all_boundary_algorithms.append(metric[1])

        # The unique algorithms that are being used
        boundary_algorithms = unique_noHash(all_boundary_algorithms)
        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: boundary_algorithms - %s' % str(boundary_algorithms))

        discover_run_metrics = []

        # Distill metrics into a run list
        for i, metric_name in enumerate(unique_assigned_metrics):
            self.check_if_parent_is_alive()
            try:
                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('debug :: unpacking timeseries for %s - %s' % (metric_name, str(i)))
                raw_series = raw_assigned[i]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
            except Exception as e:
                exceptions['Other'] += 1
                logger.error('error :: redis data error: ' + traceback.format_exc())
                logger.error('error :: %s' % e)

            base_name = metric_name.replace(FULL_NAMESPACE, '', 1)

            # Determine the metrics BOUNDARY_METRICS metric tuple settings
            for metrick in BOUNDARY_METRICS:
                CHECK_MATCH_PATTERN = metrick[0]
                check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                pattern_match = check_match_pattern.match(base_name)
                metric_pattern_matched = False
                if pattern_match:
                    metric_pattern_matched = True
                    algo_pattern_matched = False
                    for algo in boundary_algorithms:
                        for metric in BOUNDARY_METRICS:
                            CHECK_MATCH_PATTERN = metric[0]
                            check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
pattern_match = check_match_pattern.match(base_name) if pattern_match: if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: metric and algo pattern MATCHED - " + metric[0] + " | " + base_name + " | " + str(metric[1])) metric_expiration_time = False metric_min_average = False metric_min_average_seconds = False metric_trigger = False algorithm = False algo_pattern_matched = True algorithm = metric[1] try: if metric[2]: metric_expiration_time = metric[2] except: metric_expiration_time = False try: if metric[3]: metric_min_average = metric[3] except: metric_min_average = False try: if metric[4]: metric_min_average_seconds = metric[4] except: metric_min_average_seconds = 1200 try: if metric[5]: metric_trigger = metric[5] except: metric_trigger = False try: if metric[6]: alert_threshold = metric[6] except: alert_threshold = False try: if metric[7]: metric_alerters = metric[7] except: metric_alerters = False if metric_pattern_matched and algo_pattern_matched: if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: added metric - %s, %s, %s, %s, %s, %s, %s, %s, %s' % (str(i), metric_name, str(metric_expiration_time), str(metric_min_average), str(metric_min_average_seconds), str(metric_trigger), str(alert_threshold), metric_alerters, algorithm)) discover_run_metrics.append([i, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm]) if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: printing discover_run_metrics') for discover_run_metric in discover_run_metrics: logger.info('debug :: discover_run_metrics - %s' % str(discover_run_metric)) logger.info('debug :: build unique boundary metrics to analyze') # Determine the unique set of metrics to run run_metrics = unique_noHash(discover_run_metrics) if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: printing run_metrics') for run_metric in run_metrics: logger.info('debug :: run_metrics - %s' % str(run_metric)) # Distill timeseries strings and submit to run_selected_algorithm for metric_and_algo in run_metrics: self.check_if_parent_is_alive() try: raw_assigned_id = metric_and_algo[0] metric_name = metric_and_algo[1] base_name = metric_name.replace(FULL_NAMESPACE, '', 1) metric_expiration_time = metric_and_algo[2] metric_min_average = metric_and_algo[3] metric_min_average_seconds = metric_and_algo[4] metric_trigger = metric_and_algo[5] alert_threshold = metric_and_algo[6] metric_alerters = metric_and_algo[7] algorithm = metric_and_algo[8] if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: unpacking timeseries for %s - %s' % (metric_name, str(raw_assigned_id))) raw_series = raw_assigned[metric_and_algo[0]] unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: unpacked OK - %s - %s' % (metric_name, str(raw_assigned_id))) autoaggregate = False autoaggregate_value = 0 # Determine if the namespace is to be aggregated if BOUNDARY_AUTOAGGRERATION: for autoaggregate_metric in BOUNDARY_AUTOAGGRERATION_METRICS: autoaggregate = False autoaggregate_value = 0 CHECK_MATCH_PATTERN = autoaggregate_metric[0] base_name = metric_name.replace(FULL_NAMESPACE, '', 1) check_match_pattern = re.compile(CHECK_MATCH_PATTERN) pattern_match = check_match_pattern.match(base_name) if pattern_match: autoaggregate = True autoaggregate_value = autoaggregate_metric[1] if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: BOUNDARY_AUTOAGGRERATION passed - %s - %s' % (metric_name, str(autoaggregate))) if ENABLE_BOUNDARY_DEBUG: logger.info( 'debug 
:: analysing - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s' % ( metric_name, str(metric_expiration_time), str(metric_min_average), str(metric_min_average_seconds), str(metric_trigger), str(alert_threshold), metric_alerters, autoaggregate, autoaggregate_value, algorithm) ) # Dump the the timeseries data to a file timeseries_dump_dir = "/tmp/skyline/boundary/" + algorithm self.mkdir_p(timeseries_dump_dir) timeseries_dump_file = timeseries_dump_dir + "/" + metric_name + ".json" with open(timeseries_dump_file, 'w+') as f: f.write(str(timeseries)) f.close() # Check if a metric has its own unique BOUNDARY_METRICS alert # tuple, this allows us to paint an entire metric namespace with # the same brush AND paint a unique metric or namespace with a # different brush or scapel has_unique_tuple = False run_tupple = False boundary_metric_tuple = (base_name, algorithm, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters) wildcard_namespace = True for metric_tuple in BOUNDARY_METRICS: if not has_unique_tuple: CHECK_MATCH_PATTERN = metric_tuple[0] check_match_pattern = re.compile(CHECK_MATCH_PATTERN) pattern_match = check_match_pattern.match(base_name) if pattern_match: if metric_tuple[0] == base_name: wildcard_namespace = False if not has_unique_tuple: if boundary_metric_tuple == metric_tuple: has_unique_tuple = True run_tupple = True if ENABLE_BOUNDARY_DEBUG: logger.info('unique_tuple:') logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple)) logger.info('metric_tuple: %s' % str(metric_tuple)) if not has_unique_tuple: if wildcard_namespace: if ENABLE_BOUNDARY_DEBUG: logger.info('wildcard_namespace:') logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple)) run_tupple = True else: if ENABLE_BOUNDARY_DEBUG: logger.info('wildcard_namespace: BUT WOULD NOT RUN') logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple)) if ENABLE_BOUNDARY_DEBUG: logger.info('WOULD RUN run_selected_algorithm = %s' % run_tupple) if run_tupple: # Submit the timeseries and settings to run_selected_algorithm anomalous, ensemble, datapoint, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm = run_selected_algorithm( timeseries, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, autoaggregate, autoaggregate_value, algorithm ) if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: analysed - %s' % (metric_name)) else: anomalous = False if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: more unique metric tuple not analysed - %s' % (metric_name)) # If it's anomalous, add it to list if anomalous: anomalous_metric = [datapoint, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm] self.anomalous_metrics.append(anomalous_metric) # Get the anomaly breakdown - who returned True? 
                    triggered_algorithms = []
                    for index, value in enumerate(ensemble):
                        if value:
                            anomaly_breakdown[algorithm] += 1
                            triggered_algorithms.append(algorithm)

                    # If Crucible or Panorama are enabled determine details
                    determine_anomaly_details = False
                    if settings.ENABLE_CRUCIBLE and settings.BOUNDARY_CRUCIBLE_ENABLED:
                        determine_anomaly_details = True
                    if settings.PANORAMA_ENABLED:
                        determine_anomaly_details = True

                    if determine_anomaly_details:
                        metric_timestamp = str(int(timeseries[-1][0]))
                        from_timestamp = str(int(timeseries[1][0]))
                        timeseries_dir = base_name.replace('.', '/')

                    # If Panorama is enabled - create a Panorama check
                    if settings.PANORAMA_ENABLED:
                        # Note:
                        # The values are enclosed in single quotes intentionally
                        # as the imp.load_source used results in a shift in the
                        # decimal position when double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired,
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                        added_at = str(int(time()))
                        source = 'graphite'
                        panaroma_anomaly_data = 'metric = \'%s\'\n' \
                            'value = \'%s\'\n' \
                            'from_timestamp = \'%s\'\n' \
                            'metric_timestamp = \'%s\'\n' \
                            'algorithms = [\'%s\']\n' \
                            'triggered_algorithms = [\'%s\']\n' \
                            'app = \'%s\'\n' \
                            'source = \'%s\'\n' \
                            'added_by = \'%s\'\n' \
                            'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(algorithm),
                               str(algorithm), skyline_app, source,
                               this_host, added_at)

                        # Create an anomaly file with details about the anomaly
                        panaroma_anomaly_file = '%s/%s.%s.txt' % (
                            settings.PANORAMA_CHECK_PATH, added_at,
                            base_name)
                        try:
                            write_data_to_file(
                                skyline_app, panaroma_anomaly_file, 'w',
                                panaroma_anomaly_data)
                            logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file))
                        except:
                            logger.error('error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file))
                            logger.info(traceback.format_exc())

                    # If crucible is enabled - save timeseries and create a
                    # crucible check
                    if settings.ENABLE_CRUCIBLE and settings.BOUNDARY_CRUCIBLE_ENABLED:
                        crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp
                        if not os.path.exists(crucible_anomaly_dir):
                            # Use octal int modes - int('0755') is decimal 755,
                            # not permissions 0755, so the string is parsed as
                            # octal
                            if python_version == 2:
                                mode_arg = int('0755', 8)
                            if python_version == 3:
                                mode_arg = 0o755
                            os.makedirs(crucible_anomaly_dir, mode_arg)

                        # Note:
                        # Due to only one algorithm triggering here the
                        # algorithm related arrays here are in a different
                        # format to their output format in analyzer

                        # Note:
                        # The value is enclosed in single quotes intentionally
                        # as the imp.load_source used in crucible results in a
                        # shift in the decimal position when double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired,
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                        crucible_anomaly_data = 'metric = \'%s\'\n' \
                            'value = \'%s\'\n' \
                            'from_timestamp = \'%s\'\n' \
                            'metric_timestamp = \'%s\'\n' \
                            'algorithms = %s\n' \
                            'triggered_algorithms = %s\n' \
                            'anomaly_dir = \'%s\'\n' \
                            'graphite_metric = True\n' \
                            'run_crucible_tests = False\n' \
                            'added_by = \'%s\'\n' \
                            'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(algorithm),
                               triggered_algorithms, crucible_anomaly_dir,
                               skyline_app, metric_timestamp)

                        # Create an anomaly file with details about the anomaly
                        crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, base_name)
                        with open(crucible_anomaly_file, 'w') as fh:
                            fh.write(crucible_anomaly_data)
                        # os.chmod requires an octal int mode, a mode string
                        # like '0o644' raises a TypeError
                        if python_version == 2:
                            mode_arg = int('0644', 8)
                        if python_version == 3:
                            mode_arg = 0o644
                        os.chmod(crucible_anomaly_file, mode_arg)
                        logger.info('added crucible anomaly file :: %s/%s.txt' % (crucible_anomaly_dir, base_name))

                        # Create timeseries json file with the timeseries
                        json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name)
                        timeseries_json = str(timeseries).replace('[', '(').replace(']', ')')
                        with open(json_file, 'w') as fh:
                            # timeseries
                            fh.write(timeseries_json)
                        if python_version == 2:
                            mode_arg = int('0644', 8)
                        if python_version == 3:
                            mode_arg = 0o644
                        os.chmod(json_file, mode_arg)
                        logger.info('added crucible timeseries file :: %s/%s.json' % (crucible_anomaly_dir, base_name))

                        # Create a crucible check file
                        crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name)
                        with open(crucible_check_file, 'w') as fh:
                            fh.write(crucible_anomaly_data)
                        if python_version == 2:
                            mode_arg = int('0644', 8)
                        if python_version == 3:
                            mode_arg = 0o644
                        os.chmod(crucible_check_file, mode_arg)
                        logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp))

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except:
                exceptions['Other'] += 1
                logger.info("exceptions['Other'] traceback follows:")
                logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))
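# Note: an editorial sketch, not part of the Skyline codebase. It
# restates the per-process metric assignment used above, including the
# min() bound from the etsy/skyline pull request 94 fix, so that a
# process never indexes past the end of the metrics list. i is
# 1-indexed, as in the surrounding code.
from math import ceil


def example_assigned_keys(i, processes, metrics):
    keys_per_processor = int(ceil(float(len(metrics)) / float(processes)))
    if i == processes:
        assigned_max = len(metrics)
    else:
        assigned_max = min(len(metrics), i * keys_per_processor)
    assigned_min = (i - 1) * keys_per_processor
    return range(assigned_min, assigned_max)

# With 10 metrics and 3 processes, process 1 gets indices 0 to 3,
# process 2 gets 4 to 7 and process 3 gets 8 and 9.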
def on_demand_motif_analysis(metric, timestamp, similarity, batch_size, top_matches, max_distance, range_padding, max_area_percent_diff): """ Process a motif similarity search on demand """ import numpy as np import mass_ts as mts logger = logging.getLogger(skyline_app_logger) dev_null = None function_str = 'on_demand_motif_analysis' logger.info( '%s :: with parameters :: metric: %s, timestamp: %s, similarity: %s, batch_size:%s, top_matches: %s, max_distance: %s, range_padding: %s, max_area_percent_diff: %s' % (function_str, str(metric), str(timestamp), str(similarity), str(batch_size), str(top_matches), str(max_distance), str(range_padding), str(max_area_percent_diff))) trace = 'none' fail_msg = 'none' start = time.time() start_timer = timer() metric_vars_dict = {} metric_id = 0 fp_ids = [] timeseries = [] not_similar_enough_sample = 0 not_similar_motifs = 0 similar_motifs = 0 exact_motifs = 0 distance_motifs = 0 motifs_found = [] find_exact_matches_run = False exact_matches_found = [] fps_timeseries = {} # A motif_analysis dict to add to and return motif_analysis = {} motif_analysis[metric] = {} motif_analysis[metric]['timestamp'] = int(timestamp) motif_analysis[metric]['started'] = start motif_analysis[metric]['motifs'] = {} motif_analysis[metric]['exact_motifs'] = exact_motifs motif_analysis[metric]['similar_motifs'] = similar_motifs motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs motif_analysis[metric][ 'not_similar_enough_sample'] = not_similar_enough_sample # @added 20210417 - Feature #4014: Ionosphere - inference # Allow the user to define the batch_size per similarity search motif_analysis[metric]['batch_size'] = int(batch_size) motif_analysis[metric]['top_matches'] = int(top_matches) motif_analysis[metric]['max_distance'] = float(max_distance) # @added 20210425 - Feature #4014: Ionosphere - inference # Added max_area_percent_diff for computing the area under the curve motif_analysis[metric]['max_area_percent_diff'] = float( max_area_percent_diff) fps_checked_for_motifs = [] metric_dir = metric.replace('.', '/') metric_timeseries_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir) # @added 20210418 - Feature #4014: Ionosphere - inference # Allow for the similarity search on saved_training_data if 'saved_training_data' in request.args: saved_training_data_str = request.args.get('saved_training_data', 'false') if saved_training_data_str == 'true': saved_metric_timeseries_dir = '%s_saved/%s/%s' % ( settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir) if path.exists(saved_metric_timeseries_dir): metric_timeseries_dir = saved_metric_timeseries_dir logger.info('%s :: using saved training_data dir - %s' % (function_str, saved_metric_timeseries_dir)) metric_vars_file = '%s/%s.txt' % (metric_timeseries_dir, metric) timeseries_json = '%s/%s.json' % (metric_timeseries_dir, metric) full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60) full_duration_timeseries_json = '%s/%s.mirage.redis.%sh.json' % ( metric_timeseries_dir, metric, str(full_duration_in_hours)) try: metric_vars_dict = mirage_load_metric_vars(skyline_app, metric_vars_file, True) except Exception as e: logger.error( 'error :: inference :: failed to load metric variables from check file - %s - %s' % (metric_vars_file, e)) if not metric_vars_dict: motif_analysis[metric]['status'] = 'error' motif_analysis[metric][ 'reason'] = 'could not load training data variables' return motif_analysis full_duration = metric_vars_dict['metric_vars']['full_duration'] # Determine the 
metric details from the database metric_id = 0 metric_db_object = {} try: metric_db_object = get_metrics_db_object(metric) except Exception as e: logger.error('error :: %s :: failed to get_metrics_db_object - %s' % (function_str, e)) try: metric_id = int(metric_db_object['id']) except Exception as e: logger.error( 'error :: %s :: failed to determine metric_id from metric_db_object %s - %s' % (function_str, str(metric_db_object), e)) metric_id = 0 if not metric_id: logger.error( 'error :: %s :: failed to get metric id for %s from the database' % (function_str, str(metric))) fail_msg = 'failed to get metric id' motif_analysis[metric]['status'] = 'error' motif_analysis[metric]['reason'] = 'could not determine metric id' return motif_analysis, fail_msg, trace # @modified 20210419 - Feature #4014: Ionosphere - inference # Create a unique dir for each batch_size max_distance # motif_images_dir = '%s/motifs' % metric_timeseries_dir motif_images_dir = '%s/motifs/batch_size.%s/top_matches.%s/max_distance.%s' % ( metric_timeseries_dir, str(batch_size), str(top_matches), str(max_distance)) if not path.exists(motif_images_dir): # provision motifs image resources mkdir_p(motif_images_dir) full_durations = [full_duration] if path.isfile(full_duration_timeseries_json): full_durations = [full_duration, settings.FULL_DURATION] logger.info('%s :: full_durations - %s' % (function_str, str(full_durations))) # Loop through analysis per full_duration for full_duration in full_durations: start_full_duration = timer() fp_ids = [] try: query = 'SELECT id,last_matched from ionosphere WHERE metric_id=%s AND full_duration=%s AND enabled=1 ORDER BY last_matched DESC' % ( str(metric_id), str(full_duration)) results = mysql_select(skyline_app, query) for row in results: fp_ids.append(int(row[0])) except Exception as e: logger.error( 'error :: %s :: failed to get fp ids via mysql_select from %s - %s' % (function_str, metric, e)) logger.info('%s :: metric_id: %s, full_duration: %s, fp_ids: %s' % (function_str, (metric_id), str(full_duration), str(fp_ids))) if not fp_ids: continue # Now there are known fps, load the timeseries if full_duration == settings.FULL_DURATION: timeseries_json_file = full_duration_timeseries_json else: timeseries_json_file = timeseries_json try: with open((timeseries_json_file), 'r') as f: raw_timeseries = f.read() timeseries_array_str = str(raw_timeseries).replace('(', '[').replace( ')', ']') del raw_timeseries timeseries = literal_eval(timeseries_array_str) del timeseries_array_str except Exception as e: logger.error( 'error :: %s :: failed to load timeseries for %s from %s - %s' % (function_str, metric, timeseries_json_file, e)) continue anomalous_timeseries_subsequence = [] for timestamp_float, value in timeseries[-int(batch_size):]: anomalous_timeseries_subsequence.append( [int(timestamp_float), value]) logger.info( '%s :: looking for motif in trained fps of full_duration: %s' % (function_str, (full_duration))) dataset = [float(item[1]) for item in anomalous_timeseries_subsequence] max_y = max(dataset) min_y = min(dataset) # full_y_range = max_y - min_y # range_padding_percent = range_padding # This was just a test that did not have the desired results # if full_y_range < 10: # range_padding_percent = 35 # if full_y_range < 5: # range_padding_percent = 75 # if full_y_range < 2: # range_padding_percent = 100 use_range_padding = ((max_y - min_y) / 100) * range_padding if min_y > 0 and (min_y - use_range_padding) > 0: min_y_padded = min_y - use_range_padding else: min_y_padded = min_y 
max_y_padded = max_y + use_range_padding if min_y_padded == max_y_padded: min_y_padded = min_y_padded - ( (min_y_padded / 100) * range_padding) max_y_padded = max_y_padded + ( (max_y_padded / 100) * range_padding) # anomalous_ts = np.array(dataset) anomalous_ts = dataset mass2_batch_times = [] exact_match_times = [] nan = np.array([np.nan]) nanj = complex(0.0, float('nan')) empty_dists = np.array(nan + nanj) # plotted = False count = 0 # fp_ids = [fp_id for index, fp_id in enumerate(fp_ids) if index == 0] # motifs_found = [] # exact_matches_found = [] # fps_timeseries = {} for fp_id in fp_ids: if (time.time() - start) >= 20: break # Attempt to surface the fp timeseries from memcache and/or db # @modified 20210424 - Feature #4014: Ionosphere - inference # Task #4030: refactoring fp_timeseries = None try: fp_timeseries = get_fp_timeseries(skyline_app, metric_id, fp_id) except Exception as e: logger.error( 'inference :: did not get fp timeseries with get_fp_timeseries(%s, %s, %s) - %s' % (skyline_app, str(metric_id), str(fp_id), e)) if not fp_timeseries: continue relate_dataset = [float(item[1]) for item in fp_timeseries] fps_timeseries[fp_id] = fp_timeseries current_best_indices = [] current_best_dists = [] best_indices = None best_dists = None try: logger.info( '%s :: running mts.mass2_batch fp_id: %s, full_duration: %s, batch_size: %s, top_matches: %s, max_distance: %s, motif_size: %s' % (function_str, str(fp_id), str(full_duration), str(batch_size), str(top_matches), str(max_distance), str(len(anomalous_ts)))) # @added 20210418 - Feature #4014: Ionosphere - inference # Handle top_matches being greater than possible kth that can be found # mts.mass2_batch error: kth(=50) out of bounds (16) use_top_matches = int(top_matches) if (len(fp_timeseries) / int(batch_size)) <= int(top_matches): use_top_matches = round( len(fp_timeseries) / int(batch_size)) - 1 if use_top_matches == 2: use_top_matches = 1 logger.info( '%s :: adjusting top_matches to %s (the maximum possible top - 1) as kth(=%s) will be out of bounds mts.mass2_batch' % (function_str, str(use_top_matches), str(top_matches))) start_mass2_batch = timer() # @modified 20210418 - Feature #4014: Ionosphere - inference # Handle top_matches being greater than possible kth that can be found # best_indices, best_dists = mts.mass2_batch(relate_dataset, anomalous_ts, batch_size=int(batch_size), top_matches=int(top_matches)) best_indices, best_dists = mts.mass2_batch( relate_dataset, anomalous_ts, batch_size=int(batch_size), top_matches=int(use_top_matches)) end_mass2_batch = timer() mass2_batch_times.append((end_mass2_batch - start_mass2_batch)) current_best_indices = best_indices.tolist() current_best_dists = best_dists.tolist() # @added 20210412 - Feature #4014: Ionosphere - inference # Branch #3590: inference # Add fp_id to fps_checked_for_motifs to enable ionosphere to update the # motif related columns in the ionosphere database table fps_checked_for_motifs.append(fp_id) except Exception as e: logger.error('error :: %s :: %s mts.mass2_batch error: %s' % (function_str, (fp_id), str(e))) continue try: if str(list(best_dists)) == str(list(empty_dists)): logger.info( '%s :: mts.mass2_batch no similar motif from fp id %s - best_dists: %s' % (function_str, (fp_id), str(list(best_dists)))) continue except Exception as e: dev_null = e if not current_best_indices[0]: continue # if list(best_indices)[0] != anomalous_index: # continue # If the best_dists is > 1 they are not very similar # if list(best_dists)[0].real > 1.0: # continue # if 
        for index, best_dist in enumerate(current_best_dists):
            try:
                motif_added = False
                """
                Note: mass_ts finds similar motifs NOT the same motif, the
                same motif will result in the best_dists being a nan+nanj.
                So it is DIYed
                """
                try:
                    # @modified 20210414 - Feature #4014: Ionosphere - inference
                    # Branch #3590: inference
                    # Store the not anomalous motifs
                    # motif = [fp_id, current_best_indices[index], best_dist.real]
                    motif = [
                        fp_id, current_best_indices[index], best_dist.real,
                        anomalous_timeseries_subsequence, full_duration]
                except Exception as e:
                    dev_null = e
                    motif = []
                # if list(best_indices)[0] and best_dists:
                # If it is greater than 1.0 it is not similar
                # if best_dist.real > 1.0:
                # if best_dist.real > IONOSPHERE_INFERENCE_MASS_TS_MAX_DISTANCE:
                if best_dist.real > float(max_distance):
                    continue
                else:
                    if motif:
                        count += 1
                        motifs_found.append(motif)
                        motif_added = True
                if not motif_added:
                    if best_dist == nanj:
                        count += 1
                        motifs_found.append(motif)
                        motif_added = True
                if not motif_added:
                    if str(best_dist) == 'nan+nanj':
                        count += 1
                        motifs_found.append([
                            fp_id, current_best_indices[index], 0.0,
                            anomalous_timeseries_subsequence, full_duration])
                        motif_added = True
                if not motif_added:
                    if best_dist == empty_dists:
                        count += 1
                        motifs_found.append(motif)
                        motif_added = True
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: %s :: could not determine if fp id %s timeseries at index %s was a match - %s' % (
                        function_str, str(fp_id),
                        str(current_best_indices[index]), e))
                continue

        # FIND EXACT MATCHES
        # Seeing as I cannot reproduce finding nan+nanj which represents an
        # exact match with mts.mass2_batch, do it DIY style - iterate the
        # timeseries and create a batch_size subsequence for every index and
        # compare the values to the anomalous_ts for an exact match.
        # This takes ~0.024850 seconds on a timeseries with 10079 datapoints
        # @modified 20210418 - Feature #4014: Ionosphere - inference
        # However finding exact matches can add ~2.5 seconds on a 90 minute
        # batch_size, and with a proportionally scaled max_distance of say 15,
        # finding an exact match in a longer sequence is less important: the
        # greater the batch_size, the greater the likely variability and the
        # lower the chance of an exact match. So save the 2.5 seconds.
        # UNLESS
        # At a 5 (to 10) batch_size and max_distance of 1.0 an exact match
        # can be found. Exact matches are quite frequent and sometimes, with
        # such little variability, similar matches may not be found.
        # Therefore find_exact_matches has its place.
        # A CAVEAT here is that boring metrics that change and have a low
        # variability, even at a larger batch_size, could also benefit and
        # possibly achieve better accuracy from the use of find_exact_matches
        # as their shapelets can resemble a batch_size 5 shapelet.
        # It would perhaps be possible to use one or more of the features
        # profile tsfresh values to identify these types of shapelets, if
        # you knew which feature/s were most descriptive of this type of
        # shapelet, e.g. 'value__skewness': 3.079477685394873, etc (maybe).
        # However I predict that this method will perform worst on these
        # types of shapelets.
        # find_exact_matches = False
        # exact matches can be found in batch sizes of 500 and similar, so
        # actually always run it.
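        # A minimal sketch (hypothetical distances, not from the source) of
        # the DIY classification cascade above, with max_distance=10.0:
        #     best_dist = 3.2+0j    -> similar enough, appended to motifs_found
        #     best_dist = 14.7+0j   -> greater than max_distance, skipped
        #     best_dist = nan+nanj  -> treated as an exact match, appended
        #                              with a distance of 0.0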
        find_exact_matches = True
        find_exact_matches_run = True
        if int(batch_size) < 10:
            find_exact_matches = True
            find_exact_matches_run = True
        if find_exact_matches:
            try:
                start_exact_match = timer()
                indexed_relate_dataset = []
                for index, item in enumerate(relate_dataset):
                    indexed_relate_dataset.append([index, item])
                last_index = indexed_relate_dataset[-1][0]
                current_index = 0
                while current_index < last_index:
                    subsequence = [
                        value for index, value in
                        indexed_relate_dataset[current_index:(current_index + int(batch_size))]]
                    if subsequence == anomalous_ts:
                        exact_matches_found.append([
                            fp_id, current_index, 0.0,
                            anomalous_timeseries_subsequence, full_duration])
                        motifs_found.append([
                            fp_id, current_index, 0.0,
                            anomalous_timeseries_subsequence, full_duration])
                    current_index += 1
                end_exact_match = timer()
                exact_match_times.append((end_exact_match - start_exact_match))
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: %s :: could not determine if any exact matches could be found in fp id %s timeseries - %s' % (
                        function_str, str(fp_id), e))

    logger.info(
        '%s :: mts.mass2_batch runs on %s fps of full_duration %s in %.6f seconds' % (
            function_str, str(len(mass2_batch_times)), str(full_duration),
            sum(mass2_batch_times)))
    if find_exact_matches_run:
        logger.info(
            '%s :: exact_match runs on %s fps of full_duration %s in %.6f seconds' % (
                function_str, str(len(exact_match_times)), str(full_duration),
                sum(exact_match_times)))
    end_full_duration = timer()
    logger.info(
        '%s :: analysed %s fps of full_duration %s in %.6f seconds' % (
            function_str, str(len(fp_ids)), str(full_duration),
            (end_full_duration - start_full_duration)))

# Patterns are sorted by distance
sorted_motifs = []
motifs_found_in_fps = []
if motifs_found:
    sorted_motifs = sorted(motifs_found, key=lambda x: x[2])
    for item in sorted_motifs:
        motifs_found_in_fps.append(item[0])
logger.info('%s :: %s motifs found' % (function_str, str(len(sorted_motifs))))

for motif in sorted_motifs:
    if (time.time() - start) >= 25:
        break
    try:
        add_match = False
        all_in_range = False
        fp_id = motif[0]
        best_index = motif[1]
        best_dist = motif[2]
        # @added 20210414 - Feature #4014: Ionosphere - inference
        # Branch #3590: inference
        # Store the not anomalous motifs
        motif_sequence = motif[3]
        motif_full_duration = motif[4]
        match_type = 'not_similar_enough'
        if motif in exact_matches_found:
            add_match = True
            match_type = 'exact'
            all_in_range = True
            exact_motifs += 1
        full_relate_timeseries = fps_timeseries[fp_id]
        # full_relate_dataset = [float(item[1]) for item in full_relate_timeseries]
        relate_timeseries = [
            item for index, item in enumerate(full_relate_timeseries)
            if index >= best_index and index < (best_index + int(batch_size))]
        relate_dataset = [item[1] for item in relate_timeseries]
        if not add_match:
            all_in_range = True
            for value in relate_dataset:
                if value < min_y_padded:
                    all_in_range = False
                    break
                if value > max_y_padded:
                    all_in_range = False
                    break
            if all_in_range:
                related_max_y = max(relate_dataset)
                if related_max_y < (max_y - range_padding):
                    all_in_range = False
                if related_max_y > (max_y + range_padding):
                    all_in_range = False
                related_min_y = min(relate_dataset)
                if related_min_y < (min_y - range_padding):
                    all_in_range = False
                if related_min_y > (min_y + range_padding):
                    all_in_range = False
            if all_in_range:
                logger.info(
                    '%s :: ALL IN RANGE - all_in_range: %s, motif: %s' % (
                        function_str, str(all_in_range),
                        str(relate_dataset[0:2])))
                add_match = True
                match_type = 'all_in_range'
                similar_motifs += 1
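        # The find_exact_matches scan above, reduced to a standalone sketch
        # (hypothetical helper name, list inputs assumed to be plain floats):
        #     def find_exact_subsequence(relate_dataset, anomalous_ts):
        #         matches = []
        #         size = len(anomalous_ts)
        #         for i in range(len(relate_dataset) - size + 1):
        #             if relate_dataset[i:i + size] == anomalous_ts:
        #                 matches.append(i)
        #         return matches
        # A simple slice comparison is O(n * batch_size), which is why the
        # scan only costs a few hundredths of a second on ~10000 datapoints.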
        # @added 20210425 - Feature #4014: Ionosphere - inference
        # Compute the area using the composite trapezoidal rule.
        motif_area = None
        fp_motif_area = None
        percent_different = None
        try:
            batch_size_dataset = [float(item[1]) for item in motif_sequence]
            y_motif = np.array(batch_size_dataset)
            motif_area = np.trapz(y_motif, dx=1)
        except Exception as e:
            logger.error(
                'error :: %s :: failed to get motif_area with np.trapz - %s' % (
                    function_str, e))
        try:
            y_fp_motif = np.array(relate_dataset)
            fp_motif_area = np.trapz(y_fp_motif, dx=1)
        except Exception as e:
            logger.error(
                'error :: %s :: failed to get fp_motif_area with np.trapz - %s' % (
                    function_str, e))
        # Determine the percentage difference (as a positive value) of the
        # areas under the curves.
        if motif_area and fp_motif_area:
            percent_different = get_percent_different(fp_motif_area,
                                                      motif_area, True)
            if percent_different > max_area_percent_diff:
                if add_match:
                    logger.info(
                        '%s :: AREA TOO DIFFERENT - not adding all_in_range match' % (
                            function_str))
                    add_match = False
                # BUT ...
                if best_dist < 3 and not add_match:
                    logger.info(
                        '%s :: DISTANCE VERY SIMILAR - adding match even though area_percent_diff is greater than max_area_percent_diff because best_dist: %s' % (
                            function_str, str(best_dist)))
                    add_match = True
                    match_type = 'distance'
                    distance_motifs += 1
        if similarity == 'all':
            if not add_match:
                not_similar_motifs += 1
                if not_similar_enough_sample >= 10:
                    continue
                not_similar_enough_sample += 1
                add_match = True
                match_type = 'not_similar_enough'
        if add_match:
            generation = 0
            fp_id_row = None
            try:
                fp_id_row = get_ionosphere_fp_db_row(skyline_app, int(fp_id))
            except Exception as e:
                logger.error(
                    'error :: %s :: failed to get_ionosphere_fp_db_row for fp_id %s - %s' % (
                        function_str, str(fp_id), e))
            if fp_id_row:
                try:
                    generation = fp_id_row['generation']
                except Exception as e:
                    logger.error(
                        'error :: %s :: failed to get generation from fp_id_row for fp_id %s - %s' % (
                            function_str, str(fp_id), e))
            if generation == 0:
                generation_str = 'trained'
            else:
                generation_str = 'LEARNT'
            motif_match_types = motif_match_types_dict()
            type_id = motif_match_types[match_type]
            motif_id = '%s-%s' % (str(fp_id), str(best_index))
            motif_analysis[metric]['motifs'][motif_id] = {}
            motif_analysis[metric]['motifs'][motif_id]['metric_id'] = metric_id
            motif_analysis[metric]['motifs'][motif_id]['fp_id'] = fp_id
            motif_analysis[metric]['motifs'][motif_id]['generation'] = generation
            motif_analysis[metric]['motifs'][motif_id]['index'] = best_index
            motif_analysis[metric]['motifs'][motif_id]['distance'] = best_dist
            motif_analysis[metric]['motifs'][motif_id]['size'] = int(batch_size)
            motif_analysis[metric]['motifs'][motif_id]['max_distance'] = float(max_distance)
            motif_analysis[metric]['motifs'][motif_id]['timestamp'] = timestamp
            motif_analysis[metric]['motifs'][motif_id]['type_id'] = type_id
            motif_analysis[metric]['motifs'][motif_id]['type'] = match_type
            motif_analysis[metric]['motifs'][motif_id]['full_duration'] = motif_full_duration
            # @added 20210414 - Feature #4014: Ionosphere - inference
            # Branch #3590: inference
            # Store the not anomalous motifs
            motif_analysis[metric]['motifs'][motif_id]['motif_timeseries'] = anomalous_timeseries_subsequence
            motif_analysis[metric]['motifs'][motif_id]['motif_sequence'] = motif_sequence
            not_anomalous_timestamp = int(anomalous_timeseries_subsequence[-1][0])
            graph_period_seconds = not_anomalous_timestamp - int(
                anomalous_timeseries_subsequence[0][0])
            motif_analysis[metric]['motifs'][motif_id]['motif_period_seconds'] = graph_period_seconds
            motif_analysis[metric]['motifs'][motif_id]['motif_period_minutes'] = round(
                graph_period_seconds / 60)
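            # A worked example (hypothetical values) of the np.trapz area
            # check above; get_percent_different is assumed to return the
            # positive percentage difference between the two areas:
            #     y_motif = np.array([10.0, 12.0, 11.0, 13.0])
            #     np.trapz(y_motif, dx=1)     # -> 34.5
            #     y_fp_motif = np.array([10.0, 11.0, 11.0, 12.0])
            #     np.trapz(y_fp_motif, dx=1)  # -> 33.0
            # With areas 34.5 and 33.0 the difference is ~4.5%, so a
            # max_area_percent_diff of 10 would let this match stand.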
            motif_analysis[metric]['motifs'][motif_id]['image'] = None
            motif_analysis[metric]['motifs'][motif_id]['motif_area'] = motif_area
            motif_analysis[metric]['motifs'][motif_id]['fp_motif_area'] = fp_motif_area
            motif_analysis[metric]['motifs'][motif_id]['area_percent_diff'] = percent_different
            motif_analysis[metric]['motifs'][motif_id]['max_area_percent_diff'] = max_area_percent_diff
            if (time.time() - start) >= 25:
                continue
            graph_image_file = '%s/motif.%s.%s.%s.with_max_distance.%s.png' % (
                motif_images_dir, motif_id, match_type, str(batch_size),
                str(max_distance))
            plotted_image = False
            on_demand_motif_analysis = True
            if not path.isfile(graph_image_file):
                plotted_image, plotted_image_file = plot_motif_match(
                    skyline_app, metric, timestamp, fp_id, full_duration,
                    generation_str, motif_id, best_index, int(batch_size),
                    best_dist, type_id, relate_dataset,
                    anomalous_timeseries_subsequence, graph_image_file,
                    on_demand_motif_analysis)
            else:
                plotted_image = True
                logger.info('%s :: plot already exists - %s' % (
                    function_str, str(graph_image_file)))
            if plotted_image:
                motif_analysis[metric]['motifs'][motif_id]['image'] = graph_image_file
            else:
                logger.error('failed to plot motif match plot')
                graph_image_file = None
    except Exception as e:
        logger.error(traceback.format_exc())
        logger.error(
            'error :: inference :: with fp id %s processing motif at index: %s - %s' % (
                str(fp_id), str(motif[1]), str(e)))
        continue

end_timer = timer()
motif_analysis[metric]['fps_checked'] = fps_checked_for_motifs
motif_analysis[metric]['exact_motifs'] = exact_motifs
motif_analysis[metric]['similar_motifs'] = similar_motifs
motif_analysis[metric]['distance_motifs'] = distance_motifs
motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
motif_analysis[metric]['not_similar_enough_sample'] = not_similar_enough_sample

motif_analysis_file = '%s/motif.analysis.similarity_%s.batch_size_%s.top_matches_%s.max_distance_%s.dict' % (
    motif_images_dir, similarity, str(batch_size), str(top_matches),
    str(max_distance))
try:
    write_data_to_file(skyline_app, motif_analysis_file, 'w',
                       str(motif_analysis))
except Exception as e:
    trace = traceback.format_exc()
    logger.error('%s' % trace)
    fail_msg = '%s :: error :: failed to write motif_analysis_file - %s' % (
        function_str, motif_analysis_file)
    logger.error('%s' % fail_msg)
    dev_null = e

motif_ids = list(motif_analysis[metric]['motifs'].keys())
logger.info(
    '%s :: %s motif matches found, %s fps were checked and motifs plotted in %.6f seconds for %s' % (
        function_str, str(len(motif_ids)), str(len(fps_checked_for_motifs)),
        (end_timer - start_timer), metric))
if dev_null:
    del dev_null
return motif_analysis, fail_msg, trace
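# For reference, a minimal sketch (hypothetical values, abbreviated keys) of
# the structure the function above returns in motif_analysis[metric]:
#     {
#         'motifs': {
#             '1234-567': {
#                 'fp_id': 1234, 'index': 567, 'distance': 2.3,
#                 'type': 'all_in_range', 'size': 180,
#                 'image': '/path/to/motif.1234-567...png',
#             },
#         },
#         'exact_motifs': 2, 'similar_motifs': 5, 'distance_motifs': 1,
#         'not_similar_motifs': 14, 'fps_checked': [1234, 1235],
#     }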
def spin_process(self, i, run_timestamp):
    """
    Assign a metric for a process to analyze.
    """
    # Discover metric to analyze
    metric_var_files = [
        f for f in listdir(settings.MIRAGE_CHECK_PATH)
        if isfile(join(settings.MIRAGE_CHECK_PATH, f))]

    # Check if this process is unnecessary
    if len(metric_var_files) == 0:
        return

    metric_var_files_sorted = sorted(metric_var_files)
    metric_check_file = '%s/%s' % (settings.MIRAGE_CHECK_PATH,
                                   str(metric_var_files_sorted[0]))

    # Load and validate metric variables
    # @modified 20160822 - Bug #1460: panorama check file fails
    # Changed to panorama style skyline_functions load_metric_vars
    # self.load_metric_vars(metric_check_file)
    try:
        metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
    except:
        logger.info(traceback.format_exc())
        logger.error(
            'error :: failed to load metric variables from check file - %s' % (
                metric_check_file))
        fail_check(skyline_app, metric_failed_check_dir,
                   str(metric_check_file))
        return

    # Test metric variables
    if len(metric_vars.metric) == 0:
        return
    else:
        metric = metric_vars.metric
        metric_name = ['metric_name', metric_vars.metric]
        self.metric_variables.append(metric_name)
    if len(metric_vars.value) == 0:
        return
    else:
        metric_value = ['metric_value', metric_vars.value]
        self.metric_variables.append(metric_value)
    if len(metric_vars.hours_to_resolve) == 0:
        return
    else:
        hours_to_resolve = ['hours_to_resolve', metric_vars.hours_to_resolve]
        self.metric_variables.append(hours_to_resolve)
    if len(metric_vars.metric_timestamp) == 0:
        return
    else:
        metric_timestamp = ['metric_timestamp', metric_vars.metric_timestamp]
        self.metric_variables.append(metric_timestamp)

    # Ignore any metric check older than MIRAGE_STALE_SECONDS
    int_metric_timestamp = int(metric_vars.metric_timestamp)
    int_run_timestamp = int(run_timestamp)
    metric_timestamp_age = int_run_timestamp - int_metric_timestamp
    if metric_timestamp_age > settings.MIRAGE_STALE_SECONDS:
        logger.info(
            'stale check :: %s check request is %s seconds old - discarding' % (
                metric_vars.metric, metric_timestamp_age))
        # Remove metric check file
        # try:
        #     os.remove(metric_check_file)
        # except OSError:
        #     pass
        # return
        if os.path.exists(metric_check_file):
            os.remove(metric_check_file)
            logger.info('removed %s' % (metric_check_file))
        else:
            logger.info('could not remove %s' % (metric_check_file))
        # Discard the stale check
        return

    # Calculate hours second order resolution to seconds
    second_order_resolution_seconds = int(metric_vars.hours_to_resolve) * 3600

    # Calculate graphite from and until parameters from the metric timestamp
    graphite_until = datetime.datetime.fromtimestamp(
        int(metric_vars.metric_timestamp)).strftime('%H:%M_%Y%m%d')
    int_second_order_resolution_seconds = int(second_order_resolution_seconds)
    second_resolution_timestamp = int_metric_timestamp - int_second_order_resolution_seconds
    graphite_from = datetime.datetime.fromtimestamp(
        int(second_resolution_timestamp)).strftime('%H:%M_%Y%m%d')

    # Remove any old json file related to the metric
    metric_json_file = '%s/%s/%s.json' % (settings.MIRAGE_DATA_FOLDER,
                                          str(metric_vars.metric),
                                          str(metric_vars.metric))
    try:
        os.remove(metric_json_file)
    except OSError:
        pass

    # Get data from graphite
    logger.info(
        'retrieve data :: surfacing %s timeseries from graphite for %s seconds' % (
            metric_vars.metric, second_order_resolution_seconds))
    self.surface_graphite_metric_data(metric_vars.metric, graphite_from,
                                      graphite_until)

    # Check there is a json timeseries file to test
    if not os.path.isfile(metric_json_file):
        logger.error(
            'error :: retrieve failed - failed to surface %s timeseries from graphite' % (
                metric_vars.metric))
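        # Illustrative values (hypothetical timestamp, local time assumed to
        # be UTC) for the graphite from/until window computed above:
        #     metric_timestamp = 1456920000        # 2016-03-02 12:00:00
        #     hours_to_resolve = 168               # 7 days -> 604800 seconds
        #     graphite_until   = '12:00_20160302'
        #     graphite_from    = '12:00_20160224'  # 604800 seconds earlier
        # datetime.datetime.fromtimestamp() uses the local timezone, so the
        # rendered '%H:%M_%Y%m%d' strings shift accordingly on non-UTC hosts.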
        # Remove metric check file
        try:
            os.remove(metric_check_file)
        except OSError:
            pass
        return
    else:
        logger.info('retrieved data :: for %s at %s seconds' % (
            metric_vars.metric, second_order_resolution_seconds))

    # Make process-specific dicts
    exceptions = defaultdict(int)
    anomaly_breakdown = defaultdict(int)

    self.check_if_parent_is_alive()

    with open((metric_json_file), 'r') as f:
        timeseries = json.loads(f.read())
        logger.info('data points surfaced :: %s' % (len(timeseries)))

    try:
        logger.info('analyzing :: %s at %s seconds' % (
            metric_vars.metric, second_order_resolution_seconds))
        anomalous, ensemble, datapoint = run_selected_algorithm(
            timeseries, metric_vars.metric, second_order_resolution_seconds)

        # If it's anomalous, add it to list
        if anomalous:
            base_name = metric.replace(settings.FULL_NAMESPACE, '', 1)
            anomalous_metric = [datapoint, base_name]
            self.anomalous_metrics.append(anomalous_metric)
            logger.info('anomaly detected :: %s with %s' % (
                metric_vars.metric, metric_vars.value))
            # It runs so fast, this allows us to process 30 anomalies/min
            sleep(2)

            # Get the anomaly breakdown - who returned True?
            triggered_algorithms = []
            for index, value in enumerate(ensemble):
                if value:
                    algorithm = settings.MIRAGE_ALGORITHMS[index]
                    anomaly_breakdown[algorithm] += 1
                    triggered_algorithms.append(algorithm)

            # If Crucible or Panorama are enabled determine details
            determine_anomaly_details = False
            if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED:
                determine_anomaly_details = True
            if settings.PANORAMA_ENABLED:
                determine_anomaly_details = True
            if determine_anomaly_details:
                metric_timestamp = str(int(timeseries[-1][0]))
                from_timestamp = str(int(timeseries[1][0]))
                timeseries_dir = base_name.replace('.', '/')

            # If Panorama is enabled - create a Panorama check
            if settings.PANORAMA_ENABLED:
                if not os.path.exists(settings.PANORAMA_CHECK_PATH):
                    if python_version == 2:
                        mode_arg = int('0755')
                    if python_version == 3:
                        mode_arg = 0o755
                    os.makedirs(settings.PANORAMA_CHECK_PATH, mode_arg)

                # Note:
                # The values are intentionally single quoted, as the
                # imp.load_source used results in a shift in the decimal
                # position when double quoted, e.g.
# value = "5622.0" gets imported as # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2 # single quoting results in the desired, # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0 added_at = str(int(time())) source = 'graphite' panaroma_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'app = \'%s\'\n' \ 'source = \'%s\'\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ % (base_name, str(datapoint), from_timestamp, metric_timestamp, str(settings.MIRAGE_ALGORITHMS), triggered_algorithms, skyline_app, source, this_host, added_at) # Create an anomaly file with details about the anomaly panaroma_anomaly_file = '%s/%s.%s.txt' % ( settings.PANORAMA_CHECK_PATH, added_at, base_name) try: write_data_to_file(skyline_app, panaroma_anomaly_file, 'w', panaroma_anomaly_data) logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file)) except: logger.error( 'error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file)) logger.info(traceback.format_exc()) # If crucible is enabled - save timeseries and create a # crucible check if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED: metric_timestamp = str(int(timeseries[-1][0])) from_timestamp = str(int(timeseries[1][0])) timeseries_dir = base_name.replace('.', '/') crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp if not os.path.exists(crucible_anomaly_dir): if python_version == 2: mode_arg = int('0755') if python_version == 3: mode_arg = mode = 0o755 os.makedirs(crucible_anomaly_dir, mode_arg) # Note: # The value is enclosed is single quoted intentionally # as the imp.load_source used in crucible results in a # shift in the decimal position when double quoted, e.g. 
# value = "5622.0" gets imported as # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2 # single quoting results in the desired, # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0 crucible_anomaly_data = 'metric = \'%s\'\n' \ 'value = \'%s\'\n' \ 'from_timestamp = \'%s\'\n' \ 'metric_timestamp = \'%s\'\n' \ 'algorithms = %s\n' \ 'triggered_algorithms = %s\n' \ 'anomaly_dir = \'%s\'\n' \ 'graphite_metric = True\n' \ 'run_crucible_tests = False\n' \ 'added_by = \'%s\'\n' \ 'added_at = \'%s\'\n' \ % (base_name, str(datapoint), from_timestamp, metric_timestamp, str(settings.MIRAGE_ALGORITHMS), triggered_algorithms, crucible_anomaly_dir, skyline_app, metric_timestamp) # Create an anomaly file with details about the anomaly crucible_anomaly_file = '%s/%s.txt' % ( crucible_anomaly_dir, base_name) try: write_data_to_file(skyline_app, crucible_anomaly_file, 'w', crucible_anomaly_data) logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file)) except: logger.error( 'error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file)) logger.info(traceback.format_exc()) # Create timeseries json file with the timeseries json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name) timeseries_json = str(timeseries).replace('[', '(').replace( ']', ')') try: write_data_to_file(skyline_app, json_file, 'w', timeseries_json) logger.info('added crucible timeseries file :: %s' % (json_file)) except: logger.error( 'error :: failed to add crucible timeseries file :: %s' % (json_file)) logger.info(traceback.format_exc()) # Create a crucible check file crucible_check_file = '%s/%s.%s.txt' % ( settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name) try: write_data_to_file(skyline_app, crucible_check_file, 'w', crucible_anomaly_data) logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp)) except: logger.error( 'error :: failed to add crucible check file :: %s' % (crucible_check_file)) logger.info(traceback.format_exc()) else: base_name = metric.replace(settings.FULL_NAMESPACE, '', 1) not_anomalous_metric = [datapoint, base_name] self.not_anomalous_metrics.append(not_anomalous_metric) logger.info('not anomalous :: %s with %s' % (metric_vars.metric, metric_vars.value)) # It could have been deleted by the Roomba except TypeError: exceptions['DeletedByRoomba'] += 1 logger.info('exceptions :: DeletedByRoomba') except TooShort: exceptions['TooShort'] += 1 logger.info('exceptions :: TooShort') except Stale: exceptions['Stale'] += 1 logger.info('exceptions :: Stale') except Boring: exceptions['Boring'] += 1 logger.info('exceptions :: Boring') except: exceptions['Other'] += 1 logger.info('exceptions :: Other') logger.info(traceback.format_exc()) # Add values to the queue so the parent process can collate for key, value in anomaly_breakdown.items(): self.mirage_anomaly_breakdown_q.put((key, value)) for key, value in exceptions.items(): self.mirage_exceptions_q.put((key, value)) metric_var_files = [] timeseries = [] # Remove metric check file try: os.remove(metric_check_file) except OSError: pass