Example #1
def thunder_send_event(current_skyline_app, event, log=True):
    """
    Add an event to the thunder.events Redis set or the thunder check dir if
    Redis is not available.

    :param current_skyline_app: the app calling the function
    :param event: the event data
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type event: dict
    :type log: boolean
    :return: submitted
    :rtype: boolean

    """

    function_str = 'functions.thunder.thunder_send_event'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    submitted = 0
    try:
        redis_conn = get_redis_conn(current_skyline_app)
        submitted = redis_conn.sadd('thunder.events', str(event))
        if submitted:
            return True
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to add %s to thunder.events Redis set - %s'
            % (function_str, str(event), e))

    # If the thunder event was not added to the Redis set, create the event_file
    # Ensure a logger exists even if log=False so the fallback path can log
    if not current_logger:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    if not path.exists(THUNDER_EVENTS_DIR):
        mkdir_p(THUNDER_EVENTS_DIR)
        current_logger.info('created dir - %s' % THUNDER_EVENTS_DIR)
    event_file = '%s/%s.thunder.event.dict' % (THUNDER_EVENTS_DIR, str(time()))
    try:
        write_data_to_file(current_skyline_app, event_file, 'w', str(event))
        current_logger.info('added thunder event file - %s' % event_file)
        submitted = True
    except Exception as e:
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: failed to add thunder event file - %s - %s' %
            (event_file, e))
        submitted = False

    return submitted
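A minimal usage sketch for the function above; the calling app name and the event fields are illustrative assumptions, not a documented event schema:

# Hypothetical caller sketch - the event dict fields are assumptions made for
# illustration; real Skyline apps build the event dict to suit the check
example_event = {
    'level': 'alert',
    'event_type': 'worker_down',
    'message': 'analyzer worker has not reported in',
    'app': 'analyzer',
    'timestamp': 1609459200,
    'expiry': 900,
}
submitted = thunder_send_event('analyzer', example_event, log=True)
if not submitted:
    # Neither the Redis set add nor the check dir fallback succeeded, so the
    # caller may want to retry or log locally
    print('thunder event was not submitted')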
Example #2
def run_algorithms(timeseries, timeseries_name, end_timestamp, full_duration,
                   timeseries_file, skyline_app, algorithms):
    """
    Iteratively run algorithms.
    """

    results_dir = os.path.dirname(timeseries_file)

    if not os.path.exists(results_dir):
        os.makedirs(results_dir, mode=0o755)

    start_analysis = int(time.time())

    triggered_algorithms = []
    anomalous = False

    check_algorithms = []
    if str(algorithms) == "['all']":
        if skyline_app == 'analyzer':
            check_algorithms = ALGORITHMS
            logger.info('check_algorithms for analyzer - %s' %
                        (str(check_algorithms)))
        if skyline_app == 'mirage':
            check_algorithms = MIRAGE_ALGORITHMS
            logger.info('check_algorithms for mirage - %s' %
                        (str(check_algorithms)))
        if skyline_app == 'boundary':
            check_algorithms = algorithms
            logger.info('check_algorithms for boundary - %s' %
                        (str(check_algorithms)))
        if skyline_app == 'crucible':
            ALGORITHMS.append('detect_drop_off_cliff')
            check_algorithms = ALGORITHMS
            logger.info('check_algorithms for crucible - %s' %
                        (str(check_algorithms)))
    else:
        check_algorithms = algorithms
        logger.info('check_algorithms specified - %s' %
                    (str(check_algorithms)))

    if not check_algorithms:
        logger.info('check_algorithms unknown - %s' % (str(check_algorithms)))
        ALGORITHMS.append('detect_drop_off_cliff')
        check_algorithms = ALGORITHMS
        logger.info('check_algorithms - %s' % (str(check_algorithms)))

    logger.info('checking algorithms - %s' % (str(check_algorithms)))

    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
    # Plot Skyline anomalies if CONSENSUS is achieved
    anomalies = []

    for algorithm in check_algorithms:
        detected = ''
        try:
            x_vals = np.arange(len(timeseries))
            y_vals = np.array([y[1] for y in timeseries])
            # Match default graphite graph size
            plt.figure(figsize=(5.86, 3.08), dpi=100)
            plt.plot(x_vals, y_vals)

            # Start a couple datapoints in for the tail average
            for index in range(10, len(timeseries)):
                sliced = timeseries[:index]
                anomaly = globals()[algorithm](sliced, end_timestamp,
                                               full_duration)

                # Point out the datapoint if it's anomalous
                if anomaly:
                    plt.plot([index], [sliced[-1][1]], 'ro')
                    detected = "DETECTED"
                    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
                    # Add the anomaly to the anomalies list to plot Skyline
                    # anomalies if CONSENSUS is achieved
                    anomalies.append([sliced[-1][0], sliced[-1][1], algorithm])

            if detected == "DETECTED":
                results_filename = join(results_dir + "/" + algorithm + "." +
                                        detected + ".png")
                #                logger.info('ANOMALY DETECTED :: %s' % (algorithm))
                anomalous = True
                triggered_algorithms.append(algorithm)
            else:
                results_filename = join(results_dir + "/" + algorithm + ".png")

            plt.savefig(results_filename, dpi=100)
            #            logger.info('%s :: %s' % (algorithm, results_filename))
            if python_version == 2:
                os.chmod(results_filename, 0o644)
            if python_version == 3:
                os.chmod(results_filename, mode=0o644)
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.info(
                'info :: error thrown in algorithm running and plotting - %s' %
                (str(algorithm)))

    end_analysis = int(time.time())
    # @modified 20160814 - pyflaked
    # seconds_to_run = end_analysis - start_analysis
    #    logger.info(
    #        'analysis of %s at a full duration of %s took %s seconds' %
    #        (timeseries_name, str(full_duration), str(seconds_to_run)))

    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
    # Plot Skyline anomalies where CONSENSUS achieved and create file resources
    # skyline.anomalies_score.txt and skyline.anomalies.csv
    anomalies_score = []
    if anomalies:
        for ts, value, algo in anomalies:
            processed = False
            algorithms_triggered = []
            if anomalies_score:
                for i in anomalies_score:
                    if i[0] == ts:
                        processed = True
                        continue
            if processed:
                continue
            for w_ts, w_value, w_algo in anomalies:
                if w_ts == ts:
                    algorithms_triggered.append(w_algo)
            if algorithms_triggered:
                consensus = len(algorithms_triggered)
                anomalies_score.append(
                    [ts, value, consensus, algorithms_triggered])
        try:
            logger.info('info :: plotting skyline.consensus.anomalies.png')
            x_vals = np.arange(len(timeseries))
            y_vals = np.array([y[1] for y in timeseries])
            # Match default graphite graph size
            plt.figure(figsize=(5.86, 3.08), dpi=100)
            plt.plot(x_vals, y_vals)
            for index in range(10, len(timeseries)):
                anomaly = False
                sliced = timeseries[:index]
                for i in anomalies_score:
                    if sliced[-1][0] == i[0]:
                        if i[2] >= CONSENSUS:
                            anomaly = True
                # Point out the datapoint if it is anomalous according to
                # Skyline CONSENSUS
                if anomaly:
                    plt.plot([index], [sliced[-1][1]], 'ro')
            results_filename = join(results_dir +
                                    "/skyline.consensus.anomalies.png")
            plt.savefig(results_filename, dpi=100)
            if python_version == 2:
                os.chmod(results_filename, 0o644)
            if python_version == 3:
                os.chmod(results_filename, mode=0o644)
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error(
                'error :: failed plotting skyline.consensus.anomalies.png')
        anomalies_filename = join(results_dir + "/skyline.anomalies_score.txt")
        write_data_to_file(skyline_app, anomalies_filename, 'w',
                           str(anomalies_score))
        anomalies_csv = join(results_dir + "/skyline.anomalies.csv")
        try:
            with open(anomalies_csv, 'w') as fh:
                fh.write(
                    'timestamp,value,consensus_count,triggered_algorithms\n')
            for ts, value, consensus, algorithms_triggered in anomalies_score:
                try:
                    algos_str = str(algorithms_triggered)
                    triggered_algorithms = algos_str.replace(',', ' ')
                    line = '%s,%s,%s,%s\n' % (str(ts), str(value),
                                              str(consensus),
                                              str(triggered_algorithms))
                    with open(anomalies_csv, 'a') as fh:
                        fh.write(line)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: could not write to file %s' %
                                 (anomalies_csv))
            if python_version == 2:
                os.chmod(anomalies_csv, 0o644)
            if python_version == 3:
                os.chmod(anomalies_csv, mode=0o644)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: could not write to file %s' %
                         (anomalies_csv))

    return anomalous, triggered_algorithms
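The consensus scoring loop above builds anomalies_score by re-scanning the anomalies list for each timestamp; the standalone sketch below shows the same grouping idea with a dict, assuming the same [timestamp, value, algorithm] triples (the helper name and structure are hypothetical, not part of Skyline):

from collections import defaultdict

def score_anomalies(anomalies):
    """Group [timestamp, value, algorithm] triples by timestamp and count how
    many algorithms triggered at each timestamp (the consensus count)."""
    algorithms_by_timestamp = defaultdict(list)
    value_by_timestamp = {}
    for ts, value, algorithm in anomalies:
        algorithms_by_timestamp[ts].append(algorithm)
        value_by_timestamp[ts] = value
    # Same shape as anomalies_score above: [ts, value, consensus, algorithms_triggered]
    return [[ts, value_by_timestamp[ts], len(algos), algos]
            for ts, algos in sorted(algorithms_by_timestamp.items())]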
Example #3
def alert_slack(datapoint, metric_name, expiration_time, metric_trigger,
                algorithm):

    if not settings.SLACK_ENABLED:
        return False

    from slackclient import SlackClient
    metric = metric_name
    logger.info('alert_slack - anomalous metric :: metric: %s - %s' %
                (metric, algorithm))
    base_name = metric
    alert_algo = str(algorithm)
    alert_context = alert_algo.upper()

    # The known_derivative_metric state is determined in case we need to surface
    # the png image from Graphite if the Ionosphere image is not available for
    # some reason.  This will result in Skyline at least still sending an alert
    # to slack, even if some gear fails in Ionosphere or slack alerting is used
    # without Ionosphere enabled. Yes not DRY but multiprocessing and spawn
    # safe.
    known_derivative_metric = False
    try:
        if settings.REDIS_PASSWORD:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                password=settings.REDIS_PASSWORD,
                unix_socket_path=settings.REDIS_SOCKET_PATH)
        else:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                unix_socket_path=settings.REDIS_SOCKET_PATH)
    except:
        logger.error('error :: alert_slack - redis connection failed')
    try:
        derivative_metrics = list(
            REDIS_ALERTER_CONN.smembers('derivative_metrics'))
    except:
        derivative_metrics = []
    redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(base_name))
    if redis_metric_name in derivative_metrics:
        known_derivative_metric = True
    if known_derivative_metric:
        try:
            non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS
        except:
            non_derivative_monotonic_metrics = []
        skip_derivative = in_list(redis_metric_name,
                                  non_derivative_monotonic_metrics)
        if skip_derivative:
            known_derivative_metric = False

    if known_derivative_metric:
        unencoded_graph_title = 'Skyline Boundary - ALERT %s at %s hours - derivative graph - %s' % (
            alert_context, str(graphite_previous_hours), metric)
        slack_title = '*Skyline Boundary - ALERT* %s on %s at %s hours - derivative graph - %s' % (
            alert_context, metric, str(graphite_previous_hours), datapoint)
    else:
        unencoded_graph_title = 'Skyline Boundary - ALERT %s at %s hours - %s' % (
            alert_context, str(graphite_previous_hours), metric)
        slack_title = '*Skyline Boundary - ALERT* %s on %s at %s hours - %s' % (
            alert_context, metric, str(graphite_previous_hours), datapoint)

    graph_title_string = quote(unencoded_graph_title, safe='')
    graph_title = '&title=%s' % graph_title_string

    until_timestamp = int(time())
    target_seconds = int((graphite_previous_hours * 60) * 60)
    from_timestamp = str(until_timestamp - target_seconds)

    graphite_from = dt.datetime.fromtimestamp(
        int(from_timestamp)).strftime('%H:%M_%Y%m%d')
    logger.info('graphite_from - %s' % str(graphite_from))
    graphite_until = dt.datetime.fromtimestamp(
        int(until_timestamp)).strftime('%H:%M_%Y%m%d')
    logger.info('graphite_until - %s' % str(graphite_until))
    # @added 20181025 - Feature #2618: alert_slack
    # Added date and time info so you do not have to mouseover the slack
    # message to determine the time at which the alert came in
    timezone = strftime("%Z", gmtime())
    # @modified 20181029 - Feature #2618: alert_slack
    # Use the standard UNIX data format
    # human_anomaly_time = dt.datetime.fromtimestamp(int(until_timestamp)).strftime('%Y-%m-%d %H:%M:%S')
    human_anomaly_time = dt.datetime.fromtimestamp(
        int(until_timestamp)).strftime('%c')
    slack_time_string = '%s %s' % (human_anomaly_time, timezone)

    if settings.GRAPHITE_PORT != '':
        if known_derivative_metric:
            link = '%s://%s:%s/render/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL,
                settings.GRAPHITE_HOST, settings.GRAPHITE_PORT,
                str(graphite_from), str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
        else:
            link = '%s://%s:%s/render/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL,
                settings.GRAPHITE_HOST, settings.GRAPHITE_PORT,
                str(graphite_from), str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
    else:
        if known_derivative_metric:
            link = '%s://%s/render/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                str(graphite_from), str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
        else:
            link = '%s://%s/render/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                str(graphite_from), str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)

    # slack does not allow embedded images, nor will it fetch links behind
    # authentication so Skyline uploads a png graphite image with the message
    image_file = None

    # Fetch the png from Graphite
    try:
        image_data = urllib2.urlopen(link).read()  # nosec
    except urllib2.URLError:
        logger.error(traceback.format_exc())
        logger.error('error :: alert_slack - failed to get image graph')
        logger.error('error :: alert_slack - %s' % str(link))
        image_data = None
    if image_data:
        image_file = '%s/%s.%s.graphite.%sh.png' % (
            settings.SKYLINE_TMP_DIR, base_name, skyline_app,
            str(int(graphite_previous_hours)))
        try:
            write_data_to_file(skyline_app, image_file, 'w', image_data)
            logger.info('alert_slack - added Graphite image :: %s' %
                        (image_file))
        except:
            logger.info(traceback.format_exc())
            logger.error(
                'error :: alert_slack - failed to add %s Graphite image' %
                (image_file))
            image_file = None
    try:
        filename = os.path.basename(image_file)
    except:
        filename = None

    try:
        bot_user_oauth_access_token = settings.BOUNDARY_SLACK_OPTS[
            'bot_user_oauth_access_token']
    except:
        logger.error(
            'error :: alert_slack - could not determine bot_user_oauth_access_token'
        )
        return False

    # Allow for absolute path metric namespaces but also allow for and match
    # wildcard namespaces if there is not an absolute path metric namespace
    channels = 'unknown'
    notify_channels = []
    matched_channels = []
    try:
        channels = settings.BOUNDARY_SLACK_OPTS['channels'][metric_name]
        notify_channels.append(channels)
    except:
        for channel in settings.BOUNDARY_SLACK_OPTS['channels']:
            CHECK_MATCH_PATTERN = channel
            check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
            pattern_match = check_match_pattern.match(metric_name)
            if pattern_match:
                matched_channels.append(channel)

    if matched_channels != []:
        for i_metric_name in matched_channels:
            channels = settings.BOUNDARY_SLACK_OPTS['channels'][i_metric_name]
            notify_channels.append(channels)

    if not notify_channels:
        logger.error('error :: alert_slack - could not determine channel')
        return False
    else:
        channels = notify_channels

    try:
        icon_emoji = settings.BOUNDARY_SLACK_OPTS['icon_emoji']
    except:
        icon_emoji = ':chart_with_upwards_trend:'

    try:
        sc = SlackClient(bot_user_oauth_access_token)
    except:
        logger.info(traceback.format_exc())
        logger.error('error :: alert_slack - could not initiate SlackClient')
        return False

    for channel in channels:
        initial_comment = slack_title + ' :: <' + link + '|graphite image link>\nFor anomaly at ' + slack_time_string
        try:
            # slack does not allow embedded images, nor links behind authentication
            # or color text, so we have jump through all the API hoops to end up
            # having to upload an image with a very basic message.
            if os.path.isfile(image_file):
                slack_file_upload = sc.api_call(
                    'files.upload',
                    filename=filename,
                    channels=channel,
                    initial_comment=initial_comment,
                    file=open(image_file, 'rb'))
                if not slack_file_upload['ok']:
                    logger.error(
                        'error :: alert_slack - failed to send slack message')
            else:
                send_text = initial_comment + '  ::  error :: there was no graph image to upload'
                send_message = sc.api_call('chat.postMessage',
                                           channel=channel,
                                           icon_emoji=icon_emoji,
                                           text=send_text)
                if not send_message['ok']:
                    logger.error(
                        'error :: alert_slack - failed to send slack message')
                else:
                    logger.info('alert_slack - sent slack message')
        except:
            logger.info(traceback.format_exc())
            logger.error('error :: alert_slack - could not upload file')
            return False
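The channel lookup above first tries the metric name as an exact key in settings.BOUNDARY_SLACK_OPTS['channels'] and then falls back to treating each key as a regex; a small sketch of that matching logic, with a purely hypothetical channels mapping:

import re

def resolve_channels(metric_name, channels_opts):
    """Return the channel entries whose key is either the exact metric name
    or a regex pattern that matches the metric name."""
    try:
        return [channels_opts[metric_name]]
    except KeyError:
        return [channels_opts[pattern] for pattern in channels_opts
                if re.match(pattern, metric_name)]

# Hypothetical mapping - the keys and channel names are illustrative only
channels_opts = {
    'carbon\\.relays\\..*': ['#graphite'],
    'stats.web01.cpu.user': ['#web-team'],
}
print(resolve_channels('carbon.relays.graphite01.metricsReceived', channels_opts))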
Example #4
def calculate_features_profile(current_skyline_app, timestamp, metric,
                               context):
    """
    Calculates a tsfresh features profile from a training data set

    :param current_skyline_app: the app calling the function
    :type current_skyline_app: str
    :param timestamp: the timestamp of metric anomaly with training data
    :type timestamp: str
    :param metric: the base_name of the metric
    :type metric: str
    :param context: the context in which the features profile is being
        calculated, e.g. training_data, features_profiles, ionosphere or
        ionosphere_learn
    :type context: str
    :return: (features_profile_csv_file_path, successful, fp_created, fp_id, fail_msg, traceback_format_exc, calc_time)
    :rtype: (str, boolean, boolean, int, str, str, str)
    """

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    base_name = str(metric)
    if context == 'training_data':
        log_context = 'training data'
    if context == 'features_profiles':
        log_context = 'features profile data'
    if context == 'ionosphere':
        log_context = 'ionosphere'
    # @added 20170114 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        log_context = 'ionosphere :: learn'

    current_logger.info('%s feature profile creation requested for %s at %s' %
                        (log_context, base_name, timestamp))

    timeseries_dir = base_name.replace('.', '/')
    if context == 'training_data' or context == 'ionosphere':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                        timestamp, timeseries_dir)
    if context == 'features_profiles':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_PROFILES_FOLDER,
                                        timeseries_dir, timestamp)

    # @added 20170113 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        metric_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_LEARN_FOLDER,
                                        timestamp, timeseries_dir)

    features_profile_created_file = '%s/%s.%s.fp.created.txt' % (
        metric_data_dir, str(timestamp), base_name)

    features_profile_details_file = '%s/%s.%s.fp.details.txt' % (
        metric_data_dir, str(timestamp), base_name)

    # @added 20170108 - Feature #1842: Ionosphere - Graphite now graphs
    # Added metric_check_file and ts_full_duration is needed to be determined
    # and added the to features_profile_details_file as it was not added here on
    # the 20170104 when it was added the webapp and ionosphere
    metric_var_filename = '%s.txt' % str(base_name)
    anomaly_check_file = '%s/%s' % (metric_data_dir, metric_var_filename)
    ts_full_duration = int(settings.FULL_DURATION)
    if os.path.isfile(anomaly_check_file):
        # Read the details file
        with open(anomaly_check_file, 'r') as f:
            anomaly_details = f.readlines()
            for i, line in enumerate(anomaly_details):
                if 'full_duration' in line:
                    _ts_full_duration = '%s' % str(line).split("'", 2)
                    full_duration_array = literal_eval(_ts_full_duration)
                    ts_full_duration = str(int(full_duration_array[1]))

    anomaly_json = '%s/%s.json' % (metric_data_dir, base_name)
    ts_csv = '%s/%s.tsfresh.input.csv' % (metric_data_dir, base_name)
    #    anomaly_json = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.json'
    #    ts_csv = '/opt/skyline/ionosphere/data/1480104000/stats/statsd/graphiteStats/calculationtime/stats.statsd.graphiteStats.calculationtime.tsfresh.input.csv'
    # This is simply to stay in line with tsfresh naming conventions in their
    # docs and examples
    fname_in = ts_csv
    t_fname_out = fname_in + '.features.transposed.csv'

    fp_id = None
    f_calc = 'unknown'
    if os.path.isfile(features_profile_details_file):
        current_logger.info('features profile details file exists - %s' %
                            (features_profile_details_file))
        try:
            with open(features_profile_details_file, 'r') as f:
                fp_details_str = f.read()
            fp_details_array = literal_eval(fp_details_str)
            f_calc = ' (previously calculated by Ionosphere) - %s' % str(
                fp_details_array[2])
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: failed to read from %s' %
                                 (features_profile_details_file))
    else:
        current_logger.info('OK no features profile details file exists - %s' %
                            (features_profile_details_file))

    fp_created = None
    if os.path.isfile(features_profile_created_file):
        current_logger.info('features profile created file exists - %s' %
                            (features_profile_created_file))
        try:
            with open(features_profile_created_file, 'r') as f:
                fp_created_str = f.read()
            fp_created_array = literal_eval(fp_created_str)
            fp_id = fp_created_array[0]
            fp_created = True
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error('error: failed to read fp_id from %s' %
                                 (features_profile_created_file))
    else:
        current_logger.info('OK no features profile created file exists - %s' %
                            (features_profile_created_file))

    if os.path.isfile(t_fname_out):
        current_logger.info('transposed features already exist - %s' %
                            (t_fname_out))
        return str(
            t_fname_out), True, fp_created, fp_id, 'none', 'none', f_calc

    start = timer()
    if os.path.isfile(anomaly_json):
        try:
            # Read the timeseries json file
            with open(anomaly_json, 'r') as f:
                raw_timeseries = f.read()
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            current_logger.error(
                'error: failed to read timeseries data from %s' %
                (anomaly_json))
            fail_msg = 'error: failed to read timeseries data from %s' % anomaly_json
            end = timer()
            return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc
    else:
        trace = 'none'
        fail_msg = 'error: file not found - %s' % (anomaly_json)
        current_logger.error(fail_msg)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(',
                                                       '[').replace(')', ']')
    timeseries = literal_eval(timeseries_array_str)

    datapoints = timeseries
    converted = []
    for datapoint in datapoints:
        try:
            new_datapoint = [float(datapoint[0]), float(datapoint[1])]
            converted.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)

    for ts, value in converted:
        # print('%s,%s' % (str(int(ts)), str(value)))
        utc_ts_line = '%s,%s,%s\n' % (metric, str(int(ts)), str(value))
        with open(ts_csv, 'a') as fh:
            fh.write(utc_ts_line)

    try:
        df = pd.read_csv(ts_csv,
                         delimiter=',',
                         header=None,
                         names=['metric', 'timestamp', 'value'])
        current_logger.info('DataFrame created with %s' % ts_csv)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error: failed to create a pandas DataFrame with %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # @added 20161207 - Task #1658: Patterning Skyline Ionosphere
    # Converting the DataFrame types to suit MySQL data types
    # For anyone doing a code review of Skyline, a number of questions arise
    # from the decision to deviate from json or storing msgpack as BLOB etc.
    # tsfresh used csv and we can get csv from Graphite etc., so Skyline
    # should be able to handle csv.  As for how data is stored in MySQL, this
    # was given considerable review and thought, given that Ionosphere and
    # Skyline in general should not be limited to the domain of analyzing
    # Graphite machine metrics, but should handle other timeseries data
    # sources too.
    #    df['feature_name'] = df['feature_name'].astype(string)
    #    df['value'] = df['value'].astype(float)

    # Test the DataFrame
    try:
        df_created = df.head()
        del df_created
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: failed to read the pandas DataFrame created with %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    df.columns = ['metric', 'timestamp', 'value']

    start_feature_extraction = timer()
    current_logger.info('starting extract_features with %s' %
                        str(TSFRESH_VERSION))
    try:
        # @modified 20161226 - Bug #1822: tsfresh extract_features process stalling
        # Changed to use the new ReasonableFeatureExtractionSettings that was
        # introduced in tsfresh-0.4.0 to exclude the computationally high cost
        # of extracting features from very static timeseries that has little to
        # no variation in the values, which results in features taking up to
        # almost 600 seconds to calculate on a timeseries of length 10075
        # (168h - 1 datapoint per 60s)
        # In terms of inline feature calculation, always exclude
        # high_comp_cost features.
        # df_features = extract_features(df, column_id='metric', column_sort='timestamp', column_kind=None, column_value=None)
        tsf_settings = ReasonableFeatureExtractionSettings()
        # Disable tqdm progress bar
        tsf_settings.disable_progressbar = True
        df_features = extract_features(
            df,
            column_id='metric',
            column_sort='timestamp',
            column_kind=None,
            column_value=None,
            feature_extraction_settings=tsf_settings)
        current_logger.info('features extracted from %s data' % ts_csv)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: extracting features with tsfresh from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        end_feature_extraction = timer()
        current_logger.info(
            'feature extraction failed in %.6f seconds' %
            (end_feature_extraction - start_feature_extraction))
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    end_feature_extraction = timer()
    feature_extraction_time = end_feature_extraction - start_feature_extraction
    current_logger.info('feature extraction took %.6f seconds' %
                        (feature_extraction_time))

    # write to disk
    fname_out = fname_in + '.features.csv'
    # df_features.to_csv(fname_out)

    # Transpose
    try:
        df_t = df_features.transpose()
        current_logger.info('features transposed')
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error :: transposing tsfresh features from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Create transposed features csv
    t_fname_out = fname_in + '.features.transposed.csv'
    try:
        df_t.to_csv(t_fname_out)
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        fail_msg = 'error: saving transposed tsfresh features from - %s' % ts_csv
        current_logger.error('%s' % fail_msg)
        if os.path.isfile(ts_csv):
            os.remove(ts_csv)
            current_logger.info('removed %s' % ts_csv)
        end = timer()
        return 'error', False, fp_created, fp_id, fail_msg, trace, f_calc

    # Calculate the count and sum of the features values
    try:
        df_sum = pd.read_csv(t_fname_out,
                             delimiter=',',
                             header=0,
                             names=['feature_name', 'value'])
        df_sum.columns = ['feature_name', 'value']
        df_sum['feature_name'] = df_sum['feature_name'].astype(str)
        df_sum['value'] = df_sum['value'].astype(float)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        current_logger.error('error :: failed to create Dataframe to sum')
    try:
        features_count = len(df_sum['value'])
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        current_logger.error(
            'error :: failed to count number of features, set to 0')
        features_count = 0
    try:
        features_sum = df_sum['value'].sum()
    except:
        trace = traceback.format_exc()
        current_logger.debug(trace)
        current_logger.error('error :: failed to sum feature values, set to 0')
        features_sum = 0

    end = timer()

    current_logger.info('features saved to %s' % (fname_out))
    current_logger.info('transposed features saved to %s' % (t_fname_out))
    total_calc_time = '%.6f' % (end - start)
    calc_time = '%.6f' % (feature_extraction_time)
    current_logger.info('total feature profile completed in %s seconds' %
                        str(total_calc_time))

    # Create a features profile details file
    try:
        # @modified 20170108 - Feature #1842: Ionosphere - Graphite now graphs
        # Added the ts_full_duration here as it was not added here on the 20170104
        # when it was added the webapp and ionosphere
        data = '[%s, \'%s\', %s, %s, %s, %s]' % (
            str(int(time.time())), str(tsfresh_version), str(calc_time),
            str(features_count), str(features_sum), str(ts_full_duration))
        write_data_to_file(current_skyline_app, features_profile_details_file,
                           'w', data)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: failed to write %s' % features_profile_details_file
        current_logger.error('%s' % fail_msg)

    if os.path.isfile(ts_csv):
        os.remove(ts_csv)
        current_logger.info('removed the created csv - %s' % ts_csv)

    # @added 20170112 - Feature #1854: Ionosphere learn - Redis ionosphere.learn.work namespace
    # Ionosphere learn needs Redis works sets, but this was moved to
    # ionosphere_backend.py and learn.py not done here

    return str(t_fname_out), True, fp_created, fp_id, 'none', 'none', str(
        calc_time)
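A condensed sketch of the tsfresh call this function builds up to, assuming a three-column DataFrame shaped like the .tsfresh.input.csv it writes; the import path for ReasonableFeatureExtractionSettings reflects the tsfresh 0.4.x era the comments above refer to and may differ in later tsfresh releases:

import pandas as pd
from tsfresh import extract_features
# Assumed tsfresh 0.4.x import path for the settings class used above
from tsfresh.feature_extraction.settings import ReasonableFeatureExtractionSettings

# Assumed input shape: one row per datapoint with a constant metric id, an
# integer timestamp and a float value - the same columns as the input csv
df = pd.DataFrame({
    'metric': ['example.metric'] * 4,
    'timestamp': [1609459200, 1609459260, 1609459320, 1609459380],
    'value': [1.0, 1.2, 0.9, 5.4],
})

tsf_settings = ReasonableFeatureExtractionSettings()
# Disable the tqdm progress bar, as the function above does
tsf_settings.disable_progressbar = True
df_features = extract_features(
    df, column_id='metric', column_sort='timestamp',
    column_kind=None, column_value=None,
    feature_extraction_settings=tsf_settings)
# Transpose so each row becomes (feature_name, value), the shape the
# .features.transposed.csv profile file uses
df_t = df_features.transpose()
print(df_t.head())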
Example #5
def run_algorithms(timeseries, timeseries_name, end_timestamp, full_duration,
                   timeseries_file, skyline_app, algorithms, alert_interval,
                   add_to_panorama, padded_timeseries, from_timestamp):
    """
    Iteratively run algorithms.
    """

    results_dir = os.path.dirname(timeseries_file)

    if not os.path.exists(results_dir):
        os.makedirs(results_dir, mode=0o755)

    start_analysis = int(time.time())

    triggered_algorithms = []
    anomalous = False
    # @added 20200427 - Feature #3500: webapp - crucible_process_metrics
    #                   Feature #1448: Crucible web UI
    # Added default alert_interval_discarded_anomalies_count so run_algorithms
    # does not return as failed
    alert_interval_discarded_anomalies_count = 0

    check_algorithms = []
    if str(algorithms) == "['all']":
        if skyline_app == 'analyzer':
            check_algorithms = ALGORITHMS
            logger.info('check_algorithms for analyzer - %s' %
                        (str(check_algorithms)))
        if skyline_app == 'mirage':
            check_algorithms = MIRAGE_ALGORITHMS
            logger.info('check_algorithms for mirage - %s' %
                        (str(check_algorithms)))
        if skyline_app == 'boundary':
            check_algorithms = algorithms
            logger.info('check_algorithms for boundary - %s' %
                        (str(check_algorithms)))
        if skyline_app == 'crucible':
            ALGORITHMS.append('detect_drop_off_cliff')
            check_algorithms = ALGORITHMS
            logger.info('check_algorithms for crucible - %s' %
                        (str(check_algorithms)))
    else:
        check_algorithms = algorithms
        logger.info('check_algorithms specified - %s' %
                    (str(check_algorithms)))

    if not check_algorithms:
        logger.info('check_algorithms unknown - %s' % (str(check_algorithms)))
        ALGORITHMS.append('detect_drop_off_cliff')
        check_algorithms = ALGORITHMS
        logger.info('check_algorithms - %s' % (str(check_algorithms)))

    logger.info('checking algorithms - %s on %s' %
                (str(check_algorithms), str(timeseries_file)))

    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
    # Plot Skyline anomalies if CONSENSUS is achieved
    anomalies = []

    # @added 20200422 - Feature #3500: webapp - crucible_process_metrics
    #                      Feature #1448: Crucible web UI
    # Added padded_timeseries.  If the time series is padded then set
    # the range appropriately so that the padded period data points are not
    # analysed for anomalies
    default_range = 10
    if padded_timeseries:
        default_range = 0
        for ts, value in timeseries:
            if int(ts) < from_timestamp:
                default_range += 1
            else:
                break
        logger.info('padded_timeseries - default range set to %s for %s' %
                    (str(default_range), str(timeseries_file)))

    for algorithm in check_algorithms:
        detected = ''
        try:
            x_vals = np.arange(len(timeseries))
            y_vals = np.array([y[1] for y in timeseries])
            # Match default graphite graph size
            plt.figure(figsize=(5.86, 3.08), dpi=100)
            plt.plot(x_vals, y_vals)

            # Start a couple datapoints in for the tail average
            # @modified 20200422 - Feature #3500: webapp - crucible_process_metrics
            #                      Feature #1448: Crucible web UI
            # If the time series is padded then use the appropriate range so
            # that the padded period data points are not analysed for anomalies
            # for index in range(10, len(timeseries)):
            for index in range(default_range, len(timeseries)):
                sliced = timeseries[:index]
                anomaly = globals()[algorithm](sliced, end_timestamp,
                                               full_duration)

                # Point out the datapoint if it's anomalous
                if anomaly:
                    plt.plot([index], [sliced[-1][1]], 'ro')
                    detected = "DETECTED"
                    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
                    # Add the anomaly to the anomalies list to plot Skyline
                    # anomalies if CONSENSUS is achieved
                    anomalies.append([sliced[-1][0], sliced[-1][1], algorithm])

            if detected == "DETECTED":
                results_filename = join(results_dir + "/" + algorithm + "." +
                                        detected + ".png")
                logger.info('ANOMALY DETECTED :: with %s on %s' %
                            (algorithm, str(timeseries_file)))
                anomalous = True
                triggered_algorithms.append(algorithm)
            else:
                results_filename = join(results_dir + "/" + algorithm + ".png")

            try:
                plt.savefig(results_filename, dpi=100)
                logger.info('saved %s plot :: %s' %
                            (algorithm, results_filename))
                if python_version == 2:
                    # @modified 20200327 - Branch #3262: py3
                    # os.chmod(results_filename, 0644)
                    os.chmod(results_filename, 0o644)
                if python_version == 3:
                    os.chmod(results_filename, mode=0o644)
            except:
                logger.error('error :: %s' % (traceback.format_exc()))
                logger.error('error :: failed to save %s for %s' %
                             (str(results_filename), str(timeseries_file)))
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error(
                'error :: error thrown in algorithm running and plotting - %s on %s'
                % (str(algorithm), str(timeseries_file)))

    end_analysis = int(time.time())
    # @modified 20160814 - pyflaked
    # seconds_to_run = end_analysis - start_analysis
    #    logger.info(
    #        'analysis of %s at a full duration of %s took %s seconds' %
    #        (timeseries_name, str(full_duration), str(seconds_to_run)))

    # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
    #                   Feature #1448: Crucible web UI
    # Added last_anomaly_timestamp to apply alert_interval against and
    # alert_interval_discarded_anomalies.  If the alert interval is passed
    # Crucible will only report Skyline CONSENSUS anomalies if the time between
    # the last anomaly is not alert_interval less than the specified
    # alert_interval period.  This enables Crucible to mimic Analyzer and Mirage
    # and apply a EXPIRATION_TIME type methodology to identifying anomalies like
    # Analyzer would.  This makes Crucible work SOMEWHAT like Analyzer, however
    # is still a bit different as with Crucible the time series grows, like a
    # new metric would.
    # Set the last_anomaly_timestamp to the appropriate timestamp before the
    # alert_interval if alert_interval is set, if it is not it does not matter
    # as alert_interval and alert_interval_discarded_anomalies will not be
    # applied.
    # @modified 20200427 - Feature #3500: webapp - crucible_process_metrics
    #                      Feature #1448: Crucible web UI
    # Wrap timeseries_start_timestamp variable in try so on fail the process
    # does not hang
    try:
        timeseries_start_timestamp = int(timeseries[0][0])
    except:
        logger.error('error :: %s' % (traceback.format_exc()))
        logger.error(
            'error :: failed to determine timeseries_start_timestamp from %s' %
            str(timeseries_file))
        timeseries_start_timestamp = 0

    # @modified 20200427 - Feature #3500: webapp - crucible_process_metrics
    #                      Feature #1448: Crucible web UI
    # if alert_interval:
    last_anomaly_timestamp = timeseries_start_timestamp
    if alert_interval and timeseries_start_timestamp:
        last_anomaly_timestamp = timeseries_start_timestamp - (alert_interval +
                                                               1)
    else:
        last_anomaly_timestamp = timeseries_start_timestamp
    alert_interval_discarded_anomalies = []
    # To apply alert_interval the anomalies object needs to be sorted by
    # timestamp as the anomalies are added per algorithm so they are not
    # timestamp ordered, but timestamp ordered per algorithm
    if anomalies and alert_interval:
        try:
            logger.info(
                'info :: last_anomaly_timestamp set to %s for alert_interval check on %s'
                % (str(last_anomaly_timestamp), str(timeseries_file)))
            logger.info(
                'info :: sorting anomalies %s to apply alert_interval check on %s'
                % (str(len(anomalies)), str(timeseries_file)))
            sorted_anomalies = sorted(anomalies, key=lambda x: x[0])
            anomalies = sorted_anomalies
            del sorted_anomalies
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error('error :: failed to create sorted_anomalies on %s' %
                         str(timeseries_file))

    # @added 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
    # Allow the user to pass run_algorithms to run
    use_consensus = 6
    try:
        try:
            from settings import CONSENSUS as use_consensus
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to set use_consensus')
            use_consensus = 6
        if len(check_algorithms) <= use_consensus:
            use_consensus = len(check_algorithms)
            logger.info(
                'check_algorithms passed with the number of algorithms less than CONSENSUS, use_consensus set to %s'
                % (str(use_consensus)))
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: failed to set CONSENSUS')

    # @added 20190611 - Feature #3106: crucible - skyline.consensus.anomalies.png
    # Plot Skyline anomalies where CONSENSUS achieved and create file resources
    # skyline.anomalies_score.txt and skyline.anomalies.csv
    anomalies_score = []
    if anomalies:
        for ts, value, algo in anomalies:
            try:
                processed = False
                algorithms_triggered = []
                if anomalies_score:
                    for i in anomalies_score:
                        if i[0] == ts:
                            processed = True
                            continue
                if processed:
                    continue
                for w_ts, w_value, w_algo in anomalies:
                    if w_ts == ts:
                        algorithms_triggered.append(w_algo)
                # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
                #                      Feature #1448: Crucible web UI
                # Added last_anomaly_timestamp to apply alert_interval against and
                # alert_interval_discarded_anomalies.  If the alert interval is passed
                append_anomaly = True

                if algorithms_triggered:
                    consensus = len(algorithms_triggered)

                    # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
                    #                      Feature #1448: Crucible web UI
                    # Added last_anomaly_timestamp to apply alert_interval against and
                    # alert_interval_discarded_anomalies.  If the alert interval is passed
                    # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
                    # if consensus >= CONSENSUS:
                    if consensus >= use_consensus:
                        current_anomaly_timestamp = int(ts)
                        if alert_interval and last_anomaly_timestamp:
                            time_between_anomalies = current_anomaly_timestamp - last_anomaly_timestamp
                            if time_between_anomalies < alert_interval:
                                try:
                                    discard_anomaly = [
                                        ts, value, consensus,
                                        algorithms_triggered
                                    ]
                                    # This logs a lot if enabled
                                    # logger.info('debug :: time_between_anomalies %s is less than alert_interval %s, last_anomaly_timestamp set to %s and current_anomaly_timestamp is %s - discarding %s' % (
                                    #     str(time_between_anomalies), str(alert_interval),
                                    #     str(last_anomaly_timestamp),
                                    #     str(current_anomaly_timestamp), str(discard_anomaly)))
                                    alert_interval_discarded_anomalies.append(
                                        discard_anomaly)
                                    append_anomaly = False
                                except:
                                    logger.error(traceback.format_exc())
                                    logger.error(
                                        'error :: failed to append to alert_interval_discarded_anomalies on %s'
                                        % str(timeseries_file))

                    # @modified 20200421 -  Feature #3500: webapp - crucible_process_metrics
                    #                       Feature #1448: Crucible web UI
                    # Only append if append_anomaly
                    # anomalies_score.append([ts, value, consensus, algorithms_triggered])
                    if append_anomaly:
                        anomalies_score.append(
                            [ts, value, consensus, algorithms_triggered])
                        # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
                        # if consensus >= CONSENSUS:
                        if consensus >= use_consensus:
                            last_anomaly_timestamp = int(ts)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: failed to process anomalies entry on %s' %
                    str(timeseries_file))

        # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
        #                   Feature #1448: Crucible web UI
        # Added alert_interval_discarded_anomalies
        if alert_interval:
            if alert_interval_discarded_anomalies:
                logger.info(
                    'info :: discarded %s anomalies due to them being within the alert_interval period on %s'
                    % (str(len(alert_interval_discarded_anomalies)),
                       str(timeseries_file)))
            else:
                logger.info(
                    'info :: no anomalies were discarded due to them being within the alert_interval period on %s'
                    % str(timeseries_file))

        try:
            logger.info(
                'info :: plotting skyline.consensus.anomalies.png for %s' %
                str(timeseries_file))
            x_vals = np.arange(len(timeseries))
            y_vals = np.array([y[1] for y in timeseries])
            # Match default graphite graph size
            plt.figure(figsize=(5.86, 3.08), dpi=100)
            plt.plot(x_vals, y_vals)
            for index in range(10, len(timeseries)):
                anomaly = False
                sliced = timeseries[:index]
                for i in anomalies_score:
                    if sliced[-1][0] == i[0]:
                        # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
                        # if i[2] >= CONSENSUS:
                        if i[2] >= use_consensus:
                            anomaly = True
                # Point out the datapoint if it is anomalous according to
                # Skyline CONSENSUS
                if anomaly:
                    plt.plot([index], [sliced[-1][1]], 'ro')
            results_filename = join(results_dir +
                                    "/skyline.consensus.anomalies.png")
            plt.savefig(results_filename, dpi=100)
            if python_version == 2:
                # @modified 20200327 - Branch #3262: py3
                # os.chmod(results_filename, 0644)
                os.chmod(results_filename, 0o644)
            if python_version == 3:
                os.chmod(results_filename, mode=0o644)
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error(
                'error :: failed plotting skyline.consensus.anomalies.png for %s'
                % str(timeseries_file))

        anomalies_filename = join(results_dir + "/skyline.anomalies_score.txt")
        try:
            logger.info('info :: creating anomalies_filename - %s for %s' %
                        (anomalies_filename, str(timeseries_file)))
            write_data_to_file(skyline_app, anomalies_filename, 'w',
                               str(anomalies_score))
        except:
            logger.error('error :: %s' % (traceback.format_exc()))
            logger.error(
                'error :: failed creating anomalies_filename - %s for %s' %
                (anomalies_filename, str(timeseries_file)))

        anomalies_csv = join(results_dir + "/skyline.anomalies.csv")
        logger.info('info :: creating anomalies_csv - %s for %s' %
                    (anomalies_csv, str(timeseries_file)))
        try:
            with open(anomalies_csv, 'w') as fh:
                fh.write(
                    'timestamp,value,consensus_count,triggered_algorithms\n')
            for ts, value, consensus, algorithms_triggered in anomalies_score:
                try:
                    algos_str = str(algorithms_triggered)
                    triggered_algorithms = algos_str.replace(',', ' ')
                    line = '%s,%s,%s,%s\n' % (str(ts), str(value),
                                              str(consensus),
                                              str(triggered_algorithms))
                    with open(anomalies_csv, 'a') as fh:
                        fh.write(line)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: could not write to file %s for %s' %
                                 (anomalies_csv, str(timeseries_file)))
            if python_version == 2:
                # @modified 20200327 - Branch #3262: py3
                # os.chmod(anomalies_csv, 0644)
                os.chmod(anomalies_csv, 0o644)
            if python_version == 3:
                os.chmod(anomalies_csv, mode=0o644)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: could not write to file %s for %s' %
                         (anomalies_csv, str(timeseries_file)))
        logger.info('info :: created anomalies_csv OK for %s' %
                    str(timeseries_file))
        # @added 20200421 - Feature #3500: webapp - crucible_process_metrics
        #                      Feature #1448: Crucible web UI
        # Added alert_interval_discarded_anomalies
        alert_interval_discarded_anomalies_count = len(
            alert_interval_discarded_anomalies)
        if alert_interval_discarded_anomalies:
            alert_interval_discarded_anomalies_csv = join(
                results_dir +
                '/skyline.alert_interval_discarded_anomalies.csv')
            logger.info(
                'info :: writing %s alert_interval discarded anomalies to %s for %s'
                %
                (str(len(alert_interval_discarded_anomalies)),
                 alert_interval_discarded_anomalies_csv, str(timeseries_file)))
            try:
                with open(alert_interval_discarded_anomalies_csv, 'w') as fh:
                    fh.write('timestamp,value,consensus,triggered_algorithms\n')
                for ts, value, consensus, algorithms_triggered in alert_interval_discarded_anomalies:
                    try:
                        line = '%s,%s,%s,%s\n' % (str(ts), str(value),
                                                  str(consensus),
                                                  str(algorithms_triggered))
                        with open(alert_interval_discarded_anomalies_csv,
                                  'a') as fh:
                            fh.write(line)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: could not write to file %s for %s' %
                            (alert_interval_discarded_anomalies_csv,
                             str(timeseries_file)))
                if python_version == 2:
                    os.chmod(alert_interval_discarded_anomalies_csv, 0o644)
                if python_version == 3:
                    os.chmod(alert_interval_discarded_anomalies_csv,
                             mode=0o644)
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: could not write to file %s for %s' %
                             (alert_interval_discarded_anomalies_csv,
                              str(timeseries_file)))
    else:
        logger.info('0 anomalies found for %s' % str(timeseries_file))

    return anomalous, triggered_algorithms, alert_interval_discarded_anomalies_count
Example #6
0
def alert_smtp(alert, metric, context):
    """
    Called by :func:`~trigger_alert` and sends an alert via smtp to the
    recipients that are configured for the metric.

    """
    LOCAL_DEBUG = False
    logger = logging.getLogger(skyline_app_logger)
    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - sending smtp alert')
        logger.info('debug :: alert_smtp - Memory usage at start: %s (kb)' %
                    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    # FULL_DURATION to hours so that analyzer surfaces the relevant timeseries data
    # in the graph
    full_duration_in_hours = int(settings.FULL_DURATION) / 3600

    # @added 20161229 - Feature #1830: Ionosphere alerts
    # Added Ionosphere variables
    base_name = str(metric[1]).replace(settings.FULL_NAMESPACE, '', 1)
    if settings.IONOSPHERE_ENABLED:
        timeseries_dir = base_name.replace('.', '/')
        training_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                          str(int(metric[2])), timeseries_dir)
        graphite_image_file = '%s/%s.%s.graphite.%sh.png' % (
            training_data_dir, base_name, skyline_app,
            str(int(full_duration_in_hours)))
        json_file = '%s/%s.%s.redis.%sh.json' % (
            training_data_dir, base_name, skyline_app,
            str(int(full_duration_in_hours)))
        training_data_redis_image = '%s/%s.%s.redis.plot.%sh.png' % (
            training_data_dir, base_name, skyline_app,
            str(int(full_duration_in_hours)))

    # For backwards compatibility
    if '@' in alert[1]:
        sender = settings.ALERT_SENDER
        recipient = alert[1]
    else:
        sender = settings.SMTP_OPTS['sender']
        # @modified 20160806 - Added default_recipient
        try:
            recipients = settings.SMTP_OPTS['recipients'][alert[0]]
            use_default_recipient = False
        except:
            use_default_recipient = True
        if use_default_recipient:
            try:
                recipients = settings.SMTP_OPTS['default_recipient']
                logger.info(
                    'alert_smtp - using default_recipient as no recipients are configured for %s'
                    % str(alert[0]))
            except:
                logger.error(
                    'error :: alert_smtp - no known recipient for %s' %
                    str(alert[0]))
                return False

    # Backwards compatibility
    if type(recipients) is str:
        recipients = [recipients]

    # @added 20180524 - Task #2384: Change alerters to cc other recipients
    # The alerters did send an individual email to each recipient. This would be
    # more useful if one email was sent with the first smtp recipient being the
    # to recipient and the subsequent recipients added in cc.
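    # Illustrative example (hypothetical addresses): with
    # recipients = ['ops@example.com', 'dev@example.com', 'sec@example.com']
    # the loop below yields primary_recipient = 'ops@example.com' and
    # cc_recipients = 'dev@example.com,sec@example.com'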
    if recipients:
        primary_recipient = False
        cc_recipients = False
        for i_recipient in recipients:
            if not primary_recipient:
                primary_recipient = str(i_recipient)
            if primary_recipient != i_recipient:
                if not cc_recipients:
                    cc_recipients = str(i_recipient)
                else:
                    new_cc_recipients = '%s,%s' % (str(cc_recipients),
                                                   str(i_recipient))
                    cc_recipients = str(new_cc_recipients)
        logger.info(
            'alert_smtp - will send to primary_recipient :: %s, cc_recipients :: %s'
            % (str(primary_recipient), str(cc_recipients)))

    # @modified 20161229 - Feature #1830: Ionosphere alerts
    # Ionosphere alerts
    unencoded_graph_title = 'Skyline %s - ALERT at %s hours - %s' % (
        context, str(int(full_duration_in_hours)), str(metric[0]))
    # @modified 20170603 - Feature #2034: analyse_derivatives
    # Added derivative functions to convert the values of strictly
    # monotonically increasing metrics to their derivative products in alert
    # graphs and note it in the graph_title
    known_derivative_metric = False
    try:
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        if settings.REDIS_PASSWORD:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                password=settings.REDIS_PASSWORD,
                unix_socket_path=settings.REDIS_SOCKET_PATH)
        else:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                unix_socket_path=settings.REDIS_SOCKET_PATH)
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: alert_smtp - redis connection failed')
    try:
        derivative_metrics = list(
            REDIS_ALERTER_CONN.smembers('derivative_metrics'))
    except:
        derivative_metrics = []
    redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(base_name))
    if redis_metric_name in derivative_metrics:
        known_derivative_metric = True
    if known_derivative_metric:
        try:
            non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS
        except:
            non_derivative_monotonic_metrics = []
        skip_derivative = in_list(redis_metric_name,
                                  non_derivative_monotonic_metrics)
        if skip_derivative:
            known_derivative_metric = False
    if known_derivative_metric:
        unencoded_graph_title = 'Skyline %s - ALERT at %s hours - derivative graph - %s' % (
            context, str(int(full_duration_in_hours)), str(metric[0]))

    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - unencoded_graph_title: %s' %
                    unencoded_graph_title)
    graph_title_string = quote(unencoded_graph_title, safe='')
    graph_title = '&title=%s' % graph_title_string

    graphite_port = '80'
    if settings.GRAPHITE_PORT != '':
        graphite_port = str(settings.GRAPHITE_PORT)

    link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(%s)%s%s&colorList=orange' % (
        settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port,
        str(int(full_duration_in_hours)), metric[1],
        settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
    # @added 20170603 - Feature #2034: analyse_derivatives
    if known_derivative_metric:
        link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
            settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port,
            str(int(full_duration_in_hours)), metric[1],
            settings.GRAPHITE_GRAPH_SETTINGS, graph_title)

    content_id = metric[1]
    image_data = None
    if settings.SMTP_OPTS.get('embed-images'):
        # @added 20161229 - Feature #1830: Ionosphere alerts
        # Use existing data if files exist
        if os.path.isfile(graphite_image_file):
            try:
                with open(graphite_image_file, 'r') as f:
                    image_data = f.read()
                logger.info('alert_smtp - using existing png - %s' %
                            graphite_image_file)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: alert_smtp - failed to read image data from existing png - %s'
                    % graphite_image_file)
                logger.error('error :: alert_smtp - %s' % str(link))
                image_data = None

        if image_data is None:
            try:
                # @modified 20170913 - Task #2160: Test skyline with bandit
                # Added nosec to exclude from bandit tests
                image_data = urllib2.urlopen(link).read()  # nosec
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - image data OK')
            except urllib2.URLError:
                logger.error(traceback.format_exc())
                logger.error('error :: alert_smtp - failed to get image graph')
                logger.error('error :: alert_smtp - %s' % str(link))
                image_data = None
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - image data None')

    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage after image_data: %s (kb)' %
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    # If we failed to get the image or if it was explicitly disabled,
    # use the image URL instead of the content.
    if image_data is None:
        img_tag = '<img src="%s"/>' % link
    else:
        img_tag = '<img src="cid:%s"/>' % content_id
        if settings.ENABLE_DEBUG or LOCAL_DEBUG:
            logger.info('debug :: alert_smtp - img_tag: %s' % img_tag)

        if settings.IONOSPHERE_ENABLED:
            # Create Ionosphere Graphite image
            # @modified 20161229 - Feature #1830: Ionosphere alerts
            # Only write the data to the file if it does not exist
            if not os.path.isfile(graphite_image_file):
                try:
                    write_data_to_file(skyline_app, graphite_image_file, 'w',
                                       image_data)
                    logger.info('added %s Ionosphere Graphite image :: %s' %
                                (skyline_app, graphite_image_file))
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: failed to add %s Ionosphere Graphite image - %s' %
                        (skyline_app, graphite_image_file))
            else:
                logger.info(
                    '%s Ionosphere Graphite image already exists :: %s' %
                    (skyline_app, graphite_image_file))

    redis_image_data = None
    try:
        plot_redis_data = settings.PLOT_REDIS_DATA
    except:
        plot_redis_data = False

    if settings.SMTP_OPTS.get('embed-images') and plot_redis_data:
        # Create graph from Redis data
        redis_metric_key = '%s%s' % (settings.FULL_NAMESPACE, metric[1])
        try:
            raw_series = REDIS_ALERTER_CONN.get(redis_metric_key)
            if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                logger.info('debug :: alert_smtp - raw_series: %s' % 'OK')
        except:
            if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                logger.info('debug :: alert_smtp - raw_series: %s' % 'FAIL')

        try:
            if LOCAL_DEBUG:
                logger.info(
                    'debug :: alert_smtp - Memory usage before get Redis timeseries data: %s (kb)'
                    % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            unpacker = Unpacker(use_list=True)
            unpacker.feed(raw_series)
            timeseries_x = [float(item[0]) for item in unpacker]
            unpacker = Unpacker(use_list=True)
            unpacker.feed(raw_series)
            timeseries_y = [item[1] for item in unpacker]

            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
            if LOCAL_DEBUG:
                logger.info(
                    'debug :: alert_smtp - Memory usage after get Redis timeseries data: %s (kb)'
                    % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        except:
            logger.error('error :: alert_smtp - unpack timeseries failed')
            timeseries = None

        if settings.IONOSPHERE_ENABLED and timeseries:
            '''
            .. todo: this could possibly be used to allow the user to submit the
                FULL_DURATION duration data set for the features profile to be
                created against IF it is a Mirage metric.  This would allow for
                additional granularity in Mirage metrics, thereby maintaining
                their seasonality, but allow user and Skyline to analyze the
                anomaly at a FULL_DURATION resolution as well.  Not sure how to
                code that in the Ionosphere context yet, but it could just be an
                additional flag in the Ionosphere record.  In the Ionosphere
                frontend, the
                user would be given an option to either create the features
                profile on the Mirage timeseries or the redis FULL_DURATION
                timeseries.  It is a little complicated, but doable.
                # @modified 20161229 - Feature #1828: ionosphere - mirage Redis data features
                However that ^^ is UNDESIRABLE in the Mirage/Ionosphere context
                at the moment.  Ionosphere must only profile SECOND_ORDER_RESOLUTION_HOURS
                currently so as to not pollute the seasonality aspect of Mirage
            '''
            # Create Ionosphere redis timeseries json if it does not exist
            # @modified 20161229 - Feature #1830: Ionosphere alerts
            # Only write the data to the file if it does not exist and replace
            # the timeseries object if a json file exists

            # @added 20170920 - Bug #2168: Strange Redis derivative graph
            using_original_redis_json = False

            if not os.path.isfile(json_file):
                timeseries_json = str(timeseries).replace('[', '(').replace(
                    ']', ')')
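                # Illustrative example (hypothetical values): a timeseries of
                # [[1482320760, 1.0], [1482320820, 2.0]] is written out as
                # ((1482320760, 1.0), (1482320820, 2.0)) and is converted back
                # and parsed with literal_eval when the json file is reused.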
                try:
                    write_data_to_file(skyline_app, json_file, 'w',
                                       timeseries_json)
                    logger.info(
                        'added %s Ionosphere Redis data timeseries json file :: %s'
                        % (skyline_app, json_file))
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: failed to add %s Ionosphere Redis data timeseries json file - %s'
                        % (skyline_app, json_file))
            else:
                # Replace the timeseries object
                logger.info(
                    '%s Ionosphere Redis data timeseries json file already exists, using :: %s'
                    % (skyline_app, json_file))
                anomaly_json = json_file
                try:
                    # Read the timeseries json file
                    with open(anomaly_json, 'r') as f:
                        raw_timeseries = f.read()
                    timeseries_array_str = str(raw_timeseries).replace(
                        '(', '[').replace(')', ']')
                    timeseries = literal_eval(timeseries_array_str)
                    logger.info(
                        '%s Redis timeseries replaced with timeseries from :: %s'
                        % (skyline_app, anomaly_json))
                    timeseries_x = [float(item[0]) for item in timeseries]
                    timeseries_y = [item[1] for item in timeseries]
                    # @added 20170920 - Bug #2168: Strange Redis derivative graph
                    # This already has nonNegativeDerivative applied to it
                    using_original_redis_json = True
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s failed to read timeseries data from %s' %
                        (skyline_app, anomaly_json))
                    timeseries = None

        # @added 20170603 - Feature #2034: analyse_derivatives
        if known_derivative_metric:

            # @added 20170920 - Bug #2168: Strange Redis derivative graph
            # If this is the Mirage Redis json it already has
            # nonNegativeDerivative applied to it
            if not using_original_redis_json:
                logger.info('alert_smtp - nonNegativeDerivative being applied')

                try:
                    derivative_timeseries = nonNegativeDerivative(timeseries)
                    timeseries = derivative_timeseries
                    # @added 20170920 - Bug #2168: Strange Redis derivative graph
                    logger.info('alert_smtp - nonNegativeDerivative applied')
                except:
                    logger.error(
                        'error :: alert_smtp - nonNegativeDerivative failed')
            else:
                logger.info(
                    'alert_smtp - nonNegativeDerivative not being applied, as it will have been applied in the original json'
                )

        # @added 20170726 - Bug #2068: Analyzer smtp alert error on Redis plot with derivative metrics
        # If the nonNegativeDerivative has been calculated we need to reset the
        # x and y values, as nonNegativeDerivative has to discard the first
        # value (it has no delta), so the timeseries is 1 item shorter.
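        # Illustrative example (hypothetical values): [(t1, 10), (t2, 12),
        # (t3, 15)] becomes [(t2, 2), (t3, 3)] after nonNegativeDerivative,
        # so x and y are rebuilt here from the one-item-shorter timeseries.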
        timeseries_x = [float(item[0]) for item in timeseries]
        timeseries_y = [item[1] for item in timeseries]

        pd_series_values = None
        if timeseries:
            try:
                if LOCAL_DEBUG:
                    logger.info(
                        'debug :: alert_smtp - Memory usage before pd.Series: %s (kb)'
                        % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                values = pd.Series([x[1] for x in timeseries])
                # Because the truth value of a Series is ambiguous
                pd_series_values = True
                if LOCAL_DEBUG:
                    logger.info(
                        'debug :: alert_smtp - Memory usage after pd.Series: %s (kb)'
                        % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            except:
                logger.error(
                    'error :: alert_smtp - pandas value series on timeseries failed'
                )

        if pd_series_values:
            try:
                array_median = np.median(values)
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - values median: %s' %
                                str(array_median))

                array_amax = np.amax(values)
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - array_amax: %s' %
                                str(array_amax))
                array_amin = np.amin(values)
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - array_amin: %s' %
                                str(array_amin))
                mean = values.mean()
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - mean: %s' % str(mean))
                stdDev = values.std()
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - stdDev: %s' %
                                str(stdDev))

                sigma3 = 3 * stdDev
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - sigma3: %s' %
                                str(sigma3))

                # sigma3_series = [sigma3] * len(values)

                sigma3_upper_bound = mean + sigma3
                try:
                    sigma3_lower_bound = mean - sigma3
                except:
                    sigma3_lower_bound = 0

                sigma3_upper_series = [sigma3_upper_bound] * len(values)
                sigma3_lower_series = [sigma3_lower_bound] * len(values)
                amax_series = [array_amax] * len(values)
                amin_series = [array_amin] * len(values)
                mean_series = [mean] * len(values)
            except:
                logger.error(
                    'error :: alert_smtp - numpy ops on series failed')
                mean_series = None

        if mean_series:
            graph_title = 'Skyline %s - ALERT - at %s hours - Redis data\n%s - anomalous value: %s' % (
                context, str(
                    int(full_duration_in_hours)), metric[1], str(metric[0]))
            # @added 20170603 - Feature #2034: analyse_derivatives
            if known_derivative_metric:
                graph_title = 'Skyline %s - ALERT - at %s hours - Redis data (derivative graph)\n%s - anomalous value: %s' % (
                    context, str(int(full_duration_in_hours)), metric[1],
                    str(metric[0]))

            # @modified 20160814 - Bug #1558: Memory leak in Analyzer
            # I think the buf is causing a memory leak, trying a file
            # if python_version == 3:
            #     buf = io.StringIO()
            # else:
            #     buf = io.BytesIO()
            buf = '%s/%s.%s.%s.png' % (settings.SKYLINE_TMP_DIR, skyline_app,
                                       str(int(metric[2])), metric[1])

            if LOCAL_DEBUG:
                logger.info(
                    'debug :: alert_smtp - Memory usage before plot Redis data: %s (kb)'
                    % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

            # Too big
            # rcParams['figure.figsize'] = 12, 6
            rcParams['figure.figsize'] = 8, 4
            try:
                # fig = plt.figure()
                fig = plt.figure(frameon=False)
                ax = fig.add_subplot(111)
                ax.set_title(graph_title, fontsize='small')
                # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - Luminosity
                #                      IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor'
                # ax.set_axis_bgcolor('black')
                if hasattr(ax, 'set_facecolor'):
                    ax.set_facecolor('black')
                else:
                    ax.set_axis_bgcolor('black')

                try:
                    datetimes = [
                        dt.datetime.utcfromtimestamp(ts) for ts in timeseries_x
                    ]
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info('debug :: alert_smtp - datetimes: %s' %
                                    'OK')
                except:
                    logger.error('error :: alert_smtp - datetimes: %s' %
                                 'FAIL')

                plt.xticks(rotation=0, horizontalalignment='center')
                xfmt = DateFormatter('%a %H:%M')
                plt.gca().xaxis.set_major_formatter(xfmt)

                ax.xaxis.set_major_formatter(xfmt)

                ax.plot(datetimes,
                        timeseries_y,
                        color='orange',
                        lw=0.6,
                        zorder=3)
                ax.tick_params(axis='both', labelsize='xx-small')

                max_value_label = 'max - %s' % str(array_amax)
                ax.plot(datetimes,
                        amax_series,
                        lw=1,
                        label=max_value_label,
                        color='m',
                        ls='--',
                        zorder=4)
                min_value_label = 'min - %s' % str(array_amin)
                ax.plot(datetimes,
                        amin_series,
                        lw=1,
                        label=min_value_label,
                        color='b',
                        ls='--',
                        zorder=4)
                mean_value_label = 'mean - %s' % str(mean)
                ax.plot(datetimes,
                        mean_series,
                        lw=1.5,
                        label=mean_value_label,
                        color='g',
                        ls='--',
                        zorder=4)

                sigma3_text = (r'3$\sigma$')
                # sigma3_label = '%s - %s' % (str(sigma3_text), str(sigma3))

                sigma3_upper_label = '%s upper - %s' % (
                    str(sigma3_text), str(sigma3_upper_bound))
                ax.plot(datetimes,
                        sigma3_upper_series,
                        lw=1,
                        label=sigma3_upper_label,
                        color='r',
                        ls='solid',
                        zorder=4)

                if sigma3_lower_bound > 0:
                    sigma3_lower_label = '%s lower - %s' % (
                        str(sigma3_text), str(sigma3_lower_bound))
                    ax.plot(datetimes,
                            sigma3_lower_series,
                            lw=1,
                            label=sigma3_lower_label,
                            color='r',
                            ls='solid',
                            zorder=4)

                ax.get_yaxis().get_major_formatter().set_useOffset(False)
                ax.get_yaxis().get_major_formatter().set_scientific(False)

                # Shrink current axis's height by 10% on the bottom
                box = ax.get_position()
                ax.set_position([
                    box.x0, box.y0 + box.height * 0.1, box.width,
                    box.height * 0.9
                ])

                # Put a legend below current axis
                ax.legend(loc='upper center',
                          bbox_to_anchor=(0.5, -0.05),
                          fancybox=True,
                          shadow=True,
                          ncol=4,
                          fontsize='x-small')
                plt.rc('lines', lw=2, color='w')

                plt.grid(True)

                ax.grid(b=True,
                        which='both',
                        axis='both',
                        color='lightgray',
                        linestyle='solid',
                        alpha=0.5,
                        linewidth=0.6)
                # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - Luminosity
                #                      IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor'
                # ax.set_axis_bgcolor('black')
                if hasattr(ax, 'set_facecolor'):
                    ax.set_facecolor('black')
                else:
                    ax.set_axis_bgcolor('black')

                rcParams['xtick.direction'] = 'out'
                rcParams['ytick.direction'] = 'out'
                ax.margins(y=.02, x=.03)
                # tight_layout removes the legend box
                # fig.tight_layout()
                try:
                    if LOCAL_DEBUG:
                        logger.info(
                            'debug :: alert_smtp - Memory usage before plt.savefig: %s (kb)'
                            %
                            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                    plt.savefig(buf, format='png')

                    if settings.IONOSPHERE_ENABLED:
                        if not os.path.exists(training_data_dir):
                            mkdir_p(training_data_dir)
                            logger.info('created dir - %s' % training_data_dir)

                        if not os.path.isfile(training_data_redis_image):
                            try:
                                plt.savefig(training_data_redis_image,
                                            format='png')
                                logger.info(
                                    'alert_smtp - save Redis training data image - %s'
                                    % (training_data_redis_image))
                            except:
                                logger.info(traceback.format_exc())
                                logger.error(
                                    'error :: alert_smtp - could not save - %s'
                                    % (training_data_redis_image))
                        else:
                            logger.info(
                                'alert_smtp - Redis training data image already exists - %s'
                                % (training_data_redis_image))

                    # @added 20160814 - Bug #1558: Memory leak in Analyzer
                    # As per http://www.mail-archive.com/[email protected]/msg13222.html
                    # savefig in the parent process was causing the memory leak
                    # the below fig.clf() and plt.close() did not resolve this
                    # however spawning a multiprocessing process for alert_smtp
                    # does solve this issue as all memory is freed when the
                    # process terminates.
                    fig.clf()
                    plt.close(fig)
                    redis_graph_content_id = 'redis.%s' % metric[1]
                    redis_image_data = True
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info('debug :: alert_smtp - savefig: %s' % 'OK')
                        logger.info(
                            'debug :: alert_smtp - Memory usage after plt.savefig: %s (kb)'
                            %
                            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                except:
                    logger.info(traceback.format_exc())
                    logger.error('error :: alert_smtp - plt.savefig: %s' %
                                 'FAIL')
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: alert_smtp - could not build plot')

    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage before email: %s (kb)' %
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if redis_image_data:
        redis_img_tag = '<img src="cid:%s"/>' % redis_graph_content_id
        if settings.ENABLE_DEBUG or LOCAL_DEBUG:
            logger.info('debug :: alert_smtp - redis_img_tag: %s' %
                        str(redis_img_tag))
    else:
        # @modified 20161229 - Feature #1830: Ionosphere alerts
        # @modified 20170108 - Feature #1852: Ionosphere - features_profile matched graphite graphs
        # Restored the previous redis_img_tag method as some smtp alerts were
        # coming through without a Redis graph - not all, but some.
        # Retrospectively, I am pretty certain it was done that way during
        # testing; I just wanted to try and be cleaner.
        # The redis_img_tag was changed at
        # https://github.com/earthgecko/skyline/commit/31bcacf3f90f0953ebed0d57260cb937e01f887c#diff-520bf2a218f65074ffead4d8184c138dR489
        redis_img_tag = '<img src="%s"/>' % 'none'
        # redis_img_tag = '<img src="none"/>'

    # @added 20170806 - Feature #1830: Ionosphere alerts
    # Show a human date in alerts
    alerted_at = str(dt.datetime.utcfromtimestamp(int(metric[2])))

    try:
        body = '<h3><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> %s alert</font></h3><br>' % context
        body += '<font color="black">metric: <b>%s</b></font><br>' % metric[1]
        body += '<font color="black">Anomalous value: %s</font><br>' % str(
            metric[0])
        body += '<font color="black">Anomaly timestamp: %s</font><br>' % str(
            int(metric[2]))
        # @added 20170806 - Feature #1830: Ionosphere alerts
        # Show a human date in alerts
        body += '<font color="black">Anomalous at: %s</font><br>' % alerted_at
        body += '<font color="black">At hours: %s</font><br>' % str(
            int(full_duration_in_hours))
        body += '<font color="black">Next alert in: %s seconds</font><br>' % str(
            alert[2])
        # @added 20170603 - Feature #2034: analyse_derivatives
        if known_derivative_metric:
            body += '<font color="black">Derivative graph: True</font><br>'

        more_body = ''
        if settings.IONOSPHERE_ENABLED:
            # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls
            # Broke body into body and more_body to workaround the 990 character
            # limit per line for SMTP
            more_body += '<h3><font color="#dd3023">Ionosphere :: </font><font color="#6698FF">training data</font><font color="black"></font></h3>'
            ionosphere_link = '%s/ionosphere?timestamp=%s&metric=%s' % (
                settings.SKYLINE_URL, str(int(metric[2])), str(metric[1]))
            more_body += '<font color="black">To use this timeseries to train Skyline that this is not anomalous manage this training data at:<br>'
            more_body += '<a href="%s">%s</a></font>' % (ionosphere_link,
                                                         ionosphere_link)
        if redis_image_data:
            more_body += '<font color="black">min: %s  | max: %s   | mean: %s <br>' % (
                str(array_amin), str(array_amax), str(mean))
            more_body += '3-sigma: %s <br>' % str(sigma3)
            more_body += '3-sigma upper bound: %s   | 3-sigma lower bound: %s <br></font>' % (
                str(sigma3_upper_bound), str(sigma3_lower_bound))
            more_body += '<h3><font color="black">Redis data at FULL_DURATION</font></h3><br>'
            more_body += '<div dir="ltr">:%s<br></div>' % redis_img_tag
        if image_data:
            more_body += '<h3><font color="black">Graphite data at FULL_DURATION (may be aggregated)</font></h3>'
            more_body += '<div dir="ltr"><a href="%s">%s</a><br></div><br>' % (
                link, img_tag)
            more_body += '<font color="black">Clicking on the above graph will open to the Graphite graph with current data</font><br>'
        if redis_image_data:
            more_body += '<font color="black">To disable the Redis data graph view, set PLOT_REDIS_DATA to False in your settings.py, if the Graphite graph is sufficient for you,<br>'
            more_body += 'however do note that will remove the 3-sigma and mean value too.</font>'
        more_body += '<br>'
        more_body += '<div dir="ltr" align="right"><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> version :: %s</font></div><br>' % str(
            skyline_version)
    except:
        logger.error('error :: alert_smtp - could not build body')
        logger.info(traceback.format_exc())

    # @modified 20180524 - Task #2384: Change alerters to cc other recipients
    # Do not send to each recipient, send to primary_recipient and cc the other
    # recipients, thereby sending only one email
    # for recipient in recipients:
    if primary_recipient:
        try:
            # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls
            # Broke body into body and more_body to workaround the 990 character
            # limit per line for SMTP, using mixed as alternative indicates that
            # the client should select one of the parts for display and ignore
            # the rest (tripleee - https://stackoverflow.com/a/35115938)
            # msg = MIMEMultipart('alternative')
            msg = MIMEMultipart('mixed')

            # @added 20170812 - Bug #2142: 7bit SMTP encoding breaking long urls
            # set email charset and email encodings
            cs_ = charset.Charset('utf-8')
            cs_.header_encoding = charset.QP
            cs_.body_encoding = charset.QP
            msg.set_charset(cs_)

            msg['Subject'] = '[Skyline alert] - %s ALERT - %s' % (context,
                                                                  metric[1])
            msg['From'] = sender
            # @modified 20180524 - Task #2384: Change alerters to cc other recipients
            # msg['To'] = recipient
            msg['To'] = primary_recipient

            # @added 20180524 - Task #2384: Change alerters to cc other recipients
            # Added Cc
            if cc_recipients:
                msg['Cc'] = cc_recipients

            msg.attach(MIMEText(body, 'html'))
            # @added 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls
            # Broke body into body and more_body to workaround the 990 character
            # limit per line for SMTP
            msg.replace_header('content-transfer-encoding', 'quoted-printable')
            msg.attach(MIMEText(more_body, 'html'))

            if redis_image_data:
                try:
                    # @modified 20160814 - Bug #1558: Memory leak in Analyzer
                    # I think the buf is causing a memory leak, trying a file
                    # buf.seek(0)
                    # msg_plot_attachment = MIMEImage(buf.read())
                    # msg_plot_attachment = MIMEImage(buf.read())
                    try:
                        with open(buf, 'r') as f:
                            plot_image_data = f.read()
                        try:
                            os.remove(buf)
                        except OSError:
                            logger.error(
                                'error :: alert_smtp - failed to remove file - %s'
                                % buf)
                            logger.info(traceback.format_exc())
                            pass
                    except:
                        logger.error('error :: failed to read plot file - %s' %
                                     buf)
                        plot_image_data = None

                    # @added 20161124 - Branch #922: ionosphere
                    msg_plot_attachment = MIMEImage(plot_image_data)
                    msg_plot_attachment.add_header(
                        'Content-ID', '<%s>' % redis_graph_content_id)
                    msg.attach(msg_plot_attachment)
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info(
                            'debug :: alert_smtp - msg_plot_attachment - redis data done'
                        )
                except:
                    logger.error('error :: alert_smtp - msg_plot_attachment')
                    logger.info(traceback.format_exc())

            if image_data is not None:
                try:
                    msg_attachment = MIMEImage(image_data)
                    msg_attachment.add_header('Content-ID',
                                              '<%s>' % content_id)
                    msg.attach(msg_attachment)
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info(
                            'debug :: alert_smtp - msg_attachment - Graphite img source done'
                        )
                except:
                    logger.error('error :: alert_smtp - msg_attachment')
                    logger.info(traceback.format_exc())
        except:
            logger.error('error :: alert_smtp - could not attach')
            logger.info(traceback.format_exc())

        s = SMTP('127.0.0.1')
        try:
            # @modified 20180524 - Task #2384: Change alerters to cc other recipients
            # Send to primary_recipient and cc_recipients
            # s.sendmail(sender, recipient, msg.as_string())
            if cc_recipients:
                # cc_recipients is a comma separated string, smtplib sendmail
                # expects a list of individual addresses
                s.sendmail(sender,
                           [primary_recipient] + cc_recipients.split(','),
                           msg.as_string())
            else:
                s.sendmail(sender, primary_recipient, msg.as_string())
            if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                # logger.info('debug :: alert_smtp - message sent to %s OK' % str(recipient))
                logger.info(
                    'debug :: alert_smtp - message sent OK to primary_recipient :: %s, cc_recipients :: %s'
                    % (str(primary_recipient), str(cc_recipients)))
        except:
            logger.info(traceback.format_exc())
            # logger.error('error :: alert_smtp - could not send email to %s' % str(recipient))
            logger.error(
                'error :: alert_smtp - could not send email to primary_recipient :: %s, cc_recipients :: %s'
                % (str(primary_recipient), str(cc_recipients)))

        s.quit()

    if LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - Memory usage after email: %s (kb)' %
                    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if redis_image_data:
        # buf.seek(0)
        # buf.write('none')
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage before del redis_image_data objects: %s (kb)'
                % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        del raw_series
        del unpacker
        del timeseries[:]
        del timeseries_x[:]
        del timeseries_y[:]
        del values
        del datetimes[:]
        del msg_plot_attachment
        del redis_image_data
        # We del all variables that are floats as they become unique objects and
        # can result in what appears to be a memory leak, but is not; it is
        # just the way Python handles floats
        del mean
        del array_amin
        del array_amax
        del stdDev
        del sigma3
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage after del redis_image_data objects: %s (kb)'
                % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage before del fig object: %s (kb)'
                % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        # @added 20160814 - Bug #1558: Memory leak in Analyzer
        #                   Issue #21 Memory leak in Analyzer - https://github.com/earthgecko/skyline/issues/21
        # As per http://www.mail-archive.com/[email protected]/msg13222.html
        fig.clf()
        plt.close(fig)
        del fig
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage after del fig object: %s (kb)'
                % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage before del other objects: %s (kb)'
            % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    del recipients[:]
    del body
    del msg
    del image_data
    del msg_attachment
    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage after del other objects: %s (kb)'
            % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    return
Example #7
0
    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.

        Multi get the assigned_metrics for the process from Redis.

        For each metric:

        - unpack the `raw_timeseries` for the metric.
        - Analyse each timeseries against `ALGORITHMS` to determine if it is
          anomalous.
        - If anomalous add it to the :obj:`self.anomalous_metrics` list
        - Add what algorithms triggered to the :obj:`self.anomaly_breakdown_q`
          queue
        - If :mod:`settings.ENABLE_CRUCIBLE` is ``True``:

          - Add a crucible data file with the details about the timeseries and
            anomaly.
          - Write the timeseries to a json file for crucible.

        Add keys and values to the queue so the parent process can collate for:\n
        * :py:obj:`self.anomaly_breakdown_q`
        * :py:obj:`self.exceptions_q`
        """

        spin_start = time()
        logger.info('spin_process started')

        # Discover assigned metrics
        keys_per_processor = int(ceil(float(len(unique_metrics)) / float(settings.ANALYZER_PROCESSES)))
        if i == settings.ANALYZER_PROCESSES:
            assigned_max = len(unique_metrics)
        else:
            assigned_max = min(len(unique_metrics), i * keys_per_processor)
        # Fix analyzer worker metric assignment #94
        # https://github.com/etsy/skyline/pull/94 @languitar:worker-fix
        assigned_min = (i - 1) * keys_per_processor
        assigned_keys = range(assigned_min, assigned_max)
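        # Illustrative example (hypothetical numbers): with 10 unique_metrics
        # and settings.ANALYZER_PROCESSES = 3, keys_per_processor is
        # ceil(10 / 3) = 4, so process i=1 is assigned indices 0-3, i=2 gets
        # 4-7 and the last process (i=3) gets the remaining indices 8-9.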

        # Compile assigned metrics
        assigned_metrics = [unique_metrics[index] for index in assigned_keys]

        # Check if this process is unnecessary
        if len(assigned_metrics) == 0:
            return

        # Multi get series
        raw_assigned = self.redis_conn.mget(assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Distill timeseries strings into lists
        for i, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[i]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name)

                # If it's anomalous, add it to list
                if anomalous:
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    metric = [datapoint, base_name]
                    self.anomalous_metrics.append(metric)

                    # Get the anomaly breakdown - who returned True?
                    triggered_algorithms = []
                    for index, value in enumerate(ensemble):
                        if value:
                            algorithm = settings.ALGORITHMS[index]
                            anomaly_breakdown[algorithm] += 1
                            triggered_algorithms.append(algorithm)

                    # If Crucible or Panorama are enabled determine details
                    determine_anomaly_details = False
                    if settings.ENABLE_CRUCIBLE and settings.ANALYZER_CRUCIBLE_ENABLED:
                        determine_anomaly_details = True
                    if settings.PANORAMA_ENABLED:
                        determine_anomaly_details = True

                    if determine_anomaly_details:
                        metric_timestamp = str(int(timeseries[-1][0]))
                        from_timestamp = str(int(timeseries[1][0]))
                        timeseries_dir = base_name.replace('.', '/')

                    # If Panorama is enabled - create a Panorama check
                    if settings.PANORAMA_ENABLED:
                        if not os.path.exists(settings.PANORAMA_CHECK_PATH):
                            if python_version == 2:
                                # use base 8 so the string is parsed as octal
                                mode_arg = int('0755', 8)
                            if python_version == 3:
                                mode_arg = 0o755
                            os.makedirs(settings.PANORAMA_CHECK_PATH, mode_arg)

                        # Note:
                        # The values are enclosed in single quotes intentionally
                        # as the imp.load_source used results in a shift in the
                        # decimal position when double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired,
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                        added_at = str(int(time()))
                        source = 'graphite'
                        panaroma_anomaly_data = 'metric = \'%s\'\n' \
                                                'value = \'%s\'\n' \
                                                'from_timestamp = \'%s\'\n' \
                                                'metric_timestamp = \'%s\'\n' \
                                                'algorithms = %s\n' \
                                                'triggered_algorithms = %s\n' \
                                                'app = \'%s\'\n' \
                                                'source = \'%s\'\n' \
                                                'added_by = \'%s\'\n' \
                                                'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(settings.ALGORITHMS),
                               triggered_algorithms, skyline_app, source,
                               this_host, added_at)
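                        # Illustrative resulting check file content
                        # (hypothetical metric name and values):
                        #   metric = 'stats.web01.cpu.user'
                        #   value = '5622.0'
                        #   from_timestamp = '1457010600'
                        #   metric_timestamp = '1457097000'
                        #   ...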

                        # Create an anomaly file with details about the anomaly
                        panaroma_anomaly_file = '%s/%s.%s.txt' % (
                            settings.PANORAMA_CHECK_PATH, added_at,
                            base_name)
                        try:
                            write_data_to_file(
                                skyline_app, panaroma_anomaly_file, 'w',
                                panaroma_anomaly_data)
                            logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file))
                        except:
                            logger.error('error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file))
                            logger.info(traceback.format_exc())

                    # If Crucible is enabled - save timeseries and create a
                    # Crucible check
                    if settings.ENABLE_CRUCIBLE and settings.ANALYZER_CRUCIBLE_ENABLED:
                        crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp
                        if not os.path.exists(crucible_anomaly_dir):
                            if python_version == 2:
                                # use base 8 so the string is parsed as octal
                                mode_arg = int('0755', 8)
                            if python_version == 3:
                                mode_arg = 0o755
                            os.makedirs(crucible_anomaly_dir, mode_arg)

                        # Note:
                        # The values are enclosed in single quotes intentionally
                        # as the imp.load_source used in crucible results in a
                        # shift in the decimal position when double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired,
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0

                        crucible_anomaly_data = 'metric = \'%s\'\n' \
                                                'value = \'%s\'\n' \
                                                'from_timestamp = \'%s\'\n' \
                                                'metric_timestamp = \'%s\'\n' \
                                                'algorithms = %s\n' \
                                                'triggered_algorithms = %s\n' \
                                                'anomaly_dir = \'%s\'\n' \
                                                'graphite_metric = True\n' \
                                                'run_crucible_tests = False\n' \
                                                'added_by = \'%s\'\n' \
                                                'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(settings.ALGORITHMS),
                               triggered_algorithms, crucible_anomaly_dir,
                               skyline_app, metric_timestamp)

                        # Create an anomaly file with details about the anomaly
                        crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, base_name)
                        try:
                            write_data_to_file(
                                skyline_app, crucible_anomaly_file, 'w',
                                crucible_anomaly_data)
                            logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file))
                        except:
                            logger.error('error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file))
                            logger.info(traceback.format_exc())

                        # Create timeseries json file with the timeseries
                        json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name)
                        timeseries_json = str(timeseries).replace('[', '(').replace(']', ')')
                        try:
                            write_data_to_file(skyline_app, json_file, 'w', timeseries_json)
                            logger.info('added crucible timeseries file :: %s' % (json_file))
                        except:
                            logger.error('error :: failed to add crucible timeseries file :: %s' % (json_file))
                            logger.info(traceback.format_exc())

                        # Create a crucible check file
                        crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name)
                        try:
                            write_data_to_file(
                                skyline_app, crucible_check_file, 'w',
                                crucible_anomaly_data)
                            logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp))
                        except:
                            logger.error('error :: failed to add crucible check file :: %s' % (crucible_check_file))
                            logger.info(traceback.format_exc())

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))

        spin_end = time() - spin_start
        logger.info('spin_process took %.2f seconds' % spin_end)
Example #8
0
def alert_slack(datapoint, metric_name, expiration_time, metric_trigger,
                algorithm):

    if not settings.SLACK_ENABLED:
        return False

    from slackclient import SlackClient
    metric = metric_name
    logger.info('alert_slack - anomalous metric :: metric: %s - %s' %
                (metric, algorithm))
    base_name = metric
    alert_algo = str(algorithm)
    alert_context = alert_algo.upper()

    # The known_derivative_metric state is determined in case we need to surface
    # the png image from Graphite if the Ionosphere image is not available for
    # some reason.  This will result in Skyline at least still sending an alert
    # to slack, even if some gear fails in Ionosphere or slack alerting is used
    # without Ionosphere enabled.  Not DRY, but it is multiprocessing and spawn
    # safe.
    known_derivative_metric = False

    #    try:
    #        if settings.REDIS_PASSWORD:
    #            # @modified 20191022 - Bug #3266: py3 Redis binary objects not strings
    #            #                      Branch #3262: py3
    #            # REDIS_ALERTER_CONN = redis.StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
    #            REDIS_ALERTER_CONN = redis.StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True)
    #        else:
    #            # REDIS_ALERTER_CONN = redis.StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
    #            REDIS_ALERTER_CONN = redis.StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True)
    #    except:
    #        logger.error('error :: alert_slack - redis connection failed')
    #    try:
    #        derivative_metrics = list(REDIS_ALERTER_CONN.smembers('derivative_metrics'))
    #    except:
    #        derivative_metrics = []
    redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(base_name))
    #    if redis_metric_name in derivative_metrics:
    #        known_derivative_metric = True
    known_derivative_metric = is_derivative_metric(skyline_app, str(base_name))

    # if known_derivative_metric:
    #     try:
    #         non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS
    #     except:
    #         non_derivative_monotonic_metrics = []
    #     skip_derivative = in_list(redis_metric_name, non_derivative_monotonic_metrics)
    #     if skip_derivative:
    #         known_derivative_metric = False

    # @added 20191008 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings
    try:
        main_alert_title = settings.CUSTOM_ALERT_OPTS['main_alert_title']
    except:
        main_alert_title = 'Skyline'
    try:
        app_alert_context = settings.CUSTOM_ALERT_OPTS[
            'boundary_alert_heading']
    except:
        app_alert_context = 'Boundary'

    if known_derivative_metric:
        # @modified 20191008 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings
        # unencoded_graph_title = 'Skyline Boundary - ALERT %s at %s hours - derivative graph - %s' % (
        #     alert_context, str(graphite_previous_hours), metric)
        # slack_title = '*Skyline Boundary - ALERT* %s on %s at %s hours - derivative graph - %s' % (
        #     alert_context, metric, str(graphite_previous_hours), datapoint)
        unencoded_graph_title = '%s %s - ALERT %s at %s hours - derivative graph - %s' % (
            main_alert_title, app_alert_context, alert_context,
            str(graphite_previous_hours), metric)
        slack_title = '*%s %s - ALERT* %s on %s at %s hours - derivative graph - %s' % (
            main_alert_title, app_alert_context, alert_context, metric,
            str(graphite_previous_hours), datapoint)
    else:
        # unencoded_graph_title = 'Skyline Boundary - ALERT %s at %s hours - %s' % (
        #     alert_context, str(graphite_previous_hours), metric)
        # slack_title = '*Skyline Boundary - ALERT* %s on %s at %s hours - %s' % (
        #     alert_context, metric, str(graphite_previous_hours), datapoint)
        unencoded_graph_title = '%s %s - ALERT %s at %s hours - %s' % (
            main_alert_title, app_alert_context, alert_context,
            str(graphite_previous_hours), metric)
        slack_title = '*%s %s - ALERT* %s on %s at %s hours - %s' % (
            main_alert_title, app_alert_context, alert_context, metric,
            str(graphite_previous_hours), datapoint)

    graph_title_string = quote(unencoded_graph_title, safe='')
    graph_title = '&title=%s' % graph_title_string

    until_timestamp = int(time())
    target_seconds = int((graphite_previous_hours * 60) * 60)
    from_timestamp = str(until_timestamp - target_seconds)
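    # e.g. with graphite_previous_hours = 12 (illustrative) target_seconds is
    # 43200, so from_timestamp is 12 hours before until_timestamp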

    graphite_from = dt.datetime.fromtimestamp(
        int(from_timestamp)).strftime('%H:%M_%Y%m%d')
    logger.info('graphite_from - %s' % str(graphite_from))
    graphite_until = dt.datetime.fromtimestamp(
        int(until_timestamp)).strftime('%H:%M_%Y%m%d')
    logger.info('graphite_until - %s' % str(graphite_until))
    # @added 20181025 - Feature #2618: alert_slack
    # Added date and time info so you do not have to mouseover the slack
    # message to determine the time at which the alert came in
    timezone = strftime("%Z", gmtime())
    # @modified 20181029 - Feature #2618: alert_slack
    # Use the standard UNIX data format
    # human_anomaly_time = dt.datetime.fromtimestamp(int(until_timestamp)).strftime('%Y-%m-%d %H:%M:%S')
    human_anomaly_time = dt.datetime.fromtimestamp(
        int(until_timestamp)).strftime('%c')
    slack_time_string = '%s %s' % (human_anomaly_time, timezone)
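    # e.g. slack_time_string might read 'Mon Nov  4 14:03:12 2019 UTC'
    # (the '%c' format and the timezone abbreviation are locale and platform
    # dependent)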

    # @added 20191106 - Branch #3262: py3
    #                   Branch #3002: docker
    graphite_port = get_graphite_port(skyline_app)
    graphite_render_uri = get_graphite_render_uri(skyline_app)
    graphite_custom_headers = get_graphite_custom_headers(skyline_app)

    if settings.GRAPHITE_PORT != '':
        if known_derivative_metric:
            # @modified 20190520 - Branch #3002: docker
            # Use GRAPHITE_RENDER_URI
            # link = '%s://%s:%s/render/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
            #     settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
            #     settings.GRAPHITE_PORT, str(graphite_from), str(graphite_until),
            #     metric, settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
            # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle
            # link = '%s://%s:%s/%s/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
            link = '%s://%s:%s/%s/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s),%%27si%%27)%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                settings.GRAPHITE_PORT, settings.GRAPHITE_RENDER_URI,
                str(graphite_from), str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
        else:
            # @modified 20190520 - Branch #3002: docker
            # Use GRAPHITE_RENDER_URI
            # link = '%s://%s:%s/render/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % (
            #     settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
            #     settings.GRAPHITE_PORT, str(graphite_from), str(graphite_until),
            #     metric, settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
            # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle
            # link = '%s://%s:%s/%s/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % (
            link = '%s://%s:%s/%s/?from=%s&until=%s&target=cactiStyle(%s,%%27si%%27)%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                settings.GRAPHITE_PORT, settings.GRAPHITE_RENDER_URI,
                str(graphite_from), str(graphite_until), metric,
                settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
    else:
        if known_derivative_metric:
            # @modified 20190520 - Branch #3002: docker
            # Use GRAPHITE_RENDER_URI
            # link = '%s://%s/render/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
            #     settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
            #     str(graphite_from), str(graphite_until), metric,
            #     settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
            # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle
            # link = '%s://%s/%s/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
            link = '%s://%s/%s/?from=%s&until=%s&target=cactiStyle(nonNegativeDerivative(%s),%%27si%%27)%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                settings.GRAPHITE_RENDER_URI, str(graphite_from),
                str(graphite_until), metric, settings.GRAPHITE_GRAPH_SETTINGS,
                graph_title)
        else:
            # @modified 20190520 - Branch #3002: docker
            # Use GRAPHITE_RENDER_URI
            # link = '%s://%s/render/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % (
            #     settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
            #     str(graphite_from), str(graphite_until), metric,
            #     settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
            # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle
            # link = '%s://%s/%s/?from=%s&until=%s&target=cactiStyle(%s)%s%s&colorList=orange' % (
            link = '%s://%s/%s/?from=%s&until=%s&target=cactiStyle(%s,%%27si%%27)%s%s&colorList=orange' % (
                settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
                settings.GRAPHITE_RENDER_URI, str(graphite_from),
                str(graphite_until), metric, settings.GRAPHITE_GRAPH_SETTINGS,
                graph_title)
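    # Whichever branch ran, link is a Graphite render URL along the lines of
    # (host, settings and metric name illustrative):
    # https://graphite.example.com/render/?from=02:00_20191104&until=14:00_20191104&target=cactiStyle(stats.web.requests,%27si%27)<GRAPHITE_GRAPH_SETTINGS>&title=...&colorList=orange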

    # slack does not allow embedded images, nor will it fetch links behind
    # authentication, so Skyline uploads a png Graphite image with the message
    image_file = None

    # Fetch the png from Graphite
    # @modified 20191021 - Task #3290: Handle urllib2 in py3
    #                      Branch #3262: py3
    image_file = '%s/%s.%s.graphite.%sh.png' % (
        settings.SKYLINE_TMP_DIR, base_name, skyline_app,
        str(int(graphite_previous_hours)))

    if python_version == 22:
        try:
            # image_data = urllib2.urlopen(link).read()  # nosec
            image_data = None
        # except urllib2.URLError:
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: alert_slack - failed to get image graph')
            logger.error('error :: alert_slack - %s' % str(link))
            image_data = None
    if python_version == 33:
        try:
            image_file = '%s/%s.%s.graphite.%sh.png' % (
                settings.SKYLINE_TMP_DIR, base_name, skyline_app,
                str(int(graphite_previous_hours)))
            #            urllib.request.urlretrieve(link, image_file)
            image_data = 'retrieved'
            image_data = None
        except:
            try:
                # @added 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle
                image_data = None
                original_traceback = traceback.format_exc()
                if 'cactiStyle' in link:
                    metric_replace = '%s,%%27si%%27' % metric
                    original_link = link
                    link = link.replace(metric, metric_replace)
                    logger.info(
                        'link replaced with cactiStyle system parameter added - %s'
                        % str(link))
                    urllib.request.urlretrieve(link, image_file)
                    image_data = 'retrieved'
            except:
                new_traceback = traceback.format_exc()
                logger.error(original_traceback)
                logger.error(
                    'error :: boundary_alerters :: alert_slack :: failed to urlopen %s'
                    % str(original_link))
                logger.error(new_traceback)
                logger.error(
                    'error :: boundary_alerters :: alert_slack :: failed to urlopen with system parameter added %s'
                    % str(link))
                image_data = None

    # @added 20191025 -
    image_data = get_graphite_graph_image(skyline_app, link, image_file)

    if image_data == 'disabled_for_testing':
        image_file = '%s/%s.%s.graphite.%sh.png' % (
            settings.SKYLINE_TMP_DIR, base_name, skyline_app,
            str(int(graphite_previous_hours)))
        if image_data != 'retrieved':
            try:
                write_data_to_file(skyline_app, image_file, 'w', image_data)
                logger.info('alert_slack - added Graphite image :: %s' %
                            (image_file))
            except:
                logger.info(traceback.format_exc())
                logger.error(
                    'error :: alert_slack - failed to add %s Graphite image' %
                    (image_file))
                image_file = None
    try:
        filename = os.path.basename(image_file)
    except:
        filename = None

    try:
        bot_user_oauth_access_token = settings.BOUNDARY_SLACK_OPTS[
            'bot_user_oauth_access_token']
    except:
        logger.error(
            'error :: alert_slack - could not determine bot_user_oauth_access_token'
        )
        return False

    # Allow for absolute path metric namespaces but also allow for and match
    # wildcard namespaces if there is not an absolute path metric namespace
    channels = 'unknown'
    notify_channels = []
    matched_channels = []
    try:
        channels = settings.BOUNDARY_SLACK_OPTS['channels'][metric_name]
        notify_channels.append(channels)
    except:
        for channel in settings.BOUNDARY_SLACK_OPTS['channels']:
            CHECK_MATCH_PATTERN = channel
            check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
            pattern_match = check_match_pattern.match(metric_name)
            if pattern_match:
                matched_channels.append(channel)

    if matched_channels != []:
        for i_metric_name in matched_channels:
            channels = settings.BOUNDARY_SLACK_OPTS['channels'][i_metric_name]
            notify_channels.append(channels)

    if not notify_channels:
        logger.error('error :: alert_slack - could not determine channel')
        return False
    else:
        channels = notify_channels

    try:
        icon_emoji = settings.BOUNDARY_SLACK_OPTS['icon_emoji']
    except:
        icon_emoji = ':chart_with_upwards_trend:'

    try:
        sc = SlackClient(bot_user_oauth_access_token)
    except:
        logger.info(traceback.format_exc())
        logger.error('error :: alert_slack - could not initiate SlackClient')
        return False

    for channel in channels:
        initial_comment = slack_title + ' :: <' + link + '|graphite image link>\nFor anomaly at ' + slack_time_string
        try:
            # slack does not allow embedded images, nor links behind authentication
            # or color text, so we have to jump through all the API hoops to end up
            # having to upload an image with a very basic message.
            if os.path.isfile(image_file):
                slack_file_upload = sc.api_call(
                    'files.upload',
                    filename=filename,
                    channels=channel,
                    initial_comment=initial_comment,
                    file=open(image_file, 'rb'))
                if not slack_file_upload['ok']:
                    logger.error(
                        'error :: alert_slack - failed to send slack message with file upload'
                    )
                    logger.error(
                        'error :: alert_slack - slack_file_upload - %s' %
                        str(slack_file_upload))
                try:
                    os.remove(image_file)
                except OSError:
                    logger.error('error - failed to remove %s, continuing' %
                                 image_file)
                    pass
            else:
                send_text = initial_comment + '  ::  error :: there was no graph image to upload'
                send_message = sc.api_call('chat.postMessage',
                                           channel=channel,
                                           icon_emoji=icon_emoji,
                                           text=send_text)
                if not send_message['ok']:
                    logger.error(
                        'error :: alert_slack - failed to send slack message')
                else:
                    logger.info('alert_slack - sent slack message')
        except:
            logger.info(traceback.format_exc())
            logger.error('error :: alert_slack - could not upload file')
            return False
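A hedged sketch of the kind of settings.BOUNDARY_SLACK_OPTS dictionary the lookups above expect; the keys read by alert_slack (bot_user_oauth_access_token, channels, icon_emoji) are taken from the code, while the token, metric namespaces and channel names are purely illustrative.

# Illustrative values only.  Channel keys may be exact metric names or regex
# patterns - alert_slack first tries an exact key match and then falls back to
# matching each key as a pattern against the metric name.
BOUNDARY_SLACK_OPTS = {
    'bot_user_oauth_access_token': 'xoxb-0000000000-000000000000-xxxxxxxx',
    'channels': {
        'stats.web.requests': ('#alerts',),
        'carbon\\..*': ('#skyline', '#ops'),
    },
    'icon_emoji': ':chart_with_upwards_trend:',
}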
Example #9
0
def create_features_profile(current_skyline_app, requested_timestamp, data_for_metric, context, ionosphere_job, fp_parent_id, fp_generation, fp_learn):
    """
    Add a features_profile to the Skyline ionosphere database table.

    :param current_skyline_app: Skyline app name
    :param requested_timestamp: The timestamp of the dir that the features
        profile data is in
    :param data_for_metric: The base_name of the metric
    :param context: The context of the caller
    :param ionosphere_job: The ionosphere_job name related to creation request
        valid jobs are ``learn_fp_human``, ``learn_fp_generation``,
        ``learn_fp_learnt`` and ``learn_fp_automatic``.
    :param fp_parent_id: The id of the parent features profile that this was
        learnt from, 0 being an original human generated features profile
    :param fp_generation: The number of generations away for the original
        human generated features profile, 0 being an original human generated
        features profile.
    :param fp_learn: Whether Ionosphere should learn at use_full_duration_days
    :type current_skyline_app: str
    :type requested_timestamp: int
    :type data_for_metric: str
    :type context: str
    :type ionosphere_job: str
    :type fp_parent_id: int
    :type fp_generation: int
    :type fp_learn: boolean
    :return: fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc
    :rtype: str, boolean, boolean, str, str

    """

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    base_name = data_for_metric.replace(settings.FULL_NAMESPACE, '', 1)

    if context == 'training_data':
        log_context = 'training data'
        ionosphere_learn_job = 'learn_fp_human'
    if context == 'features_profiles':
        log_context = 'features profile data'

    # @added 20170113 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        log_context = 'learn'

    current_logger.info('create_features_profile :: %s :: requested for %s at %s' % (
        context, str(base_name), str(requested_timestamp)))

    metric_timeseries_dir = base_name.replace('.', '/')
    if context == 'training_data':
        metric_training_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_DATA_FOLDER, str(requested_timestamp),
            metric_timeseries_dir)
    if context == 'features_profiles':
        metric_training_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir,
            str(requested_timestamp))

    # @added 20170113 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        # @modified 20170116 - Feature #1854: Ionosphere learn
        # Allowing ionosphere_learn to create a features profile for a training
        # data set that it has learnt is not anomalous
        if ionosphere_job != 'learn_fp_automatic':
            metric_training_data_dir = '%s/%s/%s' % (
                settings.IONOSPHERE_LEARN_FOLDER, str(requested_timestamp),
                metric_timeseries_dir)
        else:
            metric_training_data_dir = '%s/%s/%s' % (
                settings.IONOSPHERE_DATA_FOLDER, str(requested_timestamp),
                metric_timeseries_dir)

    features_file = '%s/%s.tsfresh.input.csv.features.transposed.csv' % (
        metric_training_data_dir, base_name)

    features_profile_dir = '%s/%s' % (
        settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir)

    ts_features_profile_dir = '%s/%s/%s' % (
        settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir,
        str(requested_timestamp))

    features_profile_created_file = '%s/%s.%s.fp.created.txt' % (
        metric_training_data_dir, str(requested_timestamp), base_name)

    features_profile_details_file = '%s/%s.%s.fp.details.txt' % (
        metric_training_data_dir, str(requested_timestamp), base_name)

    anomaly_check_file = '%s/%s.txt' % (metric_training_data_dir, base_name)

    trace = 'none'
    fail_msg = 'none'
    new_fp_id = False
    calculated_with_tsfresh = False
    calculated_time = False
    fcount = None
    fsum = None
    # @added 20170104 - Feature #1842: Ionosphere - Graphite now graphs
    # Added the ts_full_duration parameter so that the appropriate graphs can be
    # embedded for the user in the training data page
    ts_full_duration = '0'

    if context == 'ionosphere_learn':
        if not path.isfile(features_profile_details_file):
            current_logger.error('error :: create_features_profile :: no features_profile_details_file - %s' % features_profile_details_file)
            return 'none', False, False, fail_msg, trace

    if path.isfile(features_profile_details_file):
        current_logger.info('create_features_profile :: getting features profile details from - %s' % features_profile_details_file)
        # Read the details file
        with open(features_profile_details_file, 'r') as f:
            fp_details_str = f.read()
        fp_details = literal_eval(fp_details_str)
        calculated_with_tsfresh = fp_details[1]
        calculated_time = str(fp_details[2])
        fcount = str(fp_details[3])
        fsum = str(fp_details[4])
        try:
            ts_full_duration = str(fp_details[5])
        except:
            current_logger.error('error :: create_features_profile :: could not determine the full duration from - %s' % features_profile_details_file)
            ts_full_duration = '0'

        if context != 'ionosphere_learn':
            if ts_full_duration == '0':
                if path.isfile(anomaly_check_file):
                    current_logger.info('create_features_profile :: determining the full duration from anomaly_check_file - %s' % anomaly_check_file)
                    # Read the details file
                    with open(anomaly_check_file, 'r') as f:
                        anomaly_details = f.readlines()
                        for i, line in enumerate(anomaly_details):
                            if 'full_duration' in line:
                                _ts_full_duration = '%s' % str(line).split("'", 2)
                                full_duration_array = literal_eval(_ts_full_duration)
                                ts_full_duration = str(int(full_duration_array[1]))
                                current_logger.info('create_features_profile :: determined the full duration as - %s' % str(ts_full_duration))

    if path.isfile(features_profile_created_file):
        # Read the created file
        with open(features_profile_created_file, 'r') as f:
            fp_created_str = f.read()
        fp_created = literal_eval(fp_created_str)
        new_fp_id = fp_created[0]

        return str(new_fp_id), True, True, fail_msg, trace

    # Have data
    if path.isfile(features_file):
        current_logger.info('create_features_profile :: features_file exists: %s' % features_file)
    else:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error :: create_features_profile :: features_file does not exist: %s' % features_file
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            return False, False, False, fail_msg, trace

    features_data = []
    with open(features_file, 'r') as fr:
        reader = csv.reader(fr, delimiter=',')
        for i, line in enumerate(reader):
            feature_name_item = False
            fname_id = False
            f_value = False
            feature_name = str(line[0])
            feature_name_item = list(filter(
                lambda x: x[1] == feature_name, TSFRESH_FEATURES))
            if feature_name_item:
                feature_name_id = feature_name_item[0]
            if feature_name_item:
                feature_name_list = feature_name_item[0]
                fname_id = int(feature_name_list[0])
            f_value = str(line[1])
            if fname_id and f_value:
                features_data.append([fname_id, f_value])

    # @added 20170113 - Feature #1854: Ionosphere learn - generations
    # Set the learn generations variables with the IONOSPHERE_LEARN_DEFAULT_ and any
    # settings.IONOSPHERE_LEARN_NAMESPACE_CONFIG values.  These will later be
    # overridden by any database values determined for the specific metric if
    # they exist.
    # Set defaults
    use_full_duration_days = int(settings.IONOSPHERE_LEARN_DEFAULT_FULL_DURATION_DAYS)
    valid_learning_duration = int(settings.IONOSPHERE_LEARN_DEFAULT_VALID_TIMESERIES_OLDER_THAN_SECONDS)
    max_generations = int(settings.IONOSPHERE_LEARN_DEFAULT_MAX_GENERATIONS)
    max_percent_diff_from_origin = float(settings.IONOSPHERE_LEARN_DEFAULT_MAX_PERCENT_DIFF_FROM_ORIGIN)
    # Default learn_full_duration_days so it is defined even if the
    # get_ionosphere_learn_details call below fails
    learn_full_duration_days = use_full_duration_days
    try:
        use_full_duration, valid_learning_duration, use_full_duration_days, max_generations, max_percent_diff_from_origin = get_ionosphere_learn_details(current_skyline_app, base_name)
        learn_full_duration_days = use_full_duration_days
    except:
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: create_features_profile :: failed to get_ionosphere_learn_details')

    current_logger.info('create_features_profile :: learn_full_duration_days     :: %s days' % (str(learn_full_duration_days)))
    current_logger.info('create_features_profile :: valid_learning_duration      :: %s seconds' % (str(valid_learning_duration)))
    current_logger.info('create_features_profile :: max_generations              :: %s' % (str(max_generations)))
    current_logger.info('create_features_profile :: max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin)))

    current_logger.info('create_features_profile :: getting MySQL engine')
    try:
        engine, fail_msg, trace = fp_create_get_an_engine(current_skyline_app)
        current_logger.info(fail_msg)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error :: create_features_profile :: could not get a MySQL engine'
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            return False, False, False, fail_msg, trace

    if not engine:
        trace = 'none'
        fail_msg = 'error :: create_features_profile :: engine not obtained'
        current_logger.error(fail_msg)
        if context == 'training' or context == 'features_profile':
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            return False, False, False, fail_msg, trace

    # Get metric details from the database
    metrics_id = False

    # Use the learn details as per config
    metric_learn_full_duration_days = int(use_full_duration_days)
    metric_learn_valid_ts_older_than = int(valid_learning_duration)
    metric_max_generations = int(max_generations)
    metric_max_percent_diff_from_origin = float(max_percent_diff_from_origin)

    metrics_table = None
    try:
        metrics_table, fail_msg, trace = metrics_table_meta(current_skyline_app, engine)
        current_logger.info(fail_msg)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to get metrics_table meta for %s' % base_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: disposing of any engine')
            fp_create_engine_disposal(current_skyline_app, engine)
            return False, False, False, fail_msg, trace

    current_logger.info('create_features_profile :: metrics_table OK')

    metric_db_object = None
    try:
        connection = engine.connect()
        # @modified 20161209 -  - Branch #922: ionosphere
        #                        Task #1658: Patterning Skyline Ionosphere
        # result = connection.execute('select id from metrics where metric=\'%s\'' % base_name)
#        for row in result:
#            while not metrics_id:
#                metrics_id = row['id']
        stmt = select([metrics_table]).where(metrics_table.c.metric == base_name)
        result = connection.execute(stmt)
        for row in result:
            metrics_id = row['id']
            # @added 20170113 - Feature #1854: Ionosphere learn - generations
            # Added Ionosphere LEARN generation related variables
            try:
                metric_learn_full_duration_days = int(row['learn_full_duration_days'])
                metric_learn_valid_ts_older_than = int(row['learn_valid_ts_older_than'])
                metric_max_generations = int(row['max_generations'])
                metric_max_percent_diff_from_origin = float(row['max_percent_diff_from_origin'])
            except:
                current_logger.error('error :: create_features_profile :: failed to determine learn related values from DB for %s' % base_name)
        row = result.fetchone()
        # metric_db_object = row
        connection.close()
        current_logger.info('create_features_profile :: determined db metric id: %s' % str(metrics_id))
        current_logger.info('create_features_profile :: determined db metric learn_full_duration_days: %s' % str(metric_learn_full_duration_days))
        current_logger.info('create_features_profile :: determined db metric learn_valid_ts_older_than: %s' % str(metric_learn_valid_ts_older_than))
        current_logger.info('create_features_profile :: determined db metric max_generations: %s' % str(metric_max_generations))
        current_logger.info('create_features_profile :: determined db metric max_percent_diff_from_origin: %s' % str(metric_max_percent_diff_from_origin))
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error :: create_features_profile :: could not determine id of metric from DB: %s' % base_name
        current_logger.error('%s' % fail_msg)

    if metric_learn_full_duration_days:
        learn_full_duration_days = metric_learn_full_duration_days
        # learn_full_duration = int(learn_full_duration_days) * 86400
    if metric_learn_valid_ts_older_than:
        learn_valid_ts_older_than = metric_learn_valid_ts_older_than
    if metric_max_generations:
        max_generations = metric_max_generations
    if metric_max_percent_diff_from_origin:
        max_percent_diff_from_origin = metric_max_percent_diff_from_origin
    current_logger.info('create_features_profile :: generation info - learn_full_duration_days     :: %s' % (str(learn_full_duration_days)))
    current_logger.info('create_features_profile :: generation info - learn_valid_ts_older_than    :: %s' % (str(learn_valid_ts_older_than)))
    current_logger.info('create_features_profile :: generation info - max_generations              :: %s' % (str(max_generations)))
    current_logger.info('create_features_profile :: generation info - max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin)))

    # @added 20170120 - Feature #1854: Ionosphere learn
    # Always use the timestamp from the anomaly file
    use_anomaly_timestamp = int(requested_timestamp)
    if context == 'ionosphere_learn':
        if path.isfile(anomaly_check_file):
            current_logger.info('create_features_profile :: determining the full duration from anomaly_check_file - %s' % anomaly_check_file)
            # Read the details file
            with open(anomaly_check_file, 'r') as f:
                anomaly_details = f.readlines()
                for i, line in enumerate(anomaly_details):
                    if 'metric_timestamp' in line:
                        _metric_timestamp = '%s' % str(line).split("'", 2)
                        metric_timestamp_array = literal_eval(_metric_timestamp)
                        use_anomaly_timestamp = (int(metric_timestamp_array[1]))
                        current_logger.info('create_features_profile :: determined the anomaly metric_timestamp as - %s' % str(use_anomaly_timestamp))

    ionosphere_table = None
    try:
        ionosphere_table, fail_msg, trace = ionosphere_table_meta(current_skyline_app, engine)
        current_logger.info(fail_msg)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to get ionosphere_table meta for %s' % base_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # Raise to webapp I believe to provide traceback to user in UI
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            raise
        else:
            current_logger.info('create_features_profile :: disposing of any engine')
            fp_create_engine_disposal(current_skyline_app, engine)
            return False, False, False, fail_msg, trace

    current_logger.info('create_features_profile :: ionosphere_table OK')

    # @added 20170403 - Feature #2000: Ionosphere - validated
    # Set all learn_fp_human features profiles to validated.
    fp_validated = 0
    if ionosphere_job == 'learn_fp_human':
        fp_validated = 1

    # @added 20170424 - Feature #2000: Ionosphere - validated
    # Set all generation 0 and 1 as validated
    if int(fp_generation) <= 1:
        fp_validated = 1

    new_fp_id = False
    try:
        connection = engine.connect()
        # @added 20170113 - Feature #1854: Ionosphere learn
        # Added learn values parent_id, generation
        # @modified 20170120 - Feature #1854: Ionosphere learn
        # Added anomaly_timestamp
        # @modified 20170403 - Feature #2000: Ionosphere - validated
        ins = ionosphere_table.insert().values(
            metric_id=int(metrics_id), full_duration=int(ts_full_duration),
            anomaly_timestamp=int(use_anomaly_timestamp),
            enabled=1, tsfresh_version=str(tsfresh_version),
            calc_time=calculated_time, features_count=fcount,
            features_sum=fsum, parent_id=fp_parent_id,
            generation=fp_generation, validated=fp_validated)
        result = connection.execute(ins)
        connection.close()
        new_fp_id = result.inserted_primary_key[0]
        current_logger.info('create_features_profile :: new ionosphere fp_id: %s' % str(new_fp_id))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to insert a new record into the ionosphere table for %s' % base_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: disposing of any engine')
            fp_create_engine_disposal(current_skyline_app, engine)
            return False, False, False, fail_msg, trace

    if not RepresentsInt(new_fp_id):
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: unknown new ionosphere new_fp_id for %s' % base_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: disposing of any engine')
            fp_create_engine_disposal(current_skyline_app, engine)
            return False, False, False, fail_msg, trace

    # Create z_fp_<metric_id> table
    fp_table_created = False
    fp_table_name = 'z_fp_%s' % str(metrics_id)
    try:
        fp_meta = MetaData()
        # @modified 20161222 - Task #1812: z_fp table type
        # Changed to InnoDB from MyISAM as no files open issues and MyISAM clean
        # up, there can be LOTS of file_per_table z_fp_ tables/files without
        # the MyISAM issues.  z_fp_ tables are mostly read and will be shuffled
        # in the table cache as required.
        fp_metric_table = Table(
            fp_table_name, fp_meta,
            Column('id', Integer, primary_key=True),
            Column('fp_id', Integer, nullable=False, key='fp_id'),
            Column('feature_id', Integer, nullable=False),
            Column('value', DOUBLE(), nullable=True),
            mysql_charset='utf8',
            mysql_key_block_size='255',
            mysql_engine='InnoDB')
        fp_metric_table.create(engine, checkfirst=True)
        fp_table_created = True
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to create table - %s' % fp_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    if not fp_table_created:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to determine True for create table - %s' % fp_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    # Insert features and values
    insert_statement = []
    for fname_id, f_value in features_data:
        insert_statement.append({'fp_id': new_fp_id, 'feature_id': fname_id, 'value': f_value},)
    if insert_statement == []:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: empty insert_statement for %s inserts' % fp_table_name
        current_logger.error('%s' % fail_msg)
        # raise
    # else:
        # feature_count = sum(1 for x in a if isinstance(x, insert_statement))
        # current_logger.info(
        #     'fp_id - %s - %s feature values in insert_statement for %s ' %
        #     (str(feature_count), str(new_fp_id), fp_table_name))
        # feature_count = sum(1 for x in a if isinstance(x, insert_statement))
        # current_logger.info(
        #     'fp_id - %s - feature values in insert_statement for %s ' %
        #     (str(new_fp_id), fp_table_name))

    try:
        connection = engine.connect()
        connection.execute(fp_metric_table.insert(), insert_statement)
        connection.close()
        current_logger.info('create_features_profile :: fp_id - %s - feature values inserted into %s' % (str(new_fp_id), fp_table_name))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to insert feature values into %s' % fp_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    # Create metric ts table if not exists ts_<metric_id>
    # Create z_ts_<metric_id> table
    # @modified 20170121 - Feature #1854: Ionosphere learn - generations
    # TODO Adding the option to not save timeseries to DB, as default?
    # ts_table_created = False
    ts_table_name = 'z_ts_%s' % str(metrics_id)
    try:
        ts_meta = MetaData()
        # @modified 20161222 - Task #1812: z_fp table type
        # Changed to InnoDB from MyISAM as no files open issues and MyISAM clean
        # up, there can be LOTS of file_per_table z_fp_ tables/files without
        # the MyISAM issues.  z_fp_ tables are mostly read and will be shuffled
        # in the table cache as required.
        ts_metric_table = Table(
            ts_table_name, ts_meta,
            Column('id', Integer, primary_key=True),
            Column('fp_id', Integer, nullable=False, key='fp_id'),
            Column('timestamp', Integer, nullable=False),
            Column('value', DOUBLE(), nullable=True),
            mysql_charset='utf8',
            mysql_key_block_size='255',
            mysql_engine='InnoDB')
        ts_metric_table.create(engine, checkfirst=True)
        # ts_table_created = True
        current_logger.info('create_features_profile :: metric ts table created OK - %s' % (ts_table_name))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to create table - %s' % ts_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp I believe to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    # Insert timeseries that the features profile was created from
    raw_timeseries = []
    anomaly_json = '%s/%s.json' % (metric_training_data_dir, base_name)
    if path.isfile(anomaly_json):
        current_logger.info('create_features_profile :: metric anomaly json found OK - %s' % (anomaly_json))
        try:
            # Read the timeseries json file
            with open(anomaly_json, 'r') as f:
                raw_timeseries = f.read()
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            fail_msg = 'error :: create_features_profile :: failed to read timeseries data from %s' % anomaly_json
            current_logger.error('%s' % (fail_msg))
            fail_msg = 'error: failed to read timeseries data from %s' % anomaly_json
            # end = timer()
            if context == 'training' or context == 'features_profile':
                # @added 20170806 - Bug #2130: MySQL - Aborted_clients
                # Added missing disposal
                if engine:
                    fp_create_engine_disposal(current_skyline_app, engine)
                # Raise to webapp I believe to provide traceback to user in UI
                raise
    else:
        trace = 'none'
        fail_msg = 'error: file not found - %s' % (anomaly_json)
        current_logger.error(fail_msg)
        # raise

    # Convert the raw timeseries string into a list of datapoints
    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
    timeseries = literal_eval(timeseries_array_str)

    datapoints = timeseries
    validated_timeseries = []
    for datapoint in datapoints:
        try:
            new_datapoint = [str(int(datapoint[0])), float(datapoint[1])]
            validated_timeseries.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    insert_statement = []
    for ts, value in validated_timeseries:
        insert_statement.append({'fp_id': new_fp_id, 'timestamp': ts, 'value': value},)
    try:
        connection = engine.connect()
        connection.execute(ts_metric_table.insert(), insert_statement)
        connection.close()
        current_logger.info('create_features_profile :: fp_id - %s - timeseries inserted into %s' % (str(new_fp_id), ts_table_name))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to insert the timeseries into %s' % ts_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should exist, continuing' % context)

    # Create a created features profile file
    try:
        # data = '[%s, %s, ]' % (new_fp_id, str(int(time.time())))
        # write_data_to_file(skyline_app, features_profile_created_file, 'w', data)
        # @modified 20170115 -  Feature #1854: Ionosphere learn - generations
        # Added parent_id and generation
        data = '[%s, %s, \'%s\', %s, %s, %s, %s, %s, %s]' % (
            new_fp_id, str(int(time.time())), str(tsfresh_version),
            str(calculated_time), str(fcount), str(fsum), str(ts_full_duration),
            str(fp_parent_id), str(fp_generation))
        write_data_to_file(current_skyline_app, features_profile_created_file, 'w', data)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to write fp.created file'
        current_logger.error('%s' % fail_msg)

    # Set ionosphere_enabled for the metric
    try:
        # update_statement = 'UPDATE metrics SET ionosphere_enabled=1 WHERE id=%s' % str(metrics_id)
        connection = engine.connect()
        # result = connection.execute('UPDATE metrics SET ionosphere_enabled=1 WHERE id=%s' % str(metrics_id))
        # connection.execute(ts_metric_table.insert(), insert_statement)
        connection.execute(
            metrics_table.update(
                metrics_table.c.id == metrics_id).values(ionosphere_enabled=1))
        connection.close()
        current_logger.info('create_features_profile :: ionosphere_enabled set on metric id: %s' % str(metrics_id))
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error :: create_features_profile :: could not update metrics table and set ionosphere_enabled on id %s' % str(metrics_id)
        current_logger.error('%s' % fail_msg)
        # raise

    # Copy data from training data dir to features_profiles dir
    if not path.isdir(ts_features_profile_dir):
        mkdir_p(ts_features_profile_dir)

    if path.isdir(ts_features_profile_dir):
        current_logger.info('create_features_profile :: fp_id - %s - features profile dir created - %s' % (str(new_fp_id), ts_features_profile_dir))
        # src_files = os.listdir(src)
        # for file_name in src_files:
        #    full_file_name = path.join(src, file_name)
        #    if (path.isfile(full_file_name)):
        #        shutil.copy(full_file_name, dest)

        data_files = []
        try:
            glob_path = '%s/*.*' % metric_training_data_dir
            data_files = glob.glob(glob_path)
        except:
            trace = traceback.format_exc()
            current_logger.error('%s' % trace)
            current_logger.error('error :: create_features_profile :: glob - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))

        for i_file in data_files:
            try:
                shutil.copy(i_file, ts_features_profile_dir)
                current_logger.info('create_features_profile :: fp_id - %s - training data copied - %s' % (str(new_fp_id), i_file))
            except shutil.Error as e:
                trace = traceback.format_exc()
                current_logger.error('%s' % trace)
                current_logger.error('error :: create_features_profile :: shutil error - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))
                current_logger.error('error :: create_features_profile :: %s' % (e))
            # Any error saying that the directory doesn't exist
            except OSError as e:
                trace = traceback.format_exc()
                current_logger.error('%s' % trace)
                current_logger.error('error :: create_features_profile :: OSError error - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))
                current_logger.error('error :: create_features_profile :: %s' % (e))
        current_logger.info('create_features_profile :: fp_id - %s - training data copied to %s' % (str(new_fp_id), ts_features_profile_dir))
    else:
        current_logger.error('error :: create_features_profile :: fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))

    current_logger.info('create_features_profile :: disposing of any engine')
    try:
        if engine:
            fp_create_engine_disposal(current_skyline_app, engine)
        else:
            current_logger.info('create_features_profile :: no engine to dispose of')
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        current_logger.error('error :: create_features_profile :: error disposing of engine - fp_id - %s - %s' % (str(new_fp_id), ts_features_profile_dir))

    # @added 20170113 - Feature #1854: Ionosphere learn - Redis ionosphere.learn.work namespace
    # Ionosphere learn needs Redis works sets
    # When a features profile is created there needs to be work added to a Redis
    # set. When a human makes a features profile, we want Ionosphere to make a
    # use_full_duration_days features profile valid_learning_duration (e.g.
    # 3361) later.
    if settings.IONOSPHERE_LEARN and new_fp_id:
        create_redis_work_item = False
        if context == 'training_data' and ionosphere_job == 'learn_fp_human':
            create_redis_work_item = True
            # @modified 20170120 -  Feature #1854: Ionosphere learn - generations
            # Added fp_learn parameter to allow the user to not learn the
            # use_full_duration_days
            if not fp_learn:
                create_redis_work_item = False
                current_logger.info('fp_learn is False, not adding an item to Redis ionosphere.learn.work set')

        if ionosphere_job == 'learn_fp_automatic':
            create_redis_work_item = True
            # @added 20170131 - Feature #1886 Ionosphere learn - child like parent with evolutionary maturity
            # TODO: here a check may be required to evaluate whether the origin_fp_id
            #       had a use_full_duration features profile created, however
            #       due to the fact that it is in learn, suggests that it did
            #       have, not 100% sure.
            origin_fp_id_was_allowed_to_learn = False
            child_use_full_duration_count_of_origin_fp_id = 1
            # TODO: Determine the state
            # child_use_full_duration_count_of_origin_fp_id = SELECT COUNT(id) FROM ionosphere WHERE parent_id=origin_fp_id AND full_duration=use_full_duration
            if child_use_full_duration_count_of_origin_fp_id == 0:
                current_logger.info('the origin parent was not allowed to learn, not adding to Redis ionosphere.learn.work set')
                create_redis_work_item = False

        if create_redis_work_item:
            try:
                current_logger.info(
                    'adding work to Redis ionosphere.learn.work set - [\'Soft\', \'%s\', %s, \'%s\', %s, %s] to make a learn features profile later' % (
                        str(ionosphere_job), str(requested_timestamp), base_name,
                        str(new_fp_id), str(fp_generation)))
                redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                redis_conn.sadd('ionosphere.learn.work', ['Soft', str(ionosphere_job), int(requested_timestamp), base_name, int(new_fp_id), int(fp_generation)])
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: failed adding work to Redis ionosphere.learn.work set - [\'Soft\', \'%s\', %s, \'%s\', %s, %s] to make a learn features profile later' % (
                        str(ionosphere_job), str(requested_timestamp), base_name,
                        str(new_fp_id), str(fp_generation)))

        # @added 20170806 - Bug #2130: MySQL - Aborted_clients
        # Added missing disposal
        if engine:
            fp_create_engine_disposal(current_skyline_app, engine)

    return str(new_fp_id), True, False, fail_msg, trace
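A minimal sketch of how a caller such as the webapp or the Ionosphere learn worker might invoke create_features_profile; the argument values are illustrative and the unpacking follows the return signature documented in the docstring above.

# Illustrative call only - the app name, timestamp, metric, context and job
# values are examples
fp_id, fp_created, fp_exists, fail_msg, trace = create_features_profile(
    'webapp', 1586500200, 'stats.web.requests', 'training_data',
    'learn_fp_human', 0, 0, False)
if fp_created:
    print('features profile %s created (already existed: %s)' % (fp_id, fp_exists))
else:
    print('could not create features profile: %s' % fail_msg)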
Example #10
0
    def spin_process(self, i, run_timestamp):
        """
        Assign a metric for a process to analyze.
        """

        # Discover metric to analyze
        metric_var_files = [f for f in listdir(settings.MIRAGE_CHECK_PATH) if isfile(join(settings.MIRAGE_CHECK_PATH, f))]

        # Check if this process is unnecessary
        if len(metric_var_files) == 0:
            return

        metric_var_files_sorted = sorted(metric_var_files)
        metric_check_file = '%s/%s' % (
            settings.MIRAGE_CHECK_PATH, str(metric_var_files_sorted[0]))

        # Load metric variables
        self.load_metric_vars(metric_check_file)

        # Test metric variables
        if len(metric_vars.metric) == 0:
            return
        else:
            metric = metric_vars.metric
            metric_name = ['metric_name', metric_vars.metric]
            self.metric_variables.append(metric_name)
        if len(metric_vars.value) == 0:
            return
        else:
            metric_value = ['metric_value', metric_vars.value]
            self.metric_variables.append(metric_value)
        if len(metric_vars.hours_to_resolve) == 0:
            return
        else:
            hours_to_resolve = ['hours_to_resolve', metric_vars.hours_to_resolve]
            self.metric_variables.append(hours_to_resolve)
        if len(metric_vars.metric_timestamp) == 0:
            return
        else:
            metric_timestamp = ['metric_timestamp', metric_vars.metric_timestamp]
            self.metric_variables.append(metric_timestamp)

        # Ignore any metric check with a timestamp older than MIRAGE_STALE_SECONDS
        int_metric_timestamp = int(metric_vars.metric_timestamp)
        int_run_timestamp = int(run_timestamp)
        metric_timestamp_age = int_run_timestamp - int_metric_timestamp
        if metric_timestamp_age > settings.MIRAGE_STALE_SECONDS:
            logger.info('stale check       :: %s check request is %s seconds old - discarding' % (metric_vars.metric, metric_timestamp_age))
            # Remove metric check file
#            try:
#                os.remove(metric_check_file)
#            except OSError:
#                pass
#            return
            if os.path.exists(metric_check_file):
                os.remove(metric_check_file)
                logger.info('removed %s' % (metric_check_file))
            else:
                logger.info('could not remove %s' % (metric_check_file))

        # Calculate hours second order resolution to seconds
        second_order_resolution_seconds = int(metric_vars.hours_to_resolve) * 3600

        # Calculate graphite from and until parameters from the metric timestamp
        graphite_until = datetime.datetime.fromtimestamp(int(metric_vars.metric_timestamp)).strftime('%H:%M_%Y%m%d')
        int_second_order_resolution_seconds = int(second_order_resolution_seconds)
        second_resolution_timestamp = int_metric_timestamp - int_second_order_resolution_seconds
        graphite_from = datetime.datetime.fromtimestamp(int(second_resolution_timestamp)).strftime('%H:%M_%Y%m%d')
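        # %H:%M_%Y%m%d is the time format accepted by the Graphite render API
        # from and until parameters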

        # Remove any old json file related to the metric
        metric_json_file = '%s/%s/%s.json' % (
            settings.MIRAGE_DATA_FOLDER, str(metric_vars.metric),
            str(metric_vars.metric))
        try:
            os.remove(metric_json_file)
        except OSError:
            pass

        # Get data from graphite
        logger.info(
            'retrieve data     :: surfacing %s timeseries from graphite for %s seconds' % (
                metric_vars.metric, second_order_resolution_seconds))
        self.surface_graphite_metric_data(metric_vars.metric, graphite_from, graphite_until)

        # Check there is a json timeseries file to test
        if not os.path.isfile(metric_json_file):
            logger.error(
                'error :: retrieve failed - failed to surface %s timeseries from graphite' % (
                    metric_vars.metric))
            # Remove metric check file
            try:
                os.remove(metric_check_file)
            except OSError:
                pass
            return
        else:
            logger.info('retrieved data    :: for %s at %s seconds' % (
                metric_vars.metric, second_order_resolution_seconds))

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        self.check_if_parent_is_alive()

        with open((metric_json_file), 'r') as f:
            timeseries = json.loads(f.read())
            logger.info('data points surfaced :: %s' % (len(timeseries)))
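        # timeseries is now a list of [timestamp, value] datapoints as
        # surfaced from Graphite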

        try:
            logger.info('analyzing         :: %s at %s seconds' % (metric_vars.metric, second_order_resolution_seconds))
            anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_vars.metric, second_order_resolution_seconds)
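            # run_selected_algorithm returns whether the data point is
            # anomalous, the per algorithm boolean ensemble (ordered as per
            # settings.MIRAGE_ALGORITHMS) and the data point that was tested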

            # If it's anomalous, add it to list
            if anomalous:
                base_name = metric.replace(settings.FULL_NAMESPACE, '', 1)
                anomalous_metric = [datapoint, base_name]
                self.anomalous_metrics.append(anomalous_metric)
                logger.info('anomaly detected  :: %s with %s' % (metric_vars.metric, metric_vars.value))
                # It runs so fast that this sleep still allows us to process 30 anomalies/min
                sleep(2)

                # Get the anomaly breakdown - who returned True?
                triggered_algorithms = []
                for index, value in enumerate(ensemble):
                    if value:
                        algorithm = settings.MIRAGE_ALGORITHMS[index]
                        anomaly_breakdown[algorithm] += 1
                        triggered_algorithms.append(algorithm)

                # If Crucible or Panorama are enabled determine details
                determine_anomaly_details = False
                if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED:
                    determine_anomaly_details = True
                if settings.PANORAMA_ENABLED:
                    determine_anomaly_details = True

                if determine_anomaly_details:
                    metric_timestamp = str(int(timeseries[-1][0]))
                    from_timestamp = str(int(timeseries[1][0]))
                    timeseries_dir = base_name.replace('.', '/')

                # If Panorama is enabled - create a Panorama check
                if settings.PANORAMA_ENABLED:
                    if not os.path.exists(settings.PANORAMA_CHECK_PATH):
                        if python_version == 2:
                            mode_arg = int('0755')
                        if python_version == 3:
                            mode_arg = 0o755
                        os.makedirs(settings.PANORAMA_CHECK_PATH, mode_arg)

                    # Note:
                    # The values are intentionally enclosed in single quotes
                    # because the imp.load_source used shifts the decimal
                    # position when they are double quoted, e.g.
                    # value = "5622.0" gets imported as
                    # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                    # single quoting results in the desired
                    # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                    added_at = str(int(time()))
                    source = 'graphite'
                    panaroma_anomaly_data = 'metric = \'%s\'\n' \
                                            'value = \'%s\'\n' \
                                            'from_timestamp = \'%s\'\n' \
                                            'metric_timestamp = \'%s\'\n' \
                                            'algorithms = %s\n' \
                                            'triggered_algorithms = %s\n' \
                                            'app = \'%s\'\n' \
                                            'source = \'%s\'\n' \
                                            'added_by = \'%s\'\n' \
                                            'added_at = \'%s\'\n' \
                        % (base_name, str(datapoint), from_timestamp,
                           metric_timestamp, str(settings.MIRAGE_ALGORITHMS),
                           triggered_algorithms, skyline_app, source,
                           this_host, added_at)

                    # Create an anomaly file with details about the anomaly
                    panaroma_anomaly_file = '%s/%s.%s.txt' % (
                        settings.PANORAMA_CHECK_PATH, added_at,
                        base_name)
                    try:
                        write_data_to_file(
                            skyline_app, panaroma_anomaly_file, 'w',
                            panaroma_anomaly_data)
                        logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file))
                    except:
                        logger.error('error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file))
                        logger.info(traceback.format_exc())

                # If crucible is enabled - save timeseries and create a
                # crucible check
                if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED:
                    metric_timestamp = str(int(timeseries[-1][0]))
                    from_timestamp = str(int(timeseries[1][0]))
                    timeseries_dir = base_name.replace('.', '/')
                    crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp
                    if not os.path.exists(crucible_anomaly_dir):
                        if python_version == 2:
                            mode_arg = int('0755')
                        if python_version == 3:
                            mode_arg = 0o755
                        os.makedirs(crucible_anomaly_dir, mode_arg)

                    # Note:
                    # The value is intentionally enclosed in single quotes
                    # because the imp.load_source used in crucible shifts the
                    # decimal position when it is double quoted, e.g.
                    # value = "5622.0" gets imported as
                    # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                    # single quoting results in the desired
                    # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0

                    crucible_anomaly_data = 'metric = \'%s\'\n' \
                                            'value = \'%s\'\n' \
                                            'from_timestamp = \'%s\'\n' \
                                            'metric_timestamp = \'%s\'\n' \
                                            'algorithms = %s\n' \
                                            'triggered_algorithms = %s\n' \
                                            'anomaly_dir = \'%s\'\n' \
                                            'graphite_metric = True\n' \
                                            'run_crucible_tests = False\n' \
                                            'added_by = \'%s\'\n' \
                                            'added_at = \'%s\'\n' \
                        % (base_name, str(datapoint), from_timestamp,
                           metric_timestamp, str(settings.MIRAGE_ALGORITHMS),
                           triggered_algorithms, crucible_anomaly_dir,
                           skyline_app, metric_timestamp)

                    # Create an anomaly file with details about the anomaly
                    crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, base_name)
                    try:
                        write_data_to_file(
                            skyline_app, crucible_anomaly_file, 'w',
                            crucible_anomaly_data)
                        logger.info('added crucible anomaly file :: %s' % (crucible_anomaly_file))
                    except:
                        logger.error('error :: failed to add crucible anomaly file :: %s' % (crucible_anomaly_file))
                        logger.info(traceback.format_exc())

                    # Create timeseries json file with the timeseries
                    json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name)
                    timeseries_json = str(timeseries).replace('[', '(').replace(']', ')')
                    try:
                        write_data_to_file(skyline_app, json_file, 'w', timeseries_json)
                        logger.info('added crucible timeseries file :: %s' % (json_file))
                    except:
                        logger.error('error :: failed to add crucible timeseries file :: %s' % (json_file))
                        logger.info(traceback.format_exc())

                    # Create a crucible check file
                    crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name)
                    try:
                        write_data_to_file(
                            skyline_app, crucible_check_file, 'w',
                            crucible_anomaly_data)
                        logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp))
                    except:
                        logger.error('error :: failed to add crucible check file :: %s' % (crucible_check_file))
                        logger.info(traceback.format_exc())
            else:
                base_name = metric.replace(settings.FULL_NAMESPACE, '', 1)
                not_anomalous_metric = [datapoint, base_name]
                self.not_anomalous_metrics.append(not_anomalous_metric)
                logger.info('not anomalous     :: %s with %s' % (metric_vars.metric, metric_vars.value))

        # It could have been deleted by the Roomba
        except TypeError:
            exceptions['DeletedByRoomba'] += 1
            logger.info('exceptions        :: DeletedByRoomba')
        except TooShort:
            exceptions['TooShort'] += 1
            logger.info('exceptions        :: TooShort')
        except Stale:
            exceptions['Stale'] += 1
            logger.info('exceptions        :: Stale')
        except Boring:
            exceptions['Boring'] += 1
            logger.info('exceptions        :: Boring')
        except:
            exceptions['Other'] += 1
            logger.info('exceptions        :: Other')
            logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.mirage_anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.mirage_exceptions_q.put((key, value))

        # Remove metric check file
        try:
            os.remove(metric_check_file)
        except OSError:
            pass
Example #11
0
def submit_crucible_job(from_timestamp, until_timestamp, metrics_list,
                        namespaces_list, source, alert_interval, user_id, user,
                        add_to_panorama, pad_timeseries, training_data_json,
                        run_algorithms):
    """
    Get a list of all the metrics passed and generate Crucible check files for
    each

    :param from_timestamp: the timestamp at which to start the time series
    :param until_timestamp: the timestamp at which to end the time series
    :param metrics_list: a list of metric names to analyse
    :param namespaces_list: a list of metric namespaces to analyse
    :param source: the source webapp making the request
    :param alert_interval: the alert_interval at which Crucible should trigger
        anomalies
    :param user_id: the user id of the user making the request
    :param user: the username making the request
    :param add_to_panorama: whether Crucible should add Skyline CONSENSUS
        anomalies to Panorama
    :param pad_timeseries: the amount of data to pad the time series with
    :param training_data_json: the full path to the training_data json file if
        source is training_data
    :param run_algorithms: list of algorithms to run
    :type from_timestamp: int
    :type until_timestamp: int
    :type metrics_list: list
    :type namespaces_list: list
    :type source: str
    :type alert_interval: int
    :type user_id: int
    :type user: str
    :type add_to_panorama: boolean
    :type pad_timeseries: str
    :type training_data_json: str
    :type run_algorithms: list
    :return: tuple
    :rtype:  (str, list, str, str)

    Returns (crucible_job_id, metrics_submitted_to_process, fail_msg, trace)

    """

    fail_msg = None
    trace = None
    crucible_job_id = None
    metrics_submitted_to_process = 0

    # Generate a job id based on the YMDHMS.user_id and a job directory
    try:
        jobid_timestamp = int(time())
        jobid_datetimestamp = dt.datetime.fromtimestamp(
            jobid_timestamp).strftime('%Y%m%d%H%M%S')
        crucible_job_id = '%s.%s' % (str(jobid_datetimestamp), str(user_id))
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: failed to determine a crucible_job_id')
        raise  # to webapp to return in the UI

    # Create the crucible job directory
    try:
        crucible_path = os.path.dirname(settings.CRUCIBLE_DATA_FOLDER)
        crucible_job_dir = '%s/jobs/%s' % (crucible_path, crucible_job_id)
        if not path.exists(crucible_job_dir):
            logger.info('creating crucible job directory - %s' %
                        (str(crucible_job_dir)))
            mkdir_p(crucible_job_dir)
    except:
        trace = traceback.format_exc()
        fail_msg = 'error :: failed to create the crucible job directory'
        logger.error(trace)
        logger.error(fail_msg)
        raise  # to webapp to return in the UI

    # TODO: add checks of the passed metric names
    metric_names = []
    if metrics_list:
        logger.info('submit_crucible_job :: %s metrics passed' %
                    str(len(metrics_list)))
        for metric in metrics_list:
            metric_names.append(metric)

    # TODO: add checks of the metric namespaces (harder to do), so that the UI
    # reports an error to the user rather than sending a bad or non-existent
    # metric to Crucible
    if namespaces_list:
        logger.info('submit_crucible_job :: %s namespaces passed' %
                    str(len(namespaces_list)))
        logger.info(
            'submit_crucible_job :: determine metrics for submit_crucible_job between %s and %s'
            % (str(from_timestamp), str(until_timestamp)))
        logger.info('getting MySQL engine')
        try:
            engine, fail_msg, trace = get_an_engine()
            logger.info(fail_msg)
        except:
            trace = traceback.format_exc()
            logger.error(trace)
            logger.error('%s' % fail_msg)
            logger.error(
                'error :: could not get a MySQL engine to get metric names')
            raise  # to webapp to return in the UI

        if not engine:
            trace = 'none'
            fail_msg = 'error :: engine not obtained'
            logger.error(fail_msg)
            raise

        try:
            metrics_table, log_msg, trace = metrics_table_meta(
                skyline_app, engine)
            logger.info(log_msg)
            logger.info('metrics_table OK')
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to get metrics_table meta')
            if engine:
                engine_disposal(engine)
            raise  # to webapp to return in the UI

        metrics_like_query = text(
            """SELECT metric FROM metrics WHERE metric LIKE :like_string""")
        for namespace in namespaces_list:
            try:
                connection = engine.connect()
                results = connection.execute(metrics_like_query,
                                             like_string=str(namespace))
                connection.close()
                for row in results:
                    metric_name = str(row[0])
                    metric_names.append(metric_name)
            except:
                trace = traceback.format_exc()
                logger.error(trace)
                logger.error(
                    'error :: could not determine metrics from metrics table')
                if engine:
                    engine_disposal(engine)
                raise
        logger.info(
            'submit_crucible_job :: %s metrics determined from passed namespaces'
            % str(len(metric_names)))

    logger.info('submit_crucible_job :: %s metrics to process' %
                str(len(metric_names)))
    metrics_submitted_to_process = []
    datapoint = 0
    triggered_algorithms = [
        'histogram_bins', 'first_hour_average', 'stddev_from_average',
        'grubbs', 'ks_test', 'mean_subtraction_cumulation',
        'median_absolute_deviation', 'stddev_from_moving_average',
        'least_squares'
    ]
    added_at = int(time())
    for base_name in metric_names:
        sane_metricname = filesafe_metricname(str(base_name))
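        # Counter type (derivative) metrics are wrapped in Graphite's
        # nonNegativeDerivative() so Crucible analyses the rate of change
        # rather than the raw monotonically increasing counter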
        derivative_metric = is_derivative_metric(skyline_app, base_name)
        if derivative_metric:
            target = 'nonNegativeDerivative(%s)' % base_name
        else:
            target = base_name
        # Generate a metric job directory
        crucible_anomaly_dir = '%s/%s' % (crucible_job_dir, sane_metricname)
        try:
            if not path.exists(crucible_anomaly_dir):
                logger.info('creating crucible metric job directory - %s' %
                            (str(crucible_anomaly_dir)))
                mkdir_p(crucible_anomaly_dir)
        except:
            trace = traceback.format_exc()
            fail_msg = 'error :: failed to create the crucible metric job directory'
            logger.error(trace)
            logger.error(fail_msg)
            raise  # to webapp to return in the UI
        if source == 'graphite':
            graphite_metric = True
        else:
            graphite_metric = False

        # @added 20200422 - Feature #3500: webapp - crucible_process_metrics
        #                   Feature #1448: Crucible web UI
        # In order for metrics to be analysed in Crucible like the Analyzer or
        # Mirage analysis, the time series data needs to be padded
        # Added pad_timeseries
        graphite_override_uri_parameters = 'from=%s&until=%s&target=%s' % (
            str(from_timestamp), str(until_timestamp), target)
        timeseries_full_duration = int(until_timestamp) - int(from_timestamp)
        pad_timeseries_with = 0
        if pad_timeseries == 'auto':
            if timeseries_full_duration > 3600:
                pad_timeseries_with = 3600
            if timeseries_full_duration > 86400:
                pad_timeseries_with = 86400
        if pad_timeseries == '86400':
            pad_timeseries_with = 86400
        if pad_timeseries == '604800':
            pad_timeseries_with = 604800
        if pad_timeseries == '0':
            pad_timeseries_with = 0
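        # pad_timeseries_with now holds the number of seconds of additional
        # history to request from Graphite before from_timestamp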
        if pad_timeseries_with:
            try:
                padded_from_timestamp = int(
                    from_timestamp) - pad_timeseries_with
                graphite_override_uri_parameters = 'from=%s&until=%s&target=%s' % (
                    str(padded_from_timestamp), str(until_timestamp), target)
                logger.info('padding time series with %s seconds - %s' %
                            (str(pad_timeseries_with),
                             str(graphite_override_uri_parameters)))
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: failed to construct graphite_override_uri_parameters with pad_timeseries_with %s'
                    % str(pad_timeseries_with))

        # @added 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
        # Allow the user to pass algorithms to run
        algorithms = settings.ALGORITHMS
        if run_algorithms:
            algorithms = run_algorithms

        # @modified 20200421 - Feature #3500: webapp - crucible_process_metrics
        #                      Feature #1448: Crucible web UI
        # Added add_to_panorama
        # @added 20200607 - Feature #3630: webapp - crucible_process_training_data
        # Added training_data_json
        crucible_anomaly_data = 'metric = \'%s\'\n' \
                                'value = \'%s\'\n' \
                                'from_timestamp = \'%s\'\n' \
                                'metric_timestamp = \'%s\'\n' \
                                'algorithms = %s\n' \
                                'triggered_algorithms = %s\n' \
                                'anomaly_dir = \'%s\'\n' \
                                'graphite_metric = %s\n' \
                                'run_crucible_tests = True\n' \
                                'added_by = \'%s\'\n' \
                                'added_at = \'%s\'\n' \
                                'graphite_override_uri_parameters = \'%s\'\n' \
                                'alert_interval = \'%s\'\n' \
                                'add_to_panorama = %s\n' \
                                'training_data_json = %s\n' \
            % (base_name, str(datapoint), str(from_timestamp),
               # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
               # str(until_timestamp), str(settings.ALGORITHMS),
               str(until_timestamp), str(algorithms),
               triggered_algorithms, crucible_anomaly_dir, str(graphite_metric),
               skyline_app, str(added_at), str(graphite_override_uri_parameters),
               str(alert_interval), str(add_to_panorama), str(training_data_json))

        # Create an anomaly file with details about the anomaly
        crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir,
                                               sane_metricname)
        try:
            write_data_to_file(skyline_app, crucible_anomaly_file, 'w',
                               crucible_anomaly_data)
            logger.info('added crucible anomaly file :: %s' %
                        (crucible_anomaly_file))
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to add crucible anomaly file :: %s' %
                         (crucible_anomaly_file))
        # Create a crucible check file
        crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH,
                                                str(added_at), sane_metricname)
        try:
            write_data_to_file(skyline_app, crucible_check_file, 'w',
                               crucible_anomaly_data)
            logger.info('added crucible check :: %s,%s' %
                        (base_name, str(added_at)))
            metrics_submitted_to_process.append(base_name)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to add crucible check file :: %s' %
                         (crucible_check_file))

    return (crucible_job_id, metrics_submitted_to_process, fail_msg, trace)
Example #12
0
def send_crucible_job_metric_to_panorama(crucible_job_id, base_name, user_id,
                                         user, skyline_consensus_anomalies):
    """
    Send the Crucible Skyline CONSENSUS anomalies for a crucible_job and metric
    to Panorama to insert into the anomalies database.

    :param crucible_job_id: the crucible_job_id
    :param base_name: the metric name
    :param user_id: the user_id
    :param user: the username
    :param skyline_consensus_anomalies: the Crucible Skyline CONSENSUS anomalies
    :type crucible_job_id: str
    :type base_name: str
    :type user_id: int
    :type user: str
    :type skyline_consensus_anomalies: list

    :return: tuple
    :rtype:  (int, str, str)

    Returns (len(skyline_consensus_anomalies), fail_msg, trace)

    """

    fail_msg = None
    trace = None
    added_at = int(time())
    crucible_path = os.path.dirname(settings.CRUCIBLE_DATA_FOLDER)
    jobs_data_dir = '%s/jobs' % crucible_path
    data_dir = '%s/%s/%s' % (jobs_data_dir, crucible_job_id, base_name)
    crucible_job_details_filename = '%s.txt' % base_name
    crucible_job_details_file = '%s/%s' % (data_dir,
                                           crucible_job_details_filename)
    crucible_job_details = []
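    # The job details file contains lines of the form "key = value", each line
    # is converted into a ['key', 'value'] pair so that specific fields, for
    # example from_timestamp, can be read by index below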
    try:
        logger.info(
            'send_crucible_job_metric_to_panorama :: getting crucible_job_details from file - %s'
            % (crucible_job_details_file))
        with open(crucible_job_details_file) as f:
            for line in f:
                no_new_line = line.replace('\n', '')
                no_equal_line = no_new_line.replace(' = ', ',')
                array = str(no_equal_line.split(',', 1))
                add_line = literal_eval(array)
                crucible_job_details.append(add_line)
    except:
        trace = traceback.format_exc()
        fail_msg = 'error :: send_crucible_job_metric_to_panorama - failed to get crucible_job_details from file - %s' % crucible_job_details_file
        logger.error(trace)
        logger.error(fail_msg)
        raise  # to webapp to return in the UI
    try:
        timestamp_str = str(crucible_job_details[2][1])
        new_timestamp_str = timestamp_str.replace("'", "")
        from_timestamp = int(new_timestamp_str)
    except:
        trace = traceback.format_exc()
        fail_msg = 'error :: send_crucible_job_metric_to_panorama - failed to determine from_timestamp from get crucible_job_details'
        logger.error(trace)
        logger.error(fail_msg)
        raise  # to webapp to return in the UI

    label = 'Crucible job %s' % str(crucible_job_id)
    sane_metricname = filesafe_metricname(str(base_name))

    # skyline_consensus_anomalies format
    # [timestamp, value, anomaly_score, triggered_algorithms]
    # [skyline_anomaly[0], skyline_anomaly[1], skyline_anomaly[2], skyline_anomaly[3]]
    # [1583234400, 44.39999999990687, 2, ['histogram_bins', 'median_absolute_deviation']],
    for timestamp, datapoint, anomaly_score, triggered_algorithms in skyline_consensus_anomalies:
        # To allow multiple Panorama anomaly files to be added quickly, just
        # increment added_at by 1 second so that all the files have a
        # unique name
        added_at += 1
        # Note:
        # The values are intentionally enclosed in single quotes
        # because the imp.load_source used shifts the decimal
        # position when they are double quoted, e.g.
        # value = "5622.0" gets imported as
        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
        # single quoting results in the desired
        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
        source = 'graphite'
        panaroma_anomaly_data = 'metric = \'%s\'\n' \
                                'value = \'%s\'\n' \
                                'from_timestamp = \'%s\'\n' \
                                'metric_timestamp = \'%s\'\n' \
                                'algorithms = %s\n' \
                                'triggered_algorithms = %s\n' \
                                'app = \'%s\'\n' \
                                'source = \'%s\'\n' \
                                'added_by = \'%s\'\n' \
                                'added_at = \'%s\'\n' \
                                'label = \'%s\'\n' \
                                'user_id = \'%s\'\n' \
            % (base_name, str(datapoint), from_timestamp,
               str(timestamp), str(settings.ALGORITHMS),
               triggered_algorithms, skyline_app, source,
               this_host, str(added_at), label, str(user_id))
        # Create an anomaly file with details about the anomaly
        panaroma_anomaly_file = '%s/%s.%s.txt' % (settings.PANORAMA_CHECK_PATH,
                                                  added_at, sane_metricname)
        try:
            write_data_to_file(skyline_app, panaroma_anomaly_file, 'w',
                               panaroma_anomaly_data)
            logger.info(
                'send_crucible_job_metric_to_panorama - added panorama anomaly file :: %s'
                % (panaroma_anomaly_file))
        except:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: send_crucible_job_metric_to_panorama - failed to add panorama anomaly file :: %s'
                % (panaroma_anomaly_file))

    crucible_job_sent_to_panorama_file = '%s/%s.%s.%s.sent_to_panorama.txt' % (
        data_dir, str(added_at), crucible_job_id, base_name)
    panorama_done_data = [added_at, int(user_id), skyline_consensus_anomalies]
    try:
        write_data_to_file(skyline_app, crucible_job_sent_to_panorama_file,
                           'w', str(panorama_done_data))
        logger.info(
            'send_crucible_job_metric_to_panorama - added set to panorama crucible job file :: %s'
            % (crucible_job_sent_to_panorama_file))
        logger.info(
            'send_crucible_job_metric_to_panorama - with contents :: %s' %
            (panorama_done_data))
    except:
        logger.error(traceback.format_exc())
        logger.error(
            'error :: send_crucible_job_metric_to_panorama - failed to add panorama crucible job file :: %s'
            % (crucible_job_sent_to_panorama_file))

    return (len(skyline_consensus_anomalies), fail_msg, trace)
Example #13
0
    def spin_process(self, i, boundary_metrics):
        """
        Assign a bunch of metrics for a process to analyze.
        """
        # Determine assigned metrics
        bp = settings.BOUNDARY_PROCESSES
        bm_range = len(boundary_metrics)
        keys_per_processor = int(ceil(float(bm_range) / float(bp)))
        if i == settings.BOUNDARY_PROCESSES:
            assigned_max = len(boundary_metrics)
        else:
            # This fixes a skyline bug: the original skyline code used 1 as the
            # beginning position of the index, but python indices begin with 0
            # assigned_max = len(boundary_metrics)
            # This closes the etsy/skyline pull request opened by @languitar on 17 Jun 2014
            # https://github.com/etsy/skyline/pull/94 Fix analyzer worker metric assignment
            assigned_max = min(len(boundary_metrics), i * keys_per_processor)
        assigned_min = (i - 1) * keys_per_processor
        assigned_keys = range(assigned_min, assigned_max)

        # Compile assigned metrics
        assigned_metrics_and_algos = [boundary_metrics[index] for index in assigned_keys]
        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: printing assigned_metrics_and_algos')
            for assigned_metric_and_algo in assigned_metrics_and_algos:
                logger.info('debug :: assigned_metric_and_algo - %s' % str(assigned_metric_and_algo))

        # Compile assigned metrics
        assigned_metrics = []
        for i in assigned_metrics_and_algos:
            assigned_metrics.append(i[0])

        # Deduplicate unhashable items, preserving order (items are compared
        # by their string representation)
        def unique_noHash(seq):
            seen = set()
            return [x for x in seq if str(x) not in seen and not seen.add(str(x))]
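        # e.g. unique_noHash([['a', 1], ['a', 1], ['b', 2]]) returns
        # [['a', 1], ['b', 2]]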

        unique_assigned_metrics = unique_noHash(assigned_metrics)

        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: unique_assigned_metrics - %s' % str(unique_assigned_metrics))
            logger.info('debug :: printing unique_assigned_metrics:')
            for unique_assigned_metric in unique_assigned_metrics:
                logger.info('debug :: unique_assigned_metric - %s' % str(unique_assigned_metric))

        # Check if this process is unnecessary
        if len(unique_assigned_metrics) == 0:
            return

        # Multi get series
        try:
            raw_assigned = self.redis_conn.mget(unique_assigned_metrics)
        except:
            logger.error('error :: failed to mget assigned_metrics from redis')
            return

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Reset boundary_algorithms
        all_boundary_algorithms = []
        for metric in BOUNDARY_METRICS:
            all_boundary_algorithms.append(metric[1])

        # The unique algorithms that are being used
        boundary_algorithms = unique_noHash(all_boundary_algorithms)
        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: boundary_algorithms - %s' % str(boundary_algorithms))

        discover_run_metrics = []

        # Distill metrics into a run list
        for i, metric_name in enumerate(unique_assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('debug :: unpacking timeseries for %s - %s' % (metric_name, str(i)))
                raw_series = raw_assigned[i]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
            except Exception as e:
                exceptions['Other'] += 1
                logger.error('error :: redis data error: ' + traceback.format_exc())
                logger.error('error :: %s' % e)

            base_name = metric_name.replace(FULL_NAMESPACE, '', 1)

            # Determine the metrics BOUNDARY_METRICS metric tuple settings
            for metrick in BOUNDARY_METRICS:
                CHECK_MATCH_PATTERN = metrick[0]
                check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                pattern_match = check_match_pattern.match(base_name)
                metric_pattern_matched = False
                if pattern_match:
                    metric_pattern_matched = True
                    algo_pattern_matched = False
                    for algo in boundary_algorithms:
                        for metric in BOUNDARY_METRICS:
                            CHECK_MATCH_PATTERN = metric[0]
                            check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                            pattern_match = check_match_pattern.match(base_name)
                            if pattern_match:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: metric and algo pattern MATCHED - " + metric[0] + " | " + base_name + " | " + str(metric[1]))
                                metric_expiration_time = False
                                metric_min_average = False
                                metric_min_average_seconds = False
                                metric_trigger = False
                                algorithm = False
                                algo_pattern_matched = True
                                algorithm = metric[1]
                                try:
                                    if metric[2]:
                                        metric_expiration_time = metric[2]
                                except:
                                    metric_expiration_time = False
                                try:
                                    if metric[3]:
                                        metric_min_average = metric[3]
                                except:
                                    metric_min_average = False
                                try:
                                    if metric[4]:
                                        metric_min_average_seconds = metric[4]
                                except:
                                    metric_min_average_seconds = 1200
                                try:
                                    if metric[5]:
                                        metric_trigger = metric[5]
                                except:
                                    metric_trigger = False
                                try:
                                    if metric[6]:
                                        alert_threshold = metric[6]
                                except:
                                    alert_threshold = False
                                try:
                                    if metric[7]:
                                        metric_alerters = metric[7]
                                except:
                                    metric_alerters = False
                            if metric_pattern_matched and algo_pattern_matched:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info('debug :: added metric - %s, %s, %s, %s, %s, %s, %s, %s, %s' % (str(i), metric_name, str(metric_expiration_time), str(metric_min_average), str(metric_min_average_seconds), str(metric_trigger), str(alert_threshold), metric_alerters, algorithm))
                                discover_run_metrics.append([i, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm])

        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: printing discover_run_metrics')
            for discover_run_metric in discover_run_metrics:
                logger.info('debug :: discover_run_metrics - %s' % str(discover_run_metric))
            logger.info('debug :: build unique boundary metrics to analyze')

        # Determine the unique set of metrics to run
        run_metrics = unique_noHash(discover_run_metrics)

        if ENABLE_BOUNDARY_DEBUG:
            logger.info('debug :: printing run_metrics')
            for run_metric in run_metrics:
                logger.info('debug :: run_metrics - %s' % str(run_metric))

        # Distill timeseries strings and submit to run_selected_algorithm
        for metric_and_algo in run_metrics:
            self.check_if_parent_is_alive()

            try:
                raw_assigned_id = metric_and_algo[0]
                metric_name = metric_and_algo[1]
                base_name = metric_name.replace(FULL_NAMESPACE, '', 1)
                metric_expiration_time = metric_and_algo[2]
                metric_min_average = metric_and_algo[3]
                metric_min_average_seconds = metric_and_algo[4]
                metric_trigger = metric_and_algo[5]
                alert_threshold = metric_and_algo[6]
                metric_alerters = metric_and_algo[7]
                algorithm = metric_and_algo[8]

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('debug :: unpacking timeseries for %s - %s' % (metric_name, str(raw_assigned_id)))

                raw_series = raw_assigned[metric_and_algo[0]]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('debug :: unpacked OK - %s - %s' % (metric_name, str(raw_assigned_id)))

                autoaggregate = False
                autoaggregate_value = 0

                # Determine if the namespace is to be aggregated
                if BOUNDARY_AUTOAGGRERATION:
                    for autoaggregate_metric in BOUNDARY_AUTOAGGRERATION_METRICS:
                        autoaggregate = False
                        autoaggregate_value = 0
                        CHECK_MATCH_PATTERN = autoaggregate_metric[0]
                        base_name = metric_name.replace(FULL_NAMESPACE, '', 1)
                        check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                        pattern_match = check_match_pattern.match(base_name)
                        if pattern_match:
                            autoaggregate = True
                            autoaggregate_value = autoaggregate_metric[1]

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('debug :: BOUNDARY_AUTOAGGRERATION passed - %s - %s' % (metric_name, str(autoaggregate)))

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info(
                        'debug :: analysing - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s' % (
                            metric_name, str(metric_expiration_time),
                            str(metric_min_average),
                            str(metric_min_average_seconds),
                            str(metric_trigger), str(alert_threshold),
                            metric_alerters, autoaggregate,
                            autoaggregate_value, algorithm)
                    )
                    # Dump the timeseries data to a file
                    timeseries_dump_dir = "/tmp/skyline/boundary/" + algorithm
                    self.mkdir_p(timeseries_dump_dir)
                    timeseries_dump_file = timeseries_dump_dir + "/" + metric_name + ".json"
                    with open(timeseries_dump_file, 'w+') as f:
                        f.write(str(timeseries))

                # Check if a metric has its own unique BOUNDARY_METRICS alert
                # tuple. This allows us to paint an entire metric namespace with
                # the same brush AND paint a unique metric or namespace with a
                # different brush or scalpel
                has_unique_tuple = False
                run_tupple = False
                boundary_metric_tuple = (base_name, algorithm, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters)
                wildcard_namespace = True
                for metric_tuple in BOUNDARY_METRICS:
                    if not has_unique_tuple:
                        CHECK_MATCH_PATTERN = metric_tuple[0]
                        check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                        pattern_match = check_match_pattern.match(base_name)
                        if pattern_match:
                            if metric_tuple[0] == base_name:
                                wildcard_namespace = False
                            if not has_unique_tuple:
                                if boundary_metric_tuple == metric_tuple:
                                    has_unique_tuple = True
                                    run_tupple = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info('unique_tuple:')
                                        logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple))
                                        logger.info('metric_tuple: %s' % str(metric_tuple))

                if not has_unique_tuple:
                    if wildcard_namespace:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('wildcard_namespace:')
                            logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple))
                        run_tupple = True
                    else:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('wildcard_namespace: BUT WOULD NOT RUN')
                            logger.info('boundary_metric_tuple: %s' % str(boundary_metric_tuple))

                if ENABLE_BOUNDARY_DEBUG:
                    logger.info('WOULD RUN run_selected_algorithm = %s' % run_tupple)

                if run_tupple:
                    # Submit the timeseries and settings to run_selected_algorithm
                    anomalous, ensemble, datapoint, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm = run_selected_algorithm(
                        timeseries, metric_name,
                        metric_expiration_time,
                        metric_min_average,
                        metric_min_average_seconds,
                        metric_trigger,
                        alert_threshold,
                        metric_alerters,
                        autoaggregate,
                        autoaggregate_value,
                        algorithm
                    )
                    if ENABLE_BOUNDARY_DEBUG:
                        logger.info('debug :: analysed - %s' % (metric_name))
                else:
                    anomalous = False
                    if ENABLE_BOUNDARY_DEBUG:
                        logger.info('debug :: more unique metric tuple not analysed - %s' % (metric_name))

                # If it's anomalous, add it to list
                if anomalous:
                    anomalous_metric = [datapoint, metric_name, metric_expiration_time, metric_min_average, metric_min_average_seconds, metric_trigger, alert_threshold, metric_alerters, algorithm]
                    self.anomalous_metrics.append(anomalous_metric)
                    # Get the anomaly breakdown - who returned True?
                    triggered_algorithms = []
                    for index, value in enumerate(ensemble):
                        if value:
                            anomaly_breakdown[algorithm] += 1
                            triggered_algorithms.append(algorithm)

                    # If Crucible or Panorama are enabled determine details
                    determine_anomaly_details = False
                    if settings.ENABLE_CRUCIBLE and settings.BOUNDARY_CRUCIBLE_ENABLED:
                        determine_anomaly_details = True
                    if settings.PANORAMA_ENABLED:
                        determine_anomaly_details = True

                    if determine_anomaly_details:
                        metric_timestamp = str(int(timeseries[-1][0]))
                        from_timestamp = str(int(timeseries[1][0]))
                        timeseries_dir = base_name.replace('.', '/')

                    # If Panorama is enabled - create a Panorama check
                    if settings.PANORAMA_ENABLED:
                        # Note:
                        # The values are intentionally enclosed in single quotes
                        # because the imp.load_source used shifts the decimal
                        # position when they are double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                        added_at = str(int(time()))
                        source = 'graphite'
                        panaroma_anomaly_data = 'metric = \'%s\'\n' \
                                                'value = \'%s\'\n' \
                                                'from_timestamp = \'%s\'\n' \
                                                'metric_timestamp = \'%s\'\n' \
                                                'algorithms = [\'%s\']\n' \
                                                'triggered_algorithms = [\'%s\']\n' \
                                                'app = \'%s\'\n' \
                                                'source = \'%s\'\n' \
                                                'added_by = \'%s\'\n' \
                                                'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(algorithm), str(algorithm),
                               skyline_app, source, this_host, added_at)

                        # Create an anomaly file with details about the anomaly
                        panaroma_anomaly_file = '%s/%s.%s.txt' % (
                            settings.PANORAMA_CHECK_PATH, added_at,
                            base_name)
                        try:
                            write_data_to_file(
                                skyline_app, panaroma_anomaly_file, 'w',
                                panaroma_anomaly_data)
                            logger.info('added panorama anomaly file :: %s' % (panaroma_anomaly_file))
                        except:
                            logger.error('error :: failed to add panorama anomaly file :: %s' % (panaroma_anomaly_file))
                            logger.info(traceback.format_exc())

                    # If crucible is enabled - save timeseries and create a
                    # crucible check
                    if settings.ENABLE_CRUCIBLE and settings.BOUNDARY_CRUCIBLE_ENABLED:
                        crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp
                        if not os.path.exists(crucible_anomaly_dir):
                            if python_version == 2:
                                mode_arg = int('0755')
                            if python_version == 3:
                                mode_arg = 0o755
                            os.makedirs(crucible_anomaly_dir, mode_arg)

                        # Note:
                        # Because only one algorithm triggers here, the
                        # algorithm related arrays are in a different format
                        # to their output format in analyzer

                        # Note:
                        # The value is intentionally enclosed in single quotes
                        # because the imp.load_source used in crucible shifts
                        # the decimal position when it is double quoted, e.g.
                        # value = "5622.0" gets imported as
                        # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                        # single quoting results in the desired
                        # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0

                        crucible_anomaly_data = 'metric = \'%s\'\n' \
                                                'value = \'%s\'\n' \
                                                'from_timestamp = \'%s\'\n' \
                                                'metric_timestamp = \'%s\'\n' \
                                                'algorithms = %s\n' \
                                                'triggered_algorithms = %s\n' \
                                                'anomaly_dir = \'%s\'\n' \
                                                'graphite_metric = True\n' \
                                                'run_crucible_tests = False\n' \
                                                'added_by = \'%s\'\n' \
                                                'added_at = \'%s\'\n' \
                            % (base_name, str(datapoint), from_timestamp,
                               metric_timestamp, str(algorithm),
                               triggered_algorithms, crucible_anomaly_dir,
                               skyline_app, metric_timestamp)

                        # Create an anomaly file with details about the anomaly
                        crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir, base_name)
                        with open(crucible_anomaly_file, 'w') as fh:
                            fh.write(crucible_anomaly_data)
                        if python_version == 2:
                            mode_arg = int('0644')
                        if python_version == 3:
                            mode_arg = 0o644
                        os.chmod(crucible_anomaly_file, mode_arg)
                        logger.info('added crucible anomaly file :: %s/%s.txt' % (crucible_anomaly_dir, base_name))

                        # Create timeseries json file with the timeseries
                        json_file = '%s/%s.json' % (crucible_anomaly_dir, base_name)
                        timeseries_json = str(timeseries).replace('[', '(').replace(']', ')')
                        with open(json_file, 'w') as fh:
                            # timeseries
                            fh.write(timeseries_json)
                        if python_version == 2:
                            mode_arg = int('0644')
                        if python_version == 3:
                            mode_arg = 0o644
                        os.chmod(json_file, mode_arg)
                        logger.info('added crucible timeseries file :: %s/%s.json' % (crucible_anomaly_dir, base_name))

                        # Create a crucible check file
                        crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH, metric_timestamp, base_name)
                        with open(crucible_check_file, 'w') as fh:
                            fh.write(crucible_anomaly_data)
                        if python_version == 2:
                            mode_arg = int('0644')
                        if python_version == 3:
                            mode_arg = 0o644
                        os.chmod(crucible_check_file, mode_arg)
                        logger.info('added crucible check :: %s,%s' % (base_name, metric_timestamp))

            # It could have been deleted by the Roomba
            except TypeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except:
                exceptions['Other'] += 1
                logger.info("exceptions['Other'] traceback follows:")
                logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.exceptions_q.put((key, value))
Example #14
0
def on_demand_motif_analysis(metric, timestamp, similarity, batch_size,
                             top_matches, max_distance, range_padding,
                             max_area_percent_diff):
    """
    Process a motif similarity search on demand
    """
    import numpy as np
    import mass_ts as mts
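    # mass_ts provides MASS (Mueen's Algorithm for Similarity Search) which,
    # together with numpy, is used for the motif similarity search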

    logger = logging.getLogger(skyline_app_logger)
    dev_null = None
    function_str = 'on_demand_motif_analysis'
    logger.info(
        '%s :: with parameters :: metric: %s, timestamp: %s, similarity: %s, batch_size: %s, top_matches: %s, max_distance: %s, range_padding: %s, max_area_percent_diff: %s'
        % (function_str, str(metric), str(timestamp), str(similarity),
           str(batch_size), str(top_matches), str(max_distance),
           str(range_padding), str(max_area_percent_diff)))
    trace = 'none'
    fail_msg = 'none'

    start = time.time()
    start_timer = timer()
    metric_vars_dict = {}
    metric_id = 0
    fp_ids = []
    timeseries = []
    not_similar_enough_sample = 0
    not_similar_motifs = 0
    similar_motifs = 0
    exact_motifs = 0
    distance_motifs = 0
    motifs_found = []
    find_exact_matches_run = False
    exact_matches_found = []
    fps_timeseries = {}
    # A motif_analysis dict to add to and return
    motif_analysis = {}
    motif_analysis[metric] = {}
    motif_analysis[metric]['timestamp'] = int(timestamp)
    motif_analysis[metric]['started'] = start
    motif_analysis[metric]['motifs'] = {}
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric][
        'not_similar_enough_sample'] = not_similar_enough_sample
    # @added 20210417 - Feature #4014: Ionosphere - inference
    # Allow the user to define the batch_size per similarity search
    motif_analysis[metric]['batch_size'] = int(batch_size)
    motif_analysis[metric]['top_matches'] = int(top_matches)
    motif_analysis[metric]['max_distance'] = float(max_distance)
    # @added 20210425 - Feature #4014: Ionosphere - inference
    # Added max_area_percent_diff for computing the area under the curve
    motif_analysis[metric]['max_area_percent_diff'] = float(
        max_area_percent_diff)

    fps_checked_for_motifs = []

    metric_dir = metric.replace('.', '/')
    metric_timeseries_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                          str(timestamp), metric_dir)

    # @added 20210418 - Feature #4014: Ionosphere - inference
    # Allow for the similarity search on saved_training_data
    if 'saved_training_data' in request.args:
        saved_training_data_str = request.args.get('saved_training_data',
                                                   'false')
        if saved_training_data_str == 'true':
            saved_metric_timeseries_dir = '%s_saved/%s/%s' % (
                settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir)
            if path.exists(saved_metric_timeseries_dir):
                metric_timeseries_dir = saved_metric_timeseries_dir
                logger.info('%s :: using saved training_data dir - %s' %
                            (function_str, saved_metric_timeseries_dir))

    metric_vars_file = '%s/%s.txt' % (metric_timeseries_dir, metric)
    timeseries_json = '%s/%s.json' % (metric_timeseries_dir, metric)
    full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60)
    full_duration_timeseries_json = '%s/%s.mirage.redis.%sh.json' % (
        metric_timeseries_dir, metric, str(full_duration_in_hours))
    try:
        metric_vars_dict = mirage_load_metric_vars(skyline_app,
                                                   metric_vars_file, True)
    except Exception as e:
        logger.error(
            'error :: inference :: failed to load metric variables from check file - %s - %s'
            % (metric_vars_file, e))
    if not metric_vars_dict:
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric][
            'reason'] = 'could not load training data variables'
        return motif_analysis

    full_duration = metric_vars_dict['metric_vars']['full_duration']

    # Determine the metric details from the database
    metric_id = 0
    metric_db_object = {}
    try:
        metric_db_object = get_metrics_db_object(metric)
    except Exception as e:
        logger.error('error :: %s :: failed to get_metrics_db_object - %s' %
                     (function_str, e))
    try:
        metric_id = int(metric_db_object['id'])
    except Exception as e:
        logger.error(
            'error :: %s :: failed to determine metric_id from metric_db_object %s - %s'
            % (function_str, str(metric_db_object), e))
        metric_id = 0
    if not metric_id:
        logger.error(
            'error :: %s :: failed to get metric id for %s from the database' %
            (function_str, str(metric)))
        fail_msg = 'failed to get metric id'
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric]['reason'] = 'could not determine metric id'
        return motif_analysis, fail_msg, trace

    # @modified 20210419 - Feature #4014: Ionosphere - inference
    # Create a unique dir for each batch_size max_distance
    # motif_images_dir = '%s/motifs' % metric_timeseries_dir
    motif_images_dir = '%s/motifs/batch_size.%s/top_matches.%s/max_distance.%s' % (
        metric_timeseries_dir, str(batch_size), str(top_matches),
        str(max_distance))

    if not path.exists(motif_images_dir):
        # provision motifs image resources
        mkdir_p(motif_images_dir)

    full_durations = [full_duration]
    if path.isfile(full_duration_timeseries_json):
        full_durations = [full_duration, settings.FULL_DURATION]
    logger.info('%s :: full_durations - %s' %
                (function_str, str(full_durations)))

    # Loop through analysis per full_duration
    for full_duration in full_durations:
        start_full_duration = timer()
        fp_ids = []
        try:
            query = 'SELECT id,last_matched from ionosphere WHERE metric_id=%s AND full_duration=%s AND enabled=1 ORDER BY last_matched DESC' % (
                str(metric_id), str(full_duration))
            results = mysql_select(skyline_app, query)
            for row in results:
                fp_ids.append(int(row[0]))
        except Exception as e:
            logger.error(
                'error :: %s :: failed to get fp ids via mysql_select from %s - %s'
                % (function_str, metric, e))

        logger.info('%s :: metric_id: %s, full_duration: %s, fp_ids: %s' %
                    (function_str,
                     (metric_id), str(full_duration), str(fp_ids)))

        if not fp_ids:
            continue

        # Now there are known fps, load the timeseries
        if full_duration == settings.FULL_DURATION:
            timeseries_json_file = full_duration_timeseries_json
        else:
            timeseries_json_file = timeseries_json
        try:
            with open((timeseries_json_file), 'r') as f:
                raw_timeseries = f.read()
            timeseries_array_str = str(raw_timeseries).replace('(',
                                                               '[').replace(
                                                                   ')', ']')
            del raw_timeseries
            timeseries = literal_eval(timeseries_array_str)
            del timeseries_array_str
        except Exception as e:
            logger.error(
                'error :: %s :: failed to load timeseries for %s from %s - %s'
                % (function_str, metric, timeseries_json_file, e))
            continue

        anomalous_timeseries_subsequence = []
        for timestamp_float, value in timeseries[-int(batch_size):]:
            anomalous_timeseries_subsequence.append(
                [int(timestamp_float), value])

        logger.info(
            '%s :: looking for motif in trained fps of full_duration: %s' %
            (function_str, (full_duration)))
        dataset = [float(item[1]) for item in anomalous_timeseries_subsequence]

        max_y = max(dataset)
        min_y = min(dataset)

        # full_y_range = max_y - min_y

        # range_padding_percent = range_padding
        # This was just a test that did not have the desired results
        # if full_y_range < 10:
        #     range_padding_percent = 35
        # if full_y_range < 5:
        #     range_padding_percent = 75
        # if full_y_range < 2:
        #    range_padding_percent = 100

        use_range_padding = ((max_y - min_y) / 100) * range_padding
        if min_y > 0 and (min_y - use_range_padding) > 0:
            min_y_padded = min_y - use_range_padding
        else:
            min_y_padded = min_y
        max_y_padded = max_y + use_range_padding
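        # min_y_padded and max_y_padded define a y-range padded by
        # range_padding percent of the full range on each side, which the
        # all_in_range check below tests candidate motif values against.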
        if min_y_padded == max_y_padded:
            min_y_padded = min_y_padded - (
                (min_y_padded / 100) * range_padding)
            max_y_padded = max_y_padded + (
                (max_y_padded / 100) * range_padding)

        # anomalous_ts = np.array(dataset)
        anomalous_ts = dataset

        mass2_batch_times = []
        exact_match_times = []

        nan = np.array([np.nan])
        nanj = complex(0.0, float('nan'))
        empty_dists = np.array(nan + nanj)
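        # nan, nanj and empty_dists are sentinel values - mts.mass2_batch
        # returns complex distances and an identical subsequence results in a
        # nan+nanj distance, so best_dists is later compared against
        # empty_dists to detect runs that surfaced no usable motif.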

        # plotted = False
        count = 0

        # fp_ids = [fp_id for index, fp_id in enumerate(fp_ids) if index == 0]

        # motifs_found = []
        # exact_matches_found = []
        # fps_timeseries = {}

        for fp_id in fp_ids:
            if (time.time() - start) >= 20:
                break
            # Attempt to surface the fp timeseries from memcache and/or db
            # @modified 20210424 - Feature #4014: Ionosphere - inference
            #                      Task #4030: refactoring
            fp_timeseries = None
            try:
                fp_timeseries = get_fp_timeseries(skyline_app, metric_id,
                                                  fp_id)
            except Exception as e:
                logger.error(
                    'inference :: did not get fp timeseries with get_fp_timeseries(%s, %s, %s) - %s'
                    % (skyline_app, str(metric_id), str(fp_id), e))
            if not fp_timeseries:
                continue

            relate_dataset = [float(item[1]) for item in fp_timeseries]

            fps_timeseries[fp_id] = fp_timeseries

            current_best_indices = []
            current_best_dists = []
            best_indices = None
            best_dists = None

            try:
                logger.info(
                    '%s :: running mts.mass2_batch fp_id: %s, full_duration: %s, batch_size: %s, top_matches: %s, max_distance: %s, motif_size: %s'
                    % (function_str, str(fp_id), str(full_duration),
                       str(batch_size), str(top_matches), str(max_distance),
                       str(len(anomalous_ts))))

                # @added 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than possible kth that can be found
                # mts.mass2_batch error: kth(=50) out of bounds (16)
                use_top_matches = int(top_matches)
                if (len(fp_timeseries) / int(batch_size)) <= int(top_matches):
                    use_top_matches = round(
                        len(fp_timeseries) / int(batch_size)) - 1
                    if use_top_matches == 2:
                        use_top_matches = 1
                    logger.info(
                        '%s :: adjusting top_matches to %s (the maximum possible top - 1) as kth(=%s) would be out of bounds for mts.mass2_batch'
                        %
                        (function_str, str(use_top_matches), str(top_matches)))

                start_mass2_batch = timer()
                # @modified 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than possible kth that can be found
                # best_indices, best_dists = mts.mass2_batch(relate_dataset, anomalous_ts, batch_size=int(batch_size), top_matches=int(top_matches))
                best_indices, best_dists = mts.mass2_batch(
                    relate_dataset,
                    anomalous_ts,
                    batch_size=int(batch_size),
                    top_matches=int(use_top_matches))
                end_mass2_batch = timer()
                mass2_batch_times.append((end_mass2_batch - start_mass2_batch))
                current_best_indices = best_indices.tolist()
                current_best_dists = best_dists.tolist()
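                # The returned best_indices / best_dists (converted to lists)
                # hold, per match, the start index of the matched subsequence
                # in the fp timeseries and its (complex) distance, which is
                # evaluated against max_distance below.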

                # @added 20210412 - Feature #4014: Ionosphere - inference
                #                   Branch #3590: inference
                # Add fp_id to fps_checked_for_motifs to enable ionosphere to update the
                # motif related columns in the ionosphere database table
                fps_checked_for_motifs.append(fp_id)
            except Exception as e:
                logger.error('error :: %s :: %s mts.mass2_batch error: %s' %
                             (function_str, (fp_id), str(e)))
                continue

            try:
                if str(list(best_dists)) == str(list(empty_dists)):
                    logger.info(
                        '%s :: mts.mass2_batch no similar motif from fp id %s - best_dists: %s'
                        % (function_str, (fp_id), str(list(best_dists))))
                    continue
            except Exception as e:
                dev_null = e

            if not current_best_indices[0]:
                continue
            # if list(best_indices)[0] != anomalous_index:
            #     continue
            # If the best_dists is > 1 they are not very similar
            # if list(best_dists)[0].real > 1.0:
            #     continue
            # if list(best_indices)[0] and best_dists:
            for index, best_dist in enumerate(current_best_dists):
                try:
                    motif_added = False
                    """
                    Note: mass_ts finds similar motifs NOT the same motif, the same motif
                    will result in the best_dists being a nan+nanj
                    So it is DIYed
                    """
                    try:
                        # @modified 20210414 - Feature #4014: Ionosphere - inference
                        #                      Branch #3590: inference
                        # Store the not anomalous motifs
                        # motif = [fp_id, current_best_indices[index], best_dist.real]
                        motif = [
                            fp_id, current_best_indices[index], best_dist.real,
                            anomalous_timeseries_subsequence, full_duration
                        ]
                    except Exception as e:
                        dev_null = e
                        motif = []

                    # if list(best_indices)[0] and best_dists:
                    # If it is greater than 1.0 it is not similar
                    # if best_dist.real > 1.0:
                    # if best_dist.real > IONOSPHERE_INFERENCE_MASS_TS_MAX_DISTANCE:
                    if best_dist.real > float(max_distance):
                        continue
                    else:
                        if motif:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                    if not motif_added:
                        if best_dist == nanj:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                    if not motif_added:
                        if str(best_dist) == 'nan+nanj':
                            count += 1
                            motifs_found.append([
                                fp_id, current_best_indices[index], 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motif_added = True
                    if not motif_added:
                        if best_dist == empty_dists:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine if fp id %s timeseries at index %s was a match - %s'
                        % (function_str, str(fp_id),
                           str(current_best_indices[index]), e))
                    continue

            # FIND EXACT MATCHES
            # Seeing as I cannot reproduce finding nan+nanj which represents an
            # exact match with mts.mass2_batch, do it DIY style - iterate the
            # timeseries and create a batch_size subsequence for every index and
            # compare the values to the anomalous_ts for an exact match.
            # This takes ~0.024850 seconds on a timeseries with 10079 datapoints.
            # @modified 20210418 - Feature #4014: Ionosphere - inference
            # However finding exact matches can add ~2.5 seconds on a 90 minute
            # batch_size and with a proportionally scaled max_distance of say 15,
            # finding an exact match in a longer sequence is less important -
            # the greater the batch_size, the greater the likely variability
            # and the lower the chance of an exact match.  So save the 2.5 seconds.
            # UNLESS
            # At a 5 (to 10) batch_size and a max_distance of 1.0 an exact match
            # can be found.  Exact matches are quite frequent and, with such
            # little variability, similar matches may not be found.
            # Therefore find_exact_matches has its place.
            # A CAVEAT here is that boring metrics that change and have a low
            # variability, even at a larger batch_size, could also benefit and
            # possibly achieve better accuracy from the use of
            # find_exact_matches as they can be shapelets that resemble a
            # batch_size 5 shapelet.
            # It would perhaps be possible to use one or more of the features
            # profile tsfresh values to identify these types of shapelets, if
            # you knew which feature/s were most descriptive of this type of
            # shapelet, e.g. 'value__skewness': 3.079477685394873, etc (maybe).
            # However I predict that this method will perform worse on these
            # types of shapelets.
            # find_exact_matches = False
            # Exact matches can be found at batch sizes of 500 where similar
            # matches are not, so always run it.
            find_exact_matches = True
            find_exact_matches_run = True

            if int(batch_size) < 10:
                find_exact_matches = True
                find_exact_matches_run = True

            if find_exact_matches:
                try:
                    start_exact_match = timer()
                    indexed_relate_dataset = []
                    for index, item in enumerate(relate_dataset):
                        indexed_relate_dataset.append([index, item])
                    last_index = indexed_relate_dataset[-1][0]
                    current_index = 0
                    while current_index < last_index:
                        subsequence = [
                            value for index, value in
                            indexed_relate_dataset[current_index:(
                                current_index + int(batch_size))]
                        ]
                        if subsequence == anomalous_ts:
                            exact_matches_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motifs_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                        current_index += 1
                    end_exact_match = timer()
                    exact_match_times.append(
                        (end_exact_match - start_exact_match))
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine if any exact matches could be found in fp id %s timeseries - %s'
                        % (function_str, str(fp_id), e))

        logger.info(
            '%s :: mts.mass2_batch runs on %s fps of full_duration %s in %.6f seconds'
            % (function_str, str(len(mass2_batch_times)), str(full_duration),
               sum(mass2_batch_times)))
        if find_exact_matches_run:
            logger.info(
                '%s :: exact_match runs on %s fps of full_duration %s in %.6f seconds'
                % (function_str, str(len(exact_match_times)),
                   str(full_duration), sum(exact_match_times)))
        end_full_duration = timer()
        logger.info(
            '%s :: analysed %s fps of full_duration %s in %.6f seconds' %
            (function_str, str(len(fp_ids)), str(full_duration),
             (end_full_duration - start_full_duration)))

        # Patterns are sorted
        sorted_motifs = []
        motifs_found_in_fps = []
        if motifs_found:
            sorted_motifs = sorted(motifs_found, key=lambda x: x[2])
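            # Motifs are sorted by distance (element 2 of each
            # [fp_id, index, distance, subsequence, full_duration] item) so
            # that the closest matches are evaluated first.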
            for item in sorted_motifs:
                motifs_found_in_fps.append(item[0])
        logger.info('%s :: %s motifs found' %
                    (function_str, str(len(sorted_motifs))))

        for motif in sorted_motifs:
            if (time.time() - start) >= 25:
                break
            try:
                add_match = False
                all_in_range = False

                fp_id = motif[0]
                best_index = motif[1]
                best_dist = motif[2]

                # @added 20210414 - Feature #4014: Ionosphere - inference
                #                   Branch #3590: inference
                # Store the not anomalous motifs
                motif_sequence = motif[3]

                motif_full_duration = motif[4]

                match_type = 'not_similar_enough'

                if motif in exact_matches_found:
                    add_match = True
                    match_type = 'exact'
                    all_in_range = True
                    exact_motifs += 1
                full_relate_timeseries = fps_timeseries[fp_id]
                # full_relate_dataset = [float(item[1]) for item in full_relate_timeseries]
                relate_timeseries = [
                    item for index, item in enumerate(full_relate_timeseries)
                    if index >= best_index and index < (best_index +
                                                        int(batch_size))
                ]
                relate_dataset = [item[1] for item in relate_timeseries]
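                # relate_dataset is the batch_size long window of the fp
                # timeseries starting at best_index - the candidate motif that
                # is checked against the padded y-range and, further on, the
                # area difference.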

                if not add_match:
                    all_in_range = True
                    for value in relate_dataset:
                        if value < min_y_padded:
                            all_in_range = False
                            break
                        if value > max_y_padded:
                            all_in_range = False
                            break
                    if all_in_range:
                        related_max_y = max(relate_dataset)
                        if related_max_y < (max_y - range_padding):
                            all_in_range = False
                        if related_max_y > (max_y + range_padding):
                            all_in_range = False
                        related_min_y = min(relate_dataset)
                        if related_min_y < (min_y - range_padding):
                            all_in_range = False
                        if related_min_y > (min_y + range_padding):
                            all_in_range = False
                    if all_in_range:
                        logger.info(
                            '%s :: ALL IN RANGE - all_in_range: %s, motif: %s'
                            % (function_str, str(all_in_range),
                               str(relate_dataset[0:2])))
                        add_match = True
                        match_type = 'all_in_range'
                        similar_motifs += 1

                    # @added 20210425 - Feature #4014: Ionosphere - inference
                    # Compute the area using the composite trapezoidal rule.
                    motif_area = None
                    fp_motif_area = None
                    percent_different = None
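                    # The area under the anomalous motif and under the fp
                    # motif are each integrated with np.trapz (unit spacing)
                    # and their percentage difference is compared against
                    # max_area_percent_diff.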
                    try:
                        batch_size_dataset = [
                            float(item[1]) for item in motif_sequence
                        ]
                        y_motif = np.array(batch_size_dataset)
                        motif_area = np.trapz(y_motif, dx=1)
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get motif_area with np.trapz - %s'
                            % (function_str, e))
                    try:
                        y_fp_motif = np.array(relate_dataset)
                        fp_motif_area = np.trapz(y_fp_motif, dx=1)
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get fp_motif_area with np.trapz - %s'
                            % (function_str, e))
                    # Determine the percentage difference (as a
                    # positive value) of the areas under the
                    # curves.
                    if motif_area and fp_motif_area:
                        percent_different = get_percent_different(
                            fp_motif_area, motif_area, True)
                        if percent_different > max_area_percent_diff:
                            if add_match:
                                logger.info(
                                    '%s :: AREA TOO DIFFERENT - not adding all_in_range match'
                                    % (function_str))
                                add_match = False
                            # BUT ...
                            if best_dist < 3 and not add_match:
                                logger.info(
                                    '%s :: DISTANCE VERY SIMILAR - adding match even though area_percent_diff is greater than max_area_percent_diff because best_dist: %s'
                                    % (function_str, str(best_dist)))
                                add_match = True
                                match_type = 'distance'
                                distance_motifs += 1

                if similarity == 'all':
                    if not add_match:
                        not_similar_motifs += 1
                        if not_similar_enough_sample >= 10:
                            continue
                        not_similar_enough_sample += 1
                        add_match = True
                        match_type = 'not_similar_enough'

                if add_match:
                    generation = 0
                    fp_id_row = None
                    try:
                        fp_id_row = get_ionosphere_fp_db_row(
                            skyline_app, int(fp_id))
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get_ionosphere_fp_db_row for fp_id %s - %s'
                            % (function_str, str(fp_id), e))
                    if fp_id_row:
                        try:
                            generation = fp_id_row['generation']
                        except Exception as e:
                            logger.error(
                                'error :: %s :: failed to get generation from fp_id_row for fp_id %s - %s'
                                % (function_str, str(fp_id), e))
                    if generation == 0:
                        generation_str = 'trained'
                    else:
                        generation_str = 'LEARNT'
                    motif_match_types = motif_match_types_dict()
                    type_id = motif_match_types[match_type]

                    motif_id = '%s-%s' % (str(fp_id), str(best_index))
                    motif_analysis[metric]['motifs'][motif_id] = {}
                    motif_analysis[metric]['motifs'][motif_id][
                        'metric_id'] = metric_id
                    motif_analysis[metric]['motifs'][motif_id]['fp_id'] = fp_id
                    motif_analysis[metric]['motifs'][motif_id][
                        'generation'] = generation
                    motif_analysis[metric]['motifs'][motif_id][
                        'index'] = best_index
                    motif_analysis[metric]['motifs'][motif_id][
                        'distance'] = best_dist
                    motif_analysis[metric]['motifs'][motif_id]['size'] = int(
                        batch_size)
                    motif_analysis[metric]['motifs'][motif_id][
                        'max_distance'] = float(max_distance)
                    motif_analysis[metric]['motifs'][motif_id][
                        'timestamp'] = timestamp
                    motif_analysis[metric]['motifs'][motif_id][
                        'type_id'] = type_id
                    motif_analysis[metric]['motifs'][motif_id][
                        'type'] = match_type
                    motif_analysis[metric]['motifs'][motif_id][
                        'full_duration'] = motif_full_duration
                    # @added 20210414 - Feature #4014: Ionosphere - inference
                    #                   Branch #3590: inference
                    # Store the not anomalous motifs
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_timeseries'] = anomalous_timeseries_subsequence
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_sequence'] = motif_sequence
                    not_anomalous_timestamp = int(
                        anomalous_timeseries_subsequence[-1][0])
                    graph_period_seconds = not_anomalous_timestamp - int(
                        anomalous_timeseries_subsequence[0][0])
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_period_seconds'] = graph_period_seconds
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_period_minutes'] = round(graph_period_seconds /
                                                        60)

                    motif_analysis[metric]['motifs'][motif_id]['image'] = None

                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_area'] = motif_area
                    motif_analysis[metric]['motifs'][motif_id][
                        'fp_motif_area'] = fp_motif_area
                    motif_analysis[metric]['motifs'][motif_id][
                        'area_percent_diff'] = percent_different
                    motif_analysis[metric]['motifs'][motif_id][
                        'max_area_percent_diff'] = max_area_percent_diff

                    if (time.time() - start) >= 25:
                        continue

                    graph_image_file = '%s/motif.%s.%s.%s.with_max_distance.%s.png' % (
                        motif_images_dir, motif_id, match_type,
                        str(batch_size), str(max_distance))
                    plotted_image = False
                    on_demand_motif_analysis = True
                    if not path.isfile(graph_image_file):
                        plotted_image, plotted_image_file = plot_motif_match(
                            skyline_app, metric, timestamp, fp_id,
                            full_duration, generation_str, motif_id,
                            best_index, int(batch_size), best_dist, type_id,
                            relate_dataset, anomalous_timeseries_subsequence,
                            graph_image_file, on_demand_motif_analysis)
                    else:
                        plotted_image = True
                        logger.info('%s :: plot already exists - %s' %
                                    (function_str, str(graph_image_file)))
                    if plotted_image:
                        motif_analysis[metric]['motifs'][motif_id][
                            'image'] = graph_image_file
                    else:
                        logger.error('failed to plot motif match plot')
                        graph_image_file = None
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: inference :: with fp id %s processing motif at index: %s - %s'
                    % (str(fp_id), str(motif[1]), str(e)))
                continue
    end_timer = timer()
    motif_analysis[metric]['fps_checked'] = fps_checked_for_motifs
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['distance_motifs'] = distance_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric][
        'not_similar_enough_sample'] = not_similar_enough_sample

    motif_analysis_file = '%s/motif.analysis.similarity_%s.batch_size_%s.top_matches_%s.max_distance_%s.dict' % (
        motif_images_dir, similarity, str(batch_size), str(top_matches),
        str(max_distance))
    try:
        write_data_to_file(skyline_app, motif_analysis_file, 'w',
                           str(motif_analysis))
    except Exception as e:
        trace = traceback.format_exc()
        logger.error('%s' % trace)
        fail_msg = '%s :: error :: failed to write motif_analysis_file - %s' % (
            function_str, motif_analysis_file)
        logger.error('%s' % fail_msg)
        dev_null = e

    motif_ids = list(motif_analysis[metric]['motifs'].keys())
    logger.info(
        '%s :: %s motif matches found, %s fps were checked and motifs plotted in %.6f seconds for %s'
        % (function_str, str(len(motif_ids)), str(len(fps_checked_for_motifs)),
           (end_timer - start_timer), metric))
    if dev_null:
        del dev_null
    return motif_analysis, fail_msg, trace
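

# A hypothetical invocation for illustration only - the metric name and the
# parameter values below are assumptions, not part of the original source.
# The function would typically be called from a webapp request handler with
# request-derived parameters, e.g.:
#
#   motif_analysis, fail_msg, trace = on_demand_motif_analysis(
#       'telegraf.host-1.cpu.usage_user',  # metric (hypothetical name)
#       1618000000,                        # timestamp of the training data
#       'all',                             # similarity
#       90,                                # batch_size
#       10,                                # top_matches
#       10.0,                              # max_distance
#       10,                                # range_padding (percent)
#       20)                                # max_area_percent_diff (percent)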
Example #15
0
    def spin_process(self, i, run_timestamp):
        """
        Assign a metric for a process to analyze.
        """

        # Discover metric to analyze
        metric_var_files = [
            f for f in listdir(settings.MIRAGE_CHECK_PATH)
            if isfile(join(settings.MIRAGE_CHECK_PATH, f))
        ]

        # Check if this process is unnecessary
        if len(metric_var_files) == 0:
            return

        metric_var_files_sorted = sorted(metric_var_files)
        metric_check_file = '%s/%s' % (settings.MIRAGE_CHECK_PATH,
                                       str(metric_var_files_sorted[0]))

        # Load metric variables
        # @modified 20160822 - Bug #1460: panorama check file fails
        # Changed to panorama style skyline_functions load_metric_vars
        # self.load_metric_vars(metric_check_file)
        # Load and validate metric variables
        try:
            metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
        except:
            logger.info(traceback.format_exc())
            logger.error(
                'error :: failed to load metric variables from check file - %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return

        # Test metric variables
        if len(metric_vars.metric) == 0:
            return
        else:
            metric = metric_vars.metric
            metric_name = ['metric_name', metric_vars.metric]
            self.metric_variables.append(metric_name)
        if len(metric_vars.value) == 0:
            return
        else:
            metric_value = ['metric_value', metric_vars.value]
            self.metric_variables.append(metric_value)
        if len(metric_vars.hours_to_resolve) == 0:
            return
        else:
            hours_to_resolve = [
                'hours_to_resolve', metric_vars.hours_to_resolve
            ]
            self.metric_variables.append(hours_to_resolve)
        if len(metric_vars.metric_timestamp) == 0:
            return
        else:
            metric_timestamp = [
                'metric_timestamp', metric_vars.metric_timestamp
            ]
            self.metric_variables.append(metric_timestamp)

        # Ignore any metric check with a timestamp greater than 10 minutes ago
        int_metric_timestamp = int(metric_vars.metric_timestamp)
        int_run_timestamp = int(run_timestamp)
        metric_timestamp_age = int_run_timestamp - int_metric_timestamp
        if metric_timestamp_age > settings.MIRAGE_STALE_SECONDS:
            logger.info(
                'stale check :: %s check request is %s seconds old - discarding'
                % (metric_vars.metric, metric_timestamp_age))
            # Remove metric check file
            # try:
            #     os.remove(metric_check_file)
            # except OSError:
            #     pass
            # return
            if os.path.exists(metric_check_file):
                os.remove(metric_check_file)
                logger.info('removed %s' % (metric_check_file))
            else:
                logger.info('could not remove %s' % (metric_check_file))

        # Calculate hours second order resolution to seconds
        second_order_resolution_seconds = int(
            metric_vars.hours_to_resolve) * 3600
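        # This is the period, in seconds, that Mirage surfaces from Graphite
        # and analyses the metric at.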

        # Calculate graphite from and until parameters from the metric timestamp
        graphite_until = datetime.datetime.fromtimestamp(
            int(metric_vars.metric_timestamp)).strftime('%H:%M_%Y%m%d')
        int_second_order_resolution_seconds = int(
            second_order_resolution_seconds)
        second_resolution_timestamp = int_metric_timestamp - int_second_order_resolution_seconds
        graphite_from = datetime.datetime.fromtimestamp(
            int(second_resolution_timestamp)).strftime('%H:%M_%Y%m%d')
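        # graphite_from and graphite_until cover the full second order
        # resolution window ending at the metric timestamp, in the
        # HH:MM_YYYYMMDD absolute time format accepted by the Graphite render
        # API.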

        # Remove any old json file related to the metric
        metric_json_file = '%s/%s/%s.json' % (settings.MIRAGE_DATA_FOLDER,
                                              str(metric_vars.metric),
                                              str(metric_vars.metric))
        try:
            os.remove(metric_json_file)
        except OSError:
            pass

        # Get data from graphite
        logger.info(
            'retrieve data :: surfacing %s timeseries from graphite for %s seconds'
            % (metric_vars.metric, second_order_resolution_seconds))
        self.surface_graphite_metric_data(metric_vars.metric, graphite_from,
                                          graphite_until)

        # Check there is a json timeseries file to test
        if not os.path.isfile(metric_json_file):
            logger.error(
                'error :: retrieve failed - failed to surface %s timeseries from graphite'
                % (metric_vars.metric))
            # Remove metric check file
            try:
                os.remove(metric_check_file)
            except OSError:
                pass
            return
        else:
            logger.info('retrieved data :: for %s at %s seconds' %
                        (metric_vars.metric, second_order_resolution_seconds))

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        self.check_if_parent_is_alive()

        with open((metric_json_file), 'r') as f:
            timeseries = json.loads(f.read())
            logger.info('data points surfaced :: %s' % (len(timeseries)))

        try:
            logger.info('analyzing :: %s at %s seconds' %
                        (metric_vars.metric, second_order_resolution_seconds))
            anomalous, ensemble, datapoint = run_selected_algorithm(
                timeseries, metric_vars.metric,
                second_order_resolution_seconds)

            # If it's anomalous, add it to list
            if anomalous:
                base_name = metric.replace(settings.FULL_NAMESPACE, '', 1)
                anomalous_metric = [datapoint, base_name]
                self.anomalous_metrics.append(anomalous_metric)
                logger.info('anomaly detected  :: %s with %s' %
                            (metric_vars.metric, metric_vars.value))
                # It runs so fast, this allows us to process 30 anomalies/min
                sleep(2)

                # Get the anomaly breakdown - who returned True?
                triggered_algorithms = []
                for index, value in enumerate(ensemble):
                    if value:
                        algorithm = settings.MIRAGE_ALGORITHMS[index]
                        anomaly_breakdown[algorithm] += 1
                        triggered_algorithms.append(algorithm)

                # If Crucible or Panorama are enabled determine details
                determine_anomaly_details = False
                if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED:
                    determine_anomaly_details = True
                if settings.PANORAMA_ENABLED:
                    determine_anomaly_details = True

                if determine_anomaly_details:
                    metric_timestamp = str(int(timeseries[-1][0]))
                    from_timestamp = str(int(timeseries[1][0]))
                    timeseries_dir = base_name.replace('.', '/')

                # If Panorama is enabled - create a Panorama check
                if settings.PANORAMA_ENABLED:
                    if not os.path.exists(settings.PANORAMA_CHECK_PATH):
                        if python_version == 2:
                            mode_arg = int('0755', 8)
                        if python_version == 3:
                            mode_arg = 0o755
                        os.makedirs(settings.PANORAMA_CHECK_PATH, mode_arg)

                    # Note:
                    # The values are enclosed in single quotes intentionally
                    # as the imp.load_source used results in a shift in the
                    # decimal position when double quoted, e.g.
                    # value = "5622.0" gets imported as
                    # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                    # single quoting results in the desired,
                    # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                    added_at = str(int(time()))
                    source = 'graphite'
                    panaroma_anomaly_data = 'metric = \'%s\'\n' \
                                            'value = \'%s\'\n' \
                                            'from_timestamp = \'%s\'\n' \
                                            'metric_timestamp = \'%s\'\n' \
                                            'algorithms = %s\n' \
                                            'triggered_algorithms = %s\n' \
                                            'app = \'%s\'\n' \
                                            'source = \'%s\'\n' \
                                            'added_by = \'%s\'\n' \
                                            'added_at = \'%s\'\n' \
                        % (base_name, str(datapoint), from_timestamp,
                           metric_timestamp, str(settings.MIRAGE_ALGORITHMS),
                           triggered_algorithms, skyline_app, source,
                           this_host, added_at)

                    # Create an anomaly file with details about the anomaly
                    panaroma_anomaly_file = '%s/%s.%s.txt' % (
                        settings.PANORAMA_CHECK_PATH, added_at, base_name)
                    try:
                        write_data_to_file(skyline_app, panaroma_anomaly_file,
                                           'w', panaroma_anomaly_data)
                        logger.info('added panorama anomaly file :: %s' %
                                    (panaroma_anomaly_file))
                    except:
                        logger.error(
                            'error :: failed to add panorama anomaly file :: %s'
                            % (panaroma_anomaly_file))
                        logger.info(traceback.format_exc())

                # If crucible is enabled - save timeseries and create a
                # crucible check
                if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED:
                    metric_timestamp = str(int(timeseries[-1][0]))
                    from_timestamp = str(int(timeseries[1][0]))
                    timeseries_dir = base_name.replace('.', '/')
                    crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp
                    if not os.path.exists(crucible_anomaly_dir):
                        if python_version == 2:
                            mode_arg = int('0755', 8)
                        if python_version == 3:
                            mode_arg = 0o755
                        os.makedirs(crucible_anomaly_dir, mode_arg)

                    # Note:
                    # The value is enclosed in single quotes intentionally
                    # as the imp.load_source used in crucible results in a
                    # shift in the decimal position when double quoted, e.g.
                    # value = "5622.0" gets imported as
                    # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                    # single quoting results in the desired,
                    # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0

                    crucible_anomaly_data = 'metric = \'%s\'\n' \
                                            'value = \'%s\'\n' \
                                            'from_timestamp = \'%s\'\n' \
                                            'metric_timestamp = \'%s\'\n' \
                                            'algorithms = %s\n' \
                                            'triggered_algorithms = %s\n' \
                                            'anomaly_dir = \'%s\'\n' \
                                            'graphite_metric = True\n' \
                                            'run_crucible_tests = False\n' \
                                            'added_by = \'%s\'\n' \
                                            'added_at = \'%s\'\n' \
                        % (base_name, str(datapoint), from_timestamp,
                           metric_timestamp, str(settings.MIRAGE_ALGORITHMS),
                           triggered_algorithms, crucible_anomaly_dir,
                           skyline_app, metric_timestamp)

                    # Create an anomaly file with details about the anomaly
                    crucible_anomaly_file = '%s/%s.txt' % (
                        crucible_anomaly_dir, base_name)
                    try:
                        write_data_to_file(skyline_app, crucible_anomaly_file,
                                           'w', crucible_anomaly_data)
                        logger.info('added crucible anomaly file :: %s' %
                                    (crucible_anomaly_file))
                    except:
                        logger.error(
                            'error :: failed to add crucible anomaly file :: %s'
                            % (crucible_anomaly_file))
                        logger.info(traceback.format_exc())

                    # Create timeseries json file with the timeseries
                    json_file = '%s/%s.json' % (crucible_anomaly_dir,
                                                base_name)
                    timeseries_json = str(timeseries).replace('[',
                                                              '(').replace(
                                                                  ']', ')')
                    try:
                        write_data_to_file(skyline_app, json_file, 'w',
                                           timeseries_json)
                        logger.info('added crucible timeseries file :: %s' %
                                    (json_file))
                    except:
                        logger.error(
                            'error :: failed to add crucible timeseries file :: %s'
                            % (json_file))
                        logger.info(traceback.format_exc())

                    # Create a crucible check file
                    crucible_check_file = '%s/%s.%s.txt' % (
                        settings.CRUCIBLE_CHECK_PATH, metric_timestamp,
                        base_name)
                    try:
                        write_data_to_file(skyline_app, crucible_check_file,
                                           'w', crucible_anomaly_data)
                        logger.info('added crucible check :: %s,%s' %
                                    (base_name, metric_timestamp))
                    except:
                        logger.error(
                            'error :: failed to add crucible check file :: %s'
                            % (crucible_check_file))
                        logger.info(traceback.format_exc())
            else:
                base_name = metric.replace(settings.FULL_NAMESPACE, '', 1)
                not_anomalous_metric = [datapoint, base_name]
                self.not_anomalous_metrics.append(not_anomalous_metric)
                logger.info('not anomalous     :: %s with %s' %
                            (metric_vars.metric, metric_vars.value))

        # It could have been deleted by the Roomba
        except TypeError:
            exceptions['DeletedByRoomba'] += 1
            logger.info('exceptions        :: DeletedByRoomba')
        except TooShort:
            exceptions['TooShort'] += 1
            logger.info('exceptions        :: TooShort')
        except Stale:
            exceptions['Stale'] += 1
            logger.info('exceptions        :: Stale')
        except Boring:
            exceptions['Boring'] += 1
            logger.info('exceptions        :: Boring')
        except:
            exceptions['Other'] += 1
            logger.info('exceptions        :: Other')
            logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.mirage_anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.mirage_exceptions_q.put((key, value))

        metric_var_files = []
        timeseries = []

        # Remove metric check file
        try:
            os.remove(metric_check_file)
        except OSError:
            pass