Example #1
def thunder_send_event(current_skyline_app, event, log=True):
    """
    Add an event to the thunder.events Redis set or the thunder check dir if
    Redis is not available.

    :param current_skyline_app: the app calling the function
    :param event: the event data
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type event: dict
    :type log: boolean
    :return: submitted
    :rtype: boolean

    """

    function_str = 'functions.thunder.thunder_send_event'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    submitted = 0
    try:
        redis_conn = get_redis_conn(current_skyline_app)
        submitted = redis_conn.sadd('thunder.events', str(event))
        if submitted:
            return True
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to add %s to thunder.events Redis set - %s'
            % (function_str, str(event), e))

    # If the thunder event was not added to the Redis set, create the event_file
    if not path.exists(THUNDER_EVENTS_DIR):
        mkdir_p(THUNDER_EVENTS_DIR)
        if current_logger:
            current_logger.info('created dir - %s' % THUNDER_EVENTS_DIR)
    event_file = '%s/%s.thunder.event.dict' % (THUNDER_EVENTS_DIR, str(time()))
    try:
        write_data_to_file(current_skyline_app, event_file, 'w', str(event))
        if current_logger:
            current_logger.info('added thunder event file - %s' % event_file)
        submitted = True
    except Exception as e:
        if not current_logger:
            current_logger = logging.getLogger(current_skyline_app + 'Log')
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: failed to add thunder event file - %s - %s' %
            (event_file, e))
        submitted = False

    return submitted
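

# A minimal, hypothetical usage sketch (not part of the original function): the
# event structure and the 'analyzer' app name below are illustrative
# assumptions only, not taken from the source.
def example_thunder_send_event_usage():
    event = {
        'level': 'alert',
        'event_type': 'worker_error',
        'app': 'analyzer',
        'data': {'status': 'test event'},
    }
    # Returns True if the event was added to the thunder.events Redis set or
    # written to a thunder event file, False otherwise
    return thunder_send_event('analyzer', event, log=True)
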
Example #2
def adtk_level_shift(current_skyline_app, parent_pid, timeseries, algorithm_parameters):
    """
    A timeseries is anomalous if a level shift occurs within a 5 window period,
    bounded by a factor of 9 of the normal range based on the historical
    interquartile range.

    :param current_skyline_app: the Skyline app executing the algorithm.  This
        will be passed to the algorithm by Skyline.  This is **required** for
        error handling and logging.  You do not have to worry about handling the
        argument in the scope of the custom algorithm itself,  but the algorithm
        must accept it as the first argument.
    :param parent_pid: the parent pid which is executing the algorithm, this is
        **required** for error handling and logging.  You do not have to worry
        about handling this argument in the scope of algorithm, but the
        algorithm must accept it as the second argument.
    :param timeseries: the time series as a list e.g. ``[[1578916800.0, 29.0],
        [1578920400.0, 55.0], ... [1580353200.0, 55.0]]``
    :param algorithm_parameters: a dictionary of any required parameters for the
        custom_algorithm and algorithm itself.  For the adtk_level_shift custom
        algorithm the following parameters are required, example:
        ``algorithm_parameters={
            'c': 9.0,
            'run_every': 5,
            'side': 'both',
            'window': 5
        }``
    :type current_skyline_app: str
    :type parent_pid: int
    :type timeseries: list
    :type algorithm_parameters: dict
    :return: True, False or None
    :rtype: boolean

    Performance is of paramount importance in Skyline, especially in terms of
    computational complexity, along with execution time and CPU usage. The
    adtk LevelShiftAD algorithm is not O(n) and it is not fast either, not when
    compared to the normal three-sigma triggered algorithms.  However it is
    useful if you care about detecting all level shifts.  The normal three-sigma
    triggered algorithms do not always detect a level shift, especially if the
    level shift does not breach the three-sigma limits.  Therefore you may find
    over time that you encounter alerts that contain level shifts that you
    thought should have been detected.  On these types of metrics and events,
    the adtk LevelShiftAD algorithm can be implemented to detect and alert on
    these.  It is not recommended to run it on all your metrics as it would
    immediately triple the analyzer runtime, even if it is only run every 5
    windows/minutes.

    Due to the computational complexity and long run time of the adtk
    LevelShiftAD algorithm on the size of timeseries data used by Skyline, if
    you consider the following timings of all three-sigma triggered algorithms
    and compare them to the adtk_level_shift results in the last 2 rows of the
    below log, it is clear that running adtk_level_shift on all metrics is
    probably not desirable; even though it is possible to do, it is very noisy.

    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - histogram_bins run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - histogram_bins has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - histogram_bins - total: 1.051136 - median: 0.001430
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - first_hour_average run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - first_hour_average has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - first_hour_average - total: 1.322432 - median: 0.001835
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - stddev_from_average run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - stddev_from_average has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - stddev_from_average - total: 1.097290 - median: 0.001641
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - grubbs run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - grubbs has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - grubbs - total: 1.742929 - median: 0.002438
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - ks_test run 147 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - ks_test has 147 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - ks_test - total: 0.127648 - median: 0.000529
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - mean_subtraction_cumulation run 40 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - mean_subtraction_cumulation has 40 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - mean_subtraction_cumulation - total: 0.152515 - median: 0.003152
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - median_absolute_deviation run 35 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - median_absolute_deviation has 35 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - median_absolute_deviation - total: 0.143770 - median: 0.003248
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - stddev_from_moving_average run 30 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - stddev_from_moving_average has 30 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - stddev_from_moving_average - total: 0.125173 - median: 0.003092
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - least_squares run 16 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - least_squares has 16 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - least_squares - total: 0.089108 - median: 0.005538
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - abs_stddev_from_median run 1 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - abs_stddev_from_median has 1 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - abs_stddev_from_median - total: 0.036797 - median: 0.036797
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - adtk_level_shift run 271 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - adtk_level_shift has 271 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - adtk_level_shift - total: 13.729565 - median: 0.035791
    ...
    ...
    2021-03-06 10:46:39 :: 1582754 :: seconds to run     :: 27.93  # THE TOTAL ANALYZER RUNTIME

    Therefore the analysis methodology implemented for the adtk_level_shift
    custom_algorithm is as follows:

    - When new metrics are added either to the configuration or by actual new
    metrics coming online that match the ``algorithm_parameters['namespace']``,
    Skyline implements sharding on new metrics into time slots to prevent a
    thundering herd situation from developing.  A newly added metric will
    eventually be assigned into a time shard and the last analysed timestamp
    will be added to the ``analyzer.last.adtk_level_shift`` Redis hash
    key to determine the next scheduled run with
    ``algorithm_parameters['namespace']``

    - A ``run_every`` parameter is implemented so that the algorithm can be
    configured to run on a metric once every ``run_every`` minutes.  The default
    is to run it every 5 minutes using window 5 (rolling) and trigger as
    anomalous if the algorithm labels any of the last 5 datapoints as anomalous.
    This means that there could be up to a 5 minute delay on an alert on the
    60 second, 168 SECOND_ORDER_RESOLUTION_HOURS metrics in the example, but a
    ``c=9.0`` level shift would be detected and would be alerted on (if both
    analyzer and mirage triggered on it).  This periodic running of the
    algorithm is a tradeoff so that the adtk_level_shift load and runtime can be
    spread over ``run_every`` minutes.

    - The algorithm is not run against metrics that are sparsely populated.
    When the algorithm is run on sparsely populated metrics it results in lots
    of false positives and noise.

    The Skyline CUSTOM_ALGORITHMS implementation of the adtk LevelShiftAD
    algorithm is configured as the example shown below.  However please note
    that the algorithm_parameters shown in this example configuration are
    suitable for metrics that have a 60 second resolution and have a
    :mod:`settings.ALERTS` Mirage SECOND_ORDER_RESOLUTION_HOURS of 168 (7 days).
    Metrics with a different resolution/frequency may require different values
    appropriate to the metric resolution.

    Example CUSTOM_ALGORITHMS configuration:

    'adtk_level_shift': {
        'namespaces': [
            'skyline.analyzer.run_time', 'skyline.analyzer.total_metrics',
            'skyline.analyzer.exceptions'
        ],
        'algorithm_source': '/opt/skyline/github/skyline/skyline/custom_algorithms/adtk_level_shift.py',
        'algorithm_parameters': {'c': 9.0, 'run_every': 5, 'side': 'both', 'window': 5},
        'max_execution_time': 0.5,
        'consensus': 1,
        'algorithms_allowed_in_consensus': ['adtk_level_shift'],
        'run_3sigma_algorithms': True,
        'run_before_3sigma': True,
        'run_only_if_consensus': False,
        'use_with': ["analyzer", "mirage"],
        'debug_logging': False,
    },
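
    An illustrative direct invocation for testing (the app name, pid and
    base_name value below are assumptions, not taken from the source):

    anomalous, anomalyScore = adtk_level_shift(
        'analyzer', getpid(), timeseries,
        {'base_name': 'test.metric', 'c': 9.0, 'run_every': 5,
         'side': 'both', 'window': 5})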

    """

    # You MUST define the algorithm_name
    algorithm_name = 'adtk_level_shift'

    # Define the default state of None for both anomalous and anomalyScore.
    # anomalous does not default to False as that is not correct, False is only
    # correct if the algorithm determines the data point is not anomalous.  The
    # same is true for the anomalyScore.
    anomalous = None
    anomalyScore = None

    # @added 20210308 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    return_anomalies = False
    anomalies = []
    realtime_analysis = True

    current_logger = None

    # If you want to log, you can, but this should only be done during
    # testing and development
    def get_log(current_skyline_app):
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        return current_logger

    start = timer()

    # Use the algorithm_parameters to determine the sample_period
    debug_logging = None
    try:
        debug_logging = algorithm_parameters['debug_logging']
    except:
        debug_logging = False
    if debug_logging:
        try:
            current_logger = get_log(current_skyline_app)
            current_logger.debug('debug :: %s :: debug_logging enabled with algorithm_parameters - %s' % (
                algorithm_name, str(algorithm_parameters)))
        except:
            # This except pattern MUST be used in ALL custom algorithms to
            # facilitate the traceback from any errors.  We want the algorithm
            # to run super fast and without spamming the log with lots of
            # errors.  But we do not want the function returning and not
            # reporting anything to the log, so the pythonic except is used to
            # "sample" any algorithm errors to a tmp file and report once per
            # run rather than spewing tons of errors into the log e.g. analyzer.log
            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
            # Return False and None as the algorithm could not determine True or False
            return (False, None)

    # Allow the LevelShiftAD window parameter to be passed in the
    # algorithm_parameters
    window = 5
    try:
        window = algorithm_parameters['window']
    except:
        pass

    # Allow the LevelShiftAD c parameter to be passed in the
    # algorithm_parameters
    c = 9.0
    try:
        c = algorithm_parameters['c']
    except:
        pass

    run_every = window
    try:
        run_every = algorithm_parameters['run_every']
    except:
        pass

    side = 'both'
    try:
        side = algorithm_parameters['side']
    except:
        pass

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' % (
            str(algorithm_parameters)))

    # @added 20210308 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    try:
        return_anomalies = algorithm_parameters['return_anomalies']
    except:
        return_anomalies = False
    try:
        realtime_analysis = algorithm_parameters['realtime_analysis']
    except:
        realtime_analysis = True

    # @added 20210316 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    save_plots_to = False
    try:
        save_plots_to = algorithm_parameters['save_plots_to']
    except:
        pass

    # @added 20210323 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    save_plots_to_absolute_dir = False
    try:
        save_plots_to_absolute_dir = algorithm_parameters['save_plots_to_absolute_dir']
    except:
        pass
    filename_prefix = False
    try:
        filename_prefix = algorithm_parameters['filename_prefix']
    except:
        pass

    # @added 20210318 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    run_PersistAD = False
    try:
        run_PersistAD = algorithm_parameters['run_PersistAD']
    except:
        pass

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' % (
            str(algorithm_parameters)))

    try:
        base_name = algorithm_parameters['base_name']
    except:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm
        # to run super fast and without spamming the log with lots of
        # errors.  But we do not want the function returning and not
        # reporting anything to the log, so the pythonic except is used to
        # "sample" any algorithm errors to a tmp file and report once per
        # run rather than spewing tons of errors into the log e.g. analyzer.log
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
        # Return False and None as the algorithm could not determine True or False
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)
    if debug_logging:
        current_logger.debug('debug :: %s :: base_name - %s' % (
            algorithm_name, str(base_name)))

    # Due to the load and runtime of LevelShiftAD it is only run in analyzer
    # periodically
    if current_skyline_app == 'analyzer':
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
        last_hash_key = 'analyzer.last.%s' % algorithm_name
        last_check = None
        try:
            raw_last_check = redis_conn_decoded.hget(last_hash_key, base_name)
            last_check = int(raw_last_check)
        except:
            last_check = None
        last_window_timestamps = [int(item[0]) for item in timeseries[-run_every:]]
        if last_check in last_window_timestamps:
            if debug_logging:
                current_logger.debug('debug :: %s :: run_every period is not over yet, skipping base_name - %s' % (
                    algorithm_name, str(base_name)))
            if return_anomalies:
                return (False, None, anomalies)
            else:
                return (False, None)

        # If there is no last timestamp, shard the metric, it will eventually
        # be added.
        if not last_check:
            now = datetime.datetime.now()
            now_seconds = int(now.second)
            if now_seconds == 0:
                now_seconds = 1
            period_seconds = int(60 / run_every)
            last_shard = 60
            shard = int(period_seconds)
            shards = [shard]
            while shard < last_shard:
                shard = shard + period_seconds
                shards.append(shard)
            shard_value = round(now_seconds / shards[0]) * shards[0]
            if shard_value <= shards[0]:
                shard_value = shards[0]
            metric_as_bytes = str(base_name).encode()
            value = zlib.adler32(metric_as_bytes)
            shard_index = [(index + 1) for index, s_value in enumerate(shards) if s_value == shard_value][0]
            modulo_result = value % shard_index
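            # Illustrative worked example (values assumed, not from the source):
            # with run_every=5, period_seconds is 12 and shards is
            # [12, 24, 36, 48, 60]. A check at second 40 rounds to a
            # shard_value of 36, the 3rd shard, so shard_index is 3 and in the
            # check below the metric is treated as not sharded into this run
            # when adler32(base_name) % 3 == 0.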
            if modulo_result == 0:
                if debug_logging:
                    current_logger.debug('debug :: %s :: skipping as not sharded into this run - %s' % (
                        algorithm_name, str(base_name)))
                if return_anomalies:
                    return (False, None, anomalies)
                else:
                    return (False, None)
        if debug_logging:
            current_logger.debug('debug :: %s :: analysing %s' % (
                algorithm_name, str(base_name)))

        try:
            int_metric_timestamp = int(timeseries[-1][0])
        except:
            int_metric_timestamp = 0
        if int_metric_timestamp:
            try:
                redis_conn_decoded.hset(
                    last_hash_key, base_name,
                    int_metric_timestamp)
            except:
                pass

    # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except
    try:
        start_preprocessing = timer()

        # INFO: Sorting time series of 10079 data points took 0.002215 seconds
        timeseries = sorted(timeseries, key=lambda x: x[0])
        if debug_logging:
            current_logger.debug('debug :: %s :: time series of length - %s' % (
                algorithm_name, str(len(timeseries))))

        # Testing the data to ensure it meets minimum requirements, in the case
        # of Skyline's use of the LevelShiftAD algorithm this means that:
        # - the time series must have at least 75% of its full_duration
        # - the time series must have at least 90% of the expected data points
        #   in the sample being analysed.
        do_not_use_sparse_data = False
        if current_skyline_app == 'analyzer':
            do_not_use_sparse_data = True

        # @added 20210305 - Feature #3970: custom_algorithm - adtk_level_shift
        #                   Task #3664: POC with adtk
        # With mirage also do not run LevelShiftAD on sparsely populated data
        if current_skyline_app == 'mirage':
            do_not_use_sparse_data = True

        # @added 20210309 - Feature #3978: luminosity - classify_metrics
        #                   Feature #3642: Anomaly type classification
        if current_skyline_app == 'luminosity':
            do_not_use_sparse_data = True

        if do_not_use_sparse_data:

            total_period = 0
            total_datapoints = 0
            try:
                start_timestamp = int(timeseries[0][0])
                end_timestamp = int(timeseries[-1][0])
                total_period = end_timestamp - start_timestamp
                total_datapoints = len(timeseries)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                        algorithm_name, e))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error('error :: debug_logging :: %s :: failed to determine total_period and total_datapoints' % (
                        algorithm_name))
                timeseries = []
            if not timeseries:
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

            if current_skyline_app == 'analyzer':
                # Default the required period for analyzer to 18 hours
                period_required = int(FULL_DURATION * 0.75)
            else:
                # Determine from timeseries
                if total_period < FULL_DURATION:
                    period_required = int(FULL_DURATION * 0.75)
                else:
                    period_required = int(total_period * 0.75)

            # If the time series does not have 75% of its full_duration it does not
            # have sufficient data to sample
            try:
                if total_period < period_required:
                    if debug_logging:
                        current_logger.debug('debug :: %s :: time series does not have sufficient data' % (
                            algorithm_name))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                        algorithm_name, e))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error('error :: debug_logging :: %s :: failed to determine if time series has sufficient data' % (
                        algorithm_name))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

            # If the time series does not have 75% of its full_duration data points
            # it does not have sufficient data to sample

            # Determine resolution from the last 30 data points
            # INFO took 0.002060 seconds
            resolution_timestamps = []
            metric_resolution = False
            for metric_datapoint in timeseries[-30:]:
                timestamp = int(metric_datapoint[0])
                resolution_timestamps.append(timestamp)
            timestamp_resolutions = []
            if resolution_timestamps:
                last_timestamp = None
                for timestamp in resolution_timestamps:
                    if last_timestamp:
                        resolution = timestamp - last_timestamp
                        timestamp_resolutions.append(resolution)
                        last_timestamp = timestamp
                    else:
                        last_timestamp = timestamp
                try:
                    del resolution_timestamps
                except:
                    pass
            if timestamp_resolutions:
                try:
                    timestamp_resolutions_count = Counter(timestamp_resolutions)
                    ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common()
                    metric_resolution = int(ordered_timestamp_resolutions_count[0][0])
                except SystemExit as e:
                    if debug_logging:
                        current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                            algorithm_name, e))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)
                except:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error('error :: debug_logging :: %s :: failed to determine the metric resolution of the time series' % (
                            algorithm_name))
                try:
                    del timestamp_resolutions
                except:
                    pass
            minimum_datapoints = None
            if metric_resolution:
                minimum_datapoints = int(period_required / metric_resolution)
            if minimum_datapoints:
                if total_datapoints < minimum_datapoints:
                    if debug_logging:
                        current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_datapoints required is %s and time series has %s' % (
                            algorithm_name, str(minimum_datapoints),
                            str(total_datapoints)))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)

            # Is the time series fully populated?
            # full_duration_datapoints = int(full_duration / metric_resolution)
            total_period_datapoints = int(total_period / metric_resolution)
            # minimum_percentage_sparsity = 95
            minimum_percentage_sparsity = 90
            sparsity = int(total_datapoints / (total_period_datapoints / 100))
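            # Illustrative worked example (values assumed): a 7 day period at a
            # 60 second resolution expects 10080 data points, so 9500 present
            # gives sparsity = int(9500 / (10080 / 100)) = 94, which passes the
            # 90 minimum_percentage_sparsity check below.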
            if sparsity < minimum_percentage_sparsity:
                if debug_logging:
                    current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_percentage_sparsity required is %s and time series has %s' % (
                        algorithm_name, str(minimum_percentage_sparsity),
                        str(sparsity)))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            if len(set(item[1] for item in timeseries)) == 1:
                if debug_logging:
                    current_logger.debug('debug :: %s :: time series does not have sufficient variability, all the values are the same' % algorithm_name)
                anomalous = False
                anomalyScore = 0.0
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

        end_preprocessing = timer()
        preprocessing_runtime = end_preprocessing - start_preprocessing
        if debug_logging:
            current_logger.debug('debug :: %s :: preprocessing took %.6f seconds' % (
                algorithm_name, preprocessing_runtime))

        if not timeseries:
            if debug_logging:
                current_logger.debug('debug :: %s :: LevelShiftAD not run as no data' % (
                    algorithm_name))
            anomalies = []
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)
        else:
            if debug_logging:
                current_logger.debug('debug :: %s :: timeseries length: %s' % (
                    algorithm_name, str(len(timeseries))))

        if len(timeseries) < 100:
            if debug_logging:
                current_logger.debug('debug :: %s :: time series does not have sufficient data' % (
                    algorithm_name))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)

        start_analysis = timer()
        try:
            df = pd.DataFrame(timeseries, columns=['date', 'value'])
            df['date'] = pd.to_datetime(df['date'], unit='s')
            datetime_index = pd.DatetimeIndex(df['date'].values)
            df = df.set_index(datetime_index)
            df.drop('date', axis=1, inplace=True)
            s = validate_series(df)
            level_shift_ad = LevelShiftAD(c=c, side=side, window=window)
            anomaly_df = level_shift_ad.fit_detect(s)
            anomalies = anomaly_df.loc[anomaly_df['value'] > 0]
            anomalous = False
            if len(anomalies) > 0:
                anomaly_timestamps = list(anomalies.index.astype(np.int64) // 10**9)
                if realtime_analysis:
                    last_window_timestamps = [int(item[0]) for item in timeseries[-window:]]
                    # if timeseries[-1][0] in anomaly_timestamps:
                    for timestamp in last_window_timestamps:
                        if timestamp in anomaly_timestamps:
                            anomalous = True
                            break
                else:
                    anomalous = True
                    # Convert anomalies dataframe to anomalies_list
                    anomalies_list = []

                    # @added 20210316 - Feature #3978: luminosity - classify_metrics
                    #                   Feature #3642: Anomaly type classification
                    # Convert anomalies dataframe to anomalies_dict
                    anomalies_dict = {}
                    anomalies_dict['metric'] = base_name
                    anomalies_dict['timestamp'] = int(timeseries[-1][0])
                    anomalies_dict['from_timestamp'] = int(timeseries[0][0])
                    anomalies_dict['algorithm'] = algorithm_name
                    anomalies_dict['anomalies'] = {}

                    for ts, value in timeseries:
                        if int(ts) in anomaly_timestamps:
                            anomalies_list.append([int(ts), value])
                            anomalies_dict['anomalies'][int(ts)] = value
                    anomalies = list(anomalies_list)

                    # @added 20210316 - Feature #3978: luminosity - classify_metrics
                    #                   Feature #3642: Anomaly type classification
                    if save_plots_to:
                        try:
                            from adtk.visualization import plot
                            metric_dir = base_name.replace('.', '/')
                            timestamp_dir = str(int(timeseries[-1][0]))
                            save_path = '%s/%s/%s/%s' % (
                                save_plots_to, algorithm_name, metric_dir,
                                timestamp_dir)
                            if save_plots_to_absolute_dir:
                                save_path = '%s' % save_plots_to
                            anomalies_dict['file_path'] = save_path
                            save_to_file = '%s/%s.%s.png' % (
                                save_path, algorithm_name, base_name)
                            if filename_prefix:
                                save_to_file = '%s/%s.%s.%s.png' % (
                                    save_path, filename_prefix, algorithm_name,
                                    base_name)
                            save_to_path = os_path_dirname(save_to_file)
                            title = '%s\n%s' % (algorithm_name, base_name)
                            if not os_path_exists(save_to_path):
                                try:
                                    mkdir_p(save_to_path)
                                except Exception as e:
                                    current_logger.error('error :: %s :: failed to create dir - %s - %s' % (
                                        algorithm_name, save_to_path, e))
                            if os_path_exists(save_to_path):
                                try:
                                    plot(s, anomaly=anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file)
                                    if debug_logging:
                                        current_logger.debug('debug :: %s :: plot saved to - %s' % (
                                            algorithm_name, save_to_file))
                                except Exception as e:
                                    current_logger.error('error :: %s :: failed to plot - %s - %s' % (
                                        algorithm_name, base_name, e))
                            anomalies_file = '%s/%s.%s.anomalies_list.txt' % (
                                save_path, algorithm_name, base_name)
                            with open(anomalies_file, 'w') as fh:
                                fh.write(str(anomalies_list))
                                # os.chmod(anomalies_file, mode=0o644)
                            data_file = '%s/data.txt' % (save_path)
                            with open(data_file, 'w') as fh:
                                fh.write(str(anomalies_dict))
                        except SystemExit as e:
                            if debug_logging:
                                current_logger.debug('debug_logging :: %s :: SystemExit called during save plot, exiting - %s' % (
                                    algorithm_name, e))
                            if return_anomalies:
                                return (anomalous, anomalyScore, anomalies)
                            else:
                                return (anomalous, anomalyScore)
                        except Exception as e:
                            traceback_msg = traceback.format_exc()
                            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                            if debug_logging:
                                current_logger.error(traceback_msg)
                                current_logger.error('error :: %s :: failed to plot or save anomalies file - %s - %s' % (
                                    algorithm_name, base_name, e))
            else:
                anomalies = []

            # @added 20210318 - Feature #3978: luminosity - classify_metrics
            #                   Feature #3642: Anomaly type classification
            if anomalies and run_PersistAD and not realtime_analysis:
                persist_ad_algorithm_parameters = {}
                try:
                    persist_ad_algorithm_parameters = algorithm_parameters['persist_ad_algorithm_parameters']
                except:
                    pass
                persist_ad_window = 20
                try:
                    persist_ad_window = persist_ad_algorithm_parameters['window']
                except:
                    pass
                persist_ad_c = 9.9
                try:
                    persist_ad_c = persist_ad_algorithm_parameters['c']
                except:
                    pass
                try:
                    from adtk.detector import PersistAD
                    persist_ad = PersistAD(c=persist_ad_c, side='both', window=persist_ad_window)
                    persist_ad_anomaly_df = persist_ad.fit_detect(s)
                    persist_ad_anomalies = persist_ad_anomaly_df.loc[persist_ad_anomaly_df['value'] > 0]
                    if len(persist_ad_anomalies) > 0:
                        current_logger.info('%s :: %s anomalies found with PersistAD on %s' % (
                            algorithm_name, str(len(persist_ad_anomalies)),
                            base_name))
                        persist_ad_anomaly_timestamps = list(persist_ad_anomalies.index.astype(np.int64) // 10**9)
                        # Convert persist_ad_anomalies dataframe to persist_ad_anomalies_list
                        persist_ad_anomalies_list = []
                        persist_ad_anomalies_dict = {}
                        persist_ad_anomalies_dict['metric'] = base_name
                        persist_ad_anomalies_dict['timestamp'] = int(timeseries[-1][0])
                        persist_ad_anomalies_dict['from_timestamp'] = int(timeseries[0][0])
                        persist_ad_anomalies_dict['algorithm'] = 'adtk_PersistAD'
                        persist_ad_anomalies_dict['anomalies'] = {}

                        for ts, value in timeseries:
                            if int(ts) in persist_ad_anomaly_timestamps:
                                persist_ad_anomalies_list.append([int(ts), value])
                                persist_ad_anomalies_dict['anomalies'][int(ts)] = value
                        persist_ad_anomalies = list(persist_ad_anomalies_list)
                        if save_plots_to:
                            try:
                                from adtk.visualization import plot
                                metric_dir = base_name.replace('.', '/')
                                timestamp_dir = str(int(timeseries[-1][0]))
                                save_path = '%s/%s/%s/%s' % (
                                    save_plots_to, algorithm_name, metric_dir,
                                    timestamp_dir)
                                if save_plots_to_absolute_dir:
                                    save_path = '%s' % save_plots_to
                                persist_ad_anomalies_dict['file_path'] = save_path
                                save_to_file = '%s/%s.PersistAD.%s.png' % (
                                    save_path, algorithm_name, base_name)
                                if filename_prefix:
                                    save_to_file = '%s/%s.%s.%s.png' % (
                                        save_path, filename_prefix, algorithm_name,
                                        base_name)
                                save_to_path = os_path_dirname(save_to_file)
                                title = '%s - PersistAD verification\n%s' % (algorithm_name, base_name)
                                if not os_path_exists(save_to_path):
                                    try:
                                        mkdir_p(save_to_path)
                                    except Exception as e:
                                        current_logger.error('error :: %s :: failed to create dir - %s - %s' % (
                                            algorithm_name, save_to_path, e))
                                if os_path_exists(save_to_path):
                                    try:
                                        plot(s, anomaly=persist_ad_anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file)
                                        if debug_logging:
                                            current_logger.debug('debug :: %s :: plot saved to - %s' % (
                                                algorithm_name, save_to_file))
                                    except Exception as e:
                                        current_logger.error('error :: %s :: failed to plot - %s - %s' % (
                                            algorithm_name, base_name, e))
                                anomalies_file = '%s/%s.%s.PersistAD.anomalies_list.txt' % (
                                    save_path, algorithm_name, base_name)
                                with open(anomalies_file, 'w') as fh:
                                    fh.write(str(persist_ad_anomalies))
                                    # os.chmod(anomalies_file, mode=0o644)
                                data_file = '%s/PersistAD.data.txt' % (save_path)
                                with open(data_file, 'w') as fh:
                                    fh.write(str(persist_ad_anomalies_dict))
                            except Exception as e:
                                traceback_msg = traceback.format_exc()
                                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                                if debug_logging:
                                    current_logger.error(traceback_msg)
                                    current_logger.error('error :: %s :: failed to plot or save PersistAD anomalies file - %s - %s' % (
                                        algorithm_name, base_name, e))
                except Exception as e:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error('error :: %s :: failed to analyse with PersistAD - %s - %s' % (
                            algorithm_name, base_name, e))
            try:
                del df
            except:
                pass
        except SystemExit as e:
            if debug_logging:
                current_logger.debug('debug_logging :: %s :: SystemExit called, during analysis, exiting - %s' % (
                    algorithm_name, e))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)
        except:
            traceback_msg = traceback.format_exc()
            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
            if debug_logging:
                current_logger.error(traceback_msg)
                current_logger.error('error :: debug_logging :: %s :: failed to run on ts' % (
                    algorithm_name))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)

        end_analysis = timer()
        analysis_runtime = end_analysis - start_analysis

        if debug_logging:
            current_logger.debug('debug :: %s :: LevelShiftAD took %.6f seconds' % (
                algorithm_name, analysis_runtime))

        if anomalous:
            anomalyScore = 1.0
        else:
            anomalyScore = 0.0

        if debug_logging:
            current_logger.info('%s :: anomalous - %s, anomalyScore - %s' % (
                algorithm_name, str(anomalous), str(anomalyScore)))

        if debug_logging:
            end = timer()
            processing_runtime = end - start
            current_logger.info('%s :: completed analysis in %.6f seconds' % (
                algorithm_name, processing_runtime))
        try:
            del timeseries
        except:
            pass
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        else:
            return (anomalous, anomalyScore)

    except SystemExit as e:
        if debug_logging:
            current_logger.debug('debug_logging :: %s :: SystemExit called (before StopIteration), exiting - %s' % (
                algorithm_name, e))
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        else:
            return (anomalous, anomalyScore)
    except StopIteration:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm
        # to run super fast and without spamming the log with lots of
        # errors.  But we do not want the function returning and not
        # reporting anything to the log, so the pythonic except is used to
        # "sample" any algorithm errors to a tmp file and report once per
        # run rather than spewing tons of errors into the log e.g. analyzer.log
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)
    except:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
        # Return False and None as the algorithm could not determine True or False
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)

    if return_anomalies:
        return (anomalous, anomalyScore, anomalies)
    else:
        return (anomalous, anomalyScore)
Example #3
    def spin_process(self, i, run_timestamp, metric_check_file):
        """
        Assign a metric for a process to analyze.

        :param i: python process id
        :param run_timestamp: the epoch timestamp at which this process was called
        :param metric_check_file: full path to the metric check file

        :return: returns True

        """

        child_process_pid = os.getpid()
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('child_process_pid - %s' % str(child_process_pid))

        self.process_list.append(child_process_pid)

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('processing metric check - %s' % metric_check_file)

        if not os.path.isfile(str(metric_check_file)):
            logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file)))
            return

        check_file_name = os.path.basename(str(metric_check_file))
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_name - %s' % check_file_name)
        check_file_timestamp = check_file_name.split('.', 1)[0]
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_timestamp - %s' % str(check_file_timestamp))
        check_file_metricname_txt = check_file_name.split('.', 1)[1]
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname_txt - %s' % check_file_metricname_txt)
        check_file_metricname = check_file_metricname_txt.replace('.txt', '')
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname - %s' % check_file_metricname)
        check_file_metricname_dir = check_file_metricname.replace('.', '/')
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname_dir - %s' % check_file_metricname_dir)

        metric_failed_check_dir = failed_checks_dir + '/' + check_file_metricname_dir + '/' + check_file_timestamp
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric_failed_check_dir - %s' % metric_failed_check_dir)

        # failed_check_file = failed_checks_dir + '/' + check_file_name
        failed_check_file = metric_failed_check_dir + '/' + check_file_name

        # Load and validate metric variables
        try:
            metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
        except:
            logger.error('error :: failed to import metric variables from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            # TBD - a failed check Panorama update will go here, perhaps txt
            #       files are not the only "queue" that will be used, both, but
            #       Panorama, may be just a part of Skyline Flux, the flux DB
            #       would allow for a very nice, distributed "queue" and a
            #       distributed Skyline workforce...
            #       Any Skyline node could just have one role, e.g. lots of
            #       Skyline nodes running crucible only and instead of reading
            #       the local filesystem for input, they could read the Flux DB
            #       queue or both...
            return

        # Test metric variables
        # We use a pythonic methodology to test if the variables are defined.
        # This ensures that if any of the variables are not set for some reason
        # we can handle unexpected data or situations gracefully and try to
        # ensure that the process does not hang.

        # if len(str(metric_vars.metric)) == 0:
        # if not metric_vars.metric:
        try:
            metric_vars.metric
        except:
            logger.error('error :: failed to read metric variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            metric = str(metric_vars.metric)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - metric - %s' % metric)

        # if len(metric_vars.value) == 0:
        # if not metric_vars.value:
        try:
            metric_vars.value
        except:
            logger.error('error :: failed to read value variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            value = str(metric_vars.value)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - value - %s' % (value))

        # if len(metric_vars.from_timestamp) == 0:
        # if not metric_vars.from_timestamp:
        try:
            metric_vars.from_timestamp
        except:
            logger.error('error :: failed to read from_timestamp variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            from_timestamp = str(metric_vars.from_timestamp)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - from_timestamp - %s' % from_timestamp)

        # if len(metric_vars.metric_timestamp) == 0:
        # if not metric_vars.metric_timestamp:
        try:
            metric_vars.metric_timestamp
        except:
            logger.error('error :: failed to read metric_timestamp variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            metric_timestamp = str(metric_vars.metric_timestamp)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - metric_timestamp - %s' % metric_timestamp)

        # if len(metric_vars.algorithms) == 0:
        # if not metric_vars.algorithms:
        try:
            metric_vars.algorithms
        except:
            logger.error('error :: failed to read algorithms variable from check file - %s - setting to all' % (metric_check_file))
            algorithms = ['all']
        else:
            algorithms = []
            for i_algorithm in metric_vars.algorithms:
                algorithms.append(i_algorithm)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - algorithms - %s' % algorithms)

        # if len(metric_vars.anomaly_dir) == 0:
        # if not metric_vars.anomaly_dir:
        try:
            metric_vars.anomaly_dir
        except:
            logger.error('error :: failed to read anomaly_dir variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            anomaly_dir = str(metric_vars.anomaly_dir)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - anomaly_dir - %s' % anomaly_dir)

        # if len(str(metric_vars.graphite_metric)) == 0:
        try:
            metric_vars.graphite_metric
        except:
            logger.info('failed to read graphite_metric variable from check file - setting to False')
            # yes this is a string
            graphite_metric = 'False'
        else:
            graphite_metric = str(metric_vars.graphite_metric)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - graphite_metric - %s' % graphite_metric)

        # if len(str(metric_vars.run_crucible_tests)) == 0:
        try:
            metric_vars.run_crucible_tests
        except:
            logger.info('failed to read run_crucible_tests variable from check file - setting to False')
            # yes this is a string
            run_crucible_tests = 'False'
        else:
            run_crucible_tests = str(metric_vars.run_crucible_tests)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_crucible_tests - %s' % run_crucible_tests)

        try:
            metric_vars.added_by
        except:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('failed to read added_by variable from check file - setting to crucible')
            added_by = 'crucible'
        else:
            added_by = str(metric_vars.added_by)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - added_by - %s' % added_by)

        try:
            metric_vars.run_script
        except:
            run_script = False
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_script - not present set to False')
        else:
            run_script = str(metric_vars.run_script)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_script - %s' % run_script)

        # Only check if the metric does not have an EXPIRATION_TIME key set.
        # Crucible uses the alert EXPIRATION_TIME for the relevant alert setting
        # context, whether that be analyzer, mirage, boundary, etc, and sets its
        # own cache_keys in redis.  This prevents large amounts of data being
        # added in terms of timeseries json and image files, as crucible samples
        # at the same EXPIRATION_TIME as alerts.

        source_app = 'crucible'
        expiration_timeout = 1800
        remove_all_anomaly_files = False
        check_expired = False
        check_time = time()

        if added_by == 'analyzer' or added_by == 'mirage':
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Will check %s ALERTS' % added_by)
            if settings.ENABLE_ALERTS:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('Checking %s ALERTS' % added_by)
                for alert in settings.ALERTS:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        source_app = added_by
                        expiration_timeout = alert[2]
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                        check_age = int(check_time) - int(metric_timestamp)
                        if int(check_age) > int(expiration_timeout):
                            check_expired = True
                            if settings.ENABLE_CRUCIBLE_DEBUG:
                                logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

        if added_by == 'boundary':
            if settings.BOUNDARY_ENABLE_ALERTS:
                for alert in settings.BOUNDARY_METRICS:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        source_app = 'boundary'
                        expiration_timeout = alert[2]
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                        check_age = int(check_time) - int(metric_timestamp)
                        if int(check_age) > int(expiration_timeout):
                            check_expired = True
                            if settings.ENABLE_CRUCIBLE_DEBUG:
                                logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

        cache_key = 'crucible.last_check.%s.%s' % (source_app, metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('cache_key - crucible.last_check.%s.%s' % (source_app, metric))

        # Only use the cache_key EXPIRATION_TIME if this is not a request to
        # run_crucible_tests on a timeseries
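        # The crucible.last_check Redis key acts as a simple deduplication
        # lock - if the key already exists for this metric the anomaly files
        # from this check are removed rather than sampled again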
        if run_crucible_tests == 'False':
            if check_expired:
                logger.info('check_expired - not checking Redis key')
                last_check = True
            else:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('Checking if cache_key exists')
                try:
                    last_check = self.redis_conn.get(cache_key)
                except Exception as e:
                    logger.error('error :: could not query cache_key for %s - %s - %s' % (source_app, metric, e))
                    logger.info('all anomaly files will be removed')
                    remove_all_anomaly_files = True

            if not last_check:
                try:
                    self.redis_conn.setex(cache_key, expiration_timeout, packb(value))
                    logger.info('set cache_key for %s - %s with timeout of %s' % (source_app, metric, str(expiration_timeout)))
                except Exception as e:
                    logger.error('error :: could not query cache_key for %s - %s - %s' % (source_app, metric, e))
                    logger.info('all anomaly files will be removed')
                    remove_all_anomaly_files = True
            else:
                if check_expired:
                    logger.info('check_expired - all anomaly files will be removed')
                    remove_all_anomaly_files = True
                else:
                    logger.info('cache_key is set and not expired for %s - %s - all anomaly files will be removed' % (source_app, metric))
                    remove_all_anomaly_files = True

        # anomaly dir
        if not os.path.exists(str(anomaly_dir)):
            try:
                mkdir_p(skyline_app, str(anomaly_dir))
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('created anomaly dir - %s' % str(anomaly_dir))
            except:
                logger.error('error :: failed to create anomaly_dir - %s' % str(anomaly_dir))

        if not os.path.exists(str(anomaly_dir)):
            logger.error('error :: anomaly_dir does not exist')
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly dir exists - %s' % str(anomaly_dir))

        failed_check_file = anomaly_dir + '/' + metric_timestamp + '.failed.check.' + metric + '.txt'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('failed_check_file - %s' % str(failed_check_file))

        # Retrieve data from graphite if necessary
        anomaly_graph = anomaly_dir + '/' + metric + '.png'
        anomaly_json = anomaly_dir + '/' + metric + '.json'
        anomaly_json_gz = anomaly_json + '.gz'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly_graph - %s' % str(anomaly_graph))
            logger.info('anomaly_json - %s' % str(anomaly_json))
            logger.info('anomaly_json_gz - %s' % str(anomaly_json_gz))

        # Some things added to crucible may not be added by a skyline app per se
        # and if run_crucible_tests is string True then no anomaly files should
        # be removed.
        if run_crucible_tests == 'True':
            remove_all_anomaly_files = False

        # Remove check and anomaly files if the metric has a EXPIRATION_TIME
        # cache_key set
        if remove_all_anomaly_files:
            if os.path.isfile(anomaly_graph):
                try:
                    os.remove(anomaly_graph)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_graph removed - %s' % str(anomaly_graph))
                except OSError:
                    pass
            if os.path.isfile(anomaly_json):
                try:
                    os.remove(anomaly_json)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_json removed - %s' % str(anomaly_json))
                except OSError:
                    pass
            if os.path.isfile(anomaly_json_gz):
                try:
                    os.remove(anomaly_json_gz)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_json_gz removed - %s' % str(anomaly_json_gz))
                except OSError:
                    pass

            anomaly_txt_file = anomaly_dir + '/' + metric + '.txt'
            if os.path.isfile(anomaly_txt_file):
                try:
                    os.remove(anomaly_txt_file)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_txt_file removed - %s' % str(anomaly_txt_file))
                except OSError:
                    pass

            # TBD - this data would have to be added to the Panorama DB before
            #       it is removed
            if os.path.isfile(str(metric_check_file)):
                try:
                    os.remove(str(metric_check_file))
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('metric_check_file removed - %s' % str(metric_check_file))
                except OSError:
                    pass

            if os.path.exists(str(anomaly_dir)):
                try:
                    os.rmdir(str(anomaly_dir))
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_dir removed - %s' % str(anomaly_dir))
                except OSError:
                    pass

            logger.info('check and anomaly files removed')
            return

        # Check if the image exists
        if graphite_metric == 'True':

            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite_metric - %s' % (graphite_metric))

            # Graphite timeouts
            connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('connect_timeout - %s' % str(connect_timeout))

            read_timeout = int(settings.GRAPHITE_READ_TIMEOUT)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('read_timeout - %s' % str(read_timeout))

            graphite_until = datetime.datetime.fromtimestamp(int(metric_timestamp)).strftime('%H:%M_%Y%m%d')
            graphite_from = datetime.datetime.fromtimestamp(int(from_timestamp)).strftime('%H:%M_%Y%m%d')
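            # Graphite's render API accepts from/until values in the
            # HH:MM_YYYYMMDD format produced by the strftime calls above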

            # graphite URL
            if settings.GRAPHITE_PORT != '':
                url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + settings.GRAPHITE_PORT + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
            else:
                url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite url - %s' % (url))

            if not os.path.isfile(anomaly_graph):
                logger.info('retrieving png - surfacing %s graph from graphite from %s to %s' % (metric, graphite_from, graphite_until))

                image_url = url.replace('&format=json', '')
                graphite_image_file = anomaly_dir + '/' + metric + '.png'
                if 'width' not in image_url:
                    image_url += '&width=586'
                if 'height' not in image_url:
                    image_url += '&height=308'
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('graphite image url - %s' % (image_url))
                image_url_timeout = int(connect_timeout)

                image_data = None

                try:
                    image_data = urllib2.urlopen(image_url, timeout=image_url_timeout).read()
                    logger.info('url OK - %s' % (image_url))
                except urllib2.URLError:
                    image_data = None
                    logger.error('error :: url bad - %s' % (image_url))

                if image_data is not None:
                    with open(graphite_image_file, 'wb') as f:
                        f.write(image_data)
                    logger.info('retrieved - %s' % (anomaly_graph))
                    # use the octal permission value so os.chmod gets an int
                    if python_version == 2:
                        mode_arg = int('0644', 8)
                    if python_version == 3:
                        mode_arg = 0o644
                    os.chmod(graphite_image_file, mode_arg)
                else:
                    logger.error('error :: failed to retrieve - %s' % (anomaly_graph))
            else:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_graph file exists - %s' % str(anomaly_graph))

            if not os.path.isfile(anomaly_graph):
                logger.error('error :: retrieve failed to surface %s graph from graphite' % (metric))
            else:
                logger.info('graph image exists - %s' % (anomaly_graph))

            # Check if the json exists
            if not os.path.isfile(anomaly_json_gz):
                if not os.path.isfile(anomaly_json):
                    logger.info('surfacing timeseries data for %s from graphite from %s to %s' % (metric, graphite_from, graphite_until))
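                    # requests 2.4.0 and later accept a (connect, read)
                    # timeout tuple, older versions only take a single value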
                    if requests.__version__ >= '2.4.0':
                        use_timeout = (int(connect_timeout), int(read_timeout))
                    else:
                        use_timeout = int(connect_timeout)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('use_timeout - %s' % (str(use_timeout)))

                    try:
                        r = requests.get(url, timeout=use_timeout)
                        js = r.json()
                        datapoints = js[0]['datapoints']
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('data retrieved OK')
                    except:
                        # graphite_until is a HH:MM_YYYYMMDD string, use the
                        # epoch metric_timestamp for the null placeholder
                        datapoints = [[None, int(metric_timestamp)]]
                        logger.error('error :: data retrieval failed')

                    converted = []
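                    # Graphite returns each datapoint as [value, timestamp],
                    # flip it into the [timestamp, value] order used by
                    # skyline, skipping null values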
                    for datapoint in datapoints:
                        try:
                            new_datapoint = [float(datapoint[1]), float(datapoint[0])]
                            converted.append(new_datapoint)
                        except:
                            continue

                    with open(anomaly_json, 'w') as f:
                        f.write(json.dumps(converted))
                    if python_version == 2:
                        mode_arg = int('0644', 8)
                    if python_version == 3:
                        mode_arg = 0o644
                    os.chmod(anomaly_json, mode_arg)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('json file - %s' % anomaly_json)

                if not os.path.isfile(anomaly_json):
                    logger.error('error :: failed to surface %s json from graphite' % (metric))
                    # Move metric check file
                    try:
                        shutil.move(metric_check_file, failed_check_file)
                        logger.info('moved check file to - %s' % failed_check_file)
                    except OSError:
                        logger.error('error :: failed to move check file to - %s' % failed_check_file)
                        pass
                    return

        # Check timeseries json exists - raw or gz
        if not os.path.isfile(anomaly_json):
            if not os.path.isfile(anomaly_json_gz):
                logger.error('error :: no json data found for %s' % (metric))
                # Move metric check file
                try:
                    shutil.move(metric_check_file, failed_check_file)
                    logger.info('moved check file to - %s' % failed_check_file)
                except OSError:
                    logger.error('error :: failed to move check file to - %s' % failed_check_file)
                    pass
                return
            else:
                logger.info('timeseries json gzip exists - %s' % (anomaly_json_gz))
        else:
            logger.info('timeseries json exists - %s' % (anomaly_json))

        # If timeseries json and run_crucible_tests is str(False) gzip and
        # return here as there is nothing further to do
        if run_crucible_tests == 'False':
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('run_crucible_tests - %s' % run_crucible_tests)
            # gzip the json timeseries data
            if os.path.isfile(anomaly_json):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzipping - %s' % anomaly_json)
                try:
                    f_in = open(anomaly_json)
                    f_out = gzip.open(anomaly_json_gz, 'wb')
                    f_out.writelines(f_in)
                    f_out.close()
                    f_in.close()
                    os.remove(anomaly_json)
                    if python_version == 2:
                        mode_arg = int('0644', 8)
                    if python_version == 3:
                        mode_arg = 0o644
                    os.chmod(anomaly_json_gz, mode_arg)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('gzipped - %s' % anomaly_json_gz)
                    try:
                        os.remove(metric_check_file)
                        logger.info('removed check file - %s' % metric_check_file)
                    except OSError:
                        pass
                    return
                except:
                    logger.error('error :: Failed to gzip data file - %s' % traceback.format_exc())
                    try:
                        os.remove(metric_check_file)
                        logger.info('removed check file - %s' % metric_check_file)
                    except OSError:
                        pass
                    return

            if os.path.isfile(anomaly_json_gz):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzip exists - %s' % anomaly_json_gz)
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' % metric_check_file)
                except OSError:
                    pass
                return
            nothing_to_do = 'true - for debug only'

        # self.check_if_parent_is_alive()
        # Run crucible algorithms
        logger.info('running crucible tests - %s' % (metric))

        timeseries_dir = metric.replace('.', '/')
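        # the dotted metric name maps onto the on-disk directory layout,
        # e.g. stats.cpu.user -> stats/cpu/user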

        if os.path.isfile(anomaly_json_gz):
            if not os.path.isfile(anomaly_json):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('ungzipping - %s' % anomaly_json_gz)
                try:
                    # with gzip.open(anomaly_json_gz, 'rb') as fr:
                    fr = gzip.open(anomaly_json_gz, 'rb')
                    raw_timeseries = fr.read()
                    fr.close()
                except Exception as e:
                    logger.error('error :: could not ungzip %s - %s' % (anomaly_json_gz, e))
                    traceback.print_exc()
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('ungzipped')
                    logger.info('writing to - %s' % anomaly_json)
                with open(anomaly_json, 'w') as fw:
                    fw.write(raw_timeseries)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json done')
                if python_version == 2:
                    mode_arg = int('0644', 8)
                if python_version == 3:
                    mode_arg = 0o644
                os.chmod(anomaly_json, mode_arg)
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('No gzip - %s' % anomaly_json_gz)
            nothing_to_do = 'true - for debug only'

        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_json exists - %s' % anomaly_json)

        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('loading timeseries from - %s' % anomaly_json)
            with open(anomaly_json, 'r') as f:
                timeseries = json.loads(f.read())
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('loaded timeseries from - %s' % anomaly_json)
        else:
            try:
                logger.error('error :: file not found - %s' % anomaly_json)
                shutil.move(metric_check_file, failed_check_file)
                if python_version == 2:
                    mode_arg = int('0644', 8)
                if python_version == 3:
                    mode_arg = 0o644
                os.chmod(failed_check_file, mode_arg)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' % failed_check_file)
                pass
            return

        start_timestamp = int(timeseries[0][0])
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('start_timestamp - %s' % str(start_timestamp))
        end_timestamp = int(timeseries[-1][0])
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('end_timestamp - %s' % str(end_timestamp))

        full_duration = end_timestamp - start_timestamp
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('full_duration - %s' % str(full_duration))

        self.check_if_parent_is_alive()

        run_algorithms_start_timestamp = int(time())
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_algorithms_start_timestamp - %s' % str(run_algorithms_start_timestamp))

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_algorithms - %s,%s,%s,%s,%s,%s' % (metric, str(end_timestamp), str(full_duration), anomaly_json, skyline_app, str(algorithms)))
        # default to not anomalous so the variables are defined even if
        # run_algorithms raises an exception
        anomalous = False
        ensemble = []
        try:
            anomalous, ensemble = run_algorithms(timeseries, str(metric), end_timestamp, full_duration, str(anomaly_json), skyline_app, algorithms)
        except:
            logger.error('error :: run_algorithms failed - %s' % traceback.format_exc())

        run_algorithms_end_timestamp = int(time())
        run_algorithms_seconds = run_algorithms_end_timestamp - run_algorithms_start_timestamp

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomalous, ensemble - %s, %s' % (anomalous, str(ensemble)))

        if anomalous:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomalous - %s' % (anomalous))
            nothing_to_do = 'true - for debug only'

        logger.info('run_algorithms took %s seconds' % str(run_algorithms_seconds))

        # Update anomaly file
        crucible_data = 'crucible_tests_run = "%s"\n' \
                        'crucible_triggered_algorithms = %s\n' \
                        'tested_by = "%s"\n' \
            % (str(run_timestamp), str(ensemble), str(this_host))
        crucible_anomaly_file = '%s/%s.txt' % (anomaly_dir, metric)
        with open(crucible_anomaly_file, 'a') as fh:
            fh.write(crucible_data)
        if python_version == 2:
            mode_arg = int('0644', 8)
        if python_version == 3:
            mode_arg = 0o644
        os.chmod(crucible_anomaly_file, mode_arg)
        logger.info('updated crucible anomaly file - %s/%s.txt' % (anomaly_dir, metric))

        # gzip the json timeseries data after analysis
        if os.path.isfile(anomaly_json):
            if not os.path.isfile(anomaly_json_gz):
                try:
                    f_in = open(anomaly_json)
                    f_out = gzip.open(anomaly_json_gz, 'wb')
                    f_out.writelines(f_in)
                    f_out.close()
                    f_in.close()
                    os.remove(anomaly_json)
                    if python_version == 2:
                        mode_arg = int('0644', 8)
                    if python_version == 3:
                        mode_arg = 0o644
                    os.chmod(anomaly_json_gz, mode_arg)
                    logger.info('gzipped - %s' % (anomaly_json_gz))
                except:
                    logger.error('error :: Failed to gzip data file - %s' % traceback.format_exc())
            else:
                os.remove(anomaly_json)

        if run_script:
            if os.path.isfile(run_script):
                logger.info('running - %s' % (run_script))
                os.system('%s %s' % (str(run_script), str(crucible_anomaly_file)))

        # Remove metric check file
        try:
            os.remove(metric_check_file)
            logger.info('complete - removed check file - %s' % (metric_check_file))
        except OSError:
            pass
Example #4
0
    def run(self):
        """
        Called when the process initializes.
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
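        # wait up to 5 seconds for bin/<skyline_app>.d to release the log
        # lock before this process starts logging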
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error - bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        logger.info('process initialized')

        while 1:
            now = time()
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('process started - %s' % int(now))

            # Make sure check_dir exists and has not been removed
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('checking check dir exists - %s' %
                            settings.CRUCIBLE_CHECK_PATH)
            # os.path.exists does not raise an exception for a missing path,
            # so test its return value rather than relying on an except block
            if not os.path.exists(settings.CRUCIBLE_CHECK_PATH):
                logger.error('error :: check dir did not exist - %s' %
                             settings.CRUCIBLE_CHECK_PATH)
                mkdir_p(settings.CRUCIBLE_CHECK_PATH)
                logger.info('check dir created - %s' %
                            settings.CRUCIBLE_CHECK_PATH)
                # continue

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
                logger.info('connected to redis at socket path %s' %
                            settings.REDIS_SOCKET_PATH)
            except:
                logger.info(
                    'skyline can not connect to redis at socket path %s' %
                    settings.REDIS_SOCKET_PATH)
                sleep(10)
                logger.info('connecting to redis at socket path %s' %
                            settings.REDIS_SOCKET_PATH)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                if settings.REDIS_PASSWORD:
                    self.redis_conn = StrictRedis(
                        password=settings.REDIS_PASSWORD,
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                else:
                    self.redis_conn = StrictRedis(
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue
            """
            Determine if any metric has been added to test
            """
            while True:

                # Report app up
                self.redis_conn.setex(skyline_app, 120, now)

                metric_var_files = [
                    f for f in listdir(settings.CRUCIBLE_CHECK_PATH)
                    if isfile(join(settings.CRUCIBLE_CHECK_PATH, f))
                ]
                #                if len(metric_var_files) == 0:
                if not metric_var_files:
                    logger.info('sleeping 10 no metric check files')
                    sleep(10)

                # Discover metric to analyze
                metric_var_files = ''
                metric_var_files = [
                    f for f in listdir(settings.CRUCIBLE_CHECK_PATH)
                    if isfile(join(settings.CRUCIBLE_CHECK_PATH, f))
                ]
                #                if len(metric_var_files) > 0:
                if metric_var_files:
                    break

            metric_var_files_sorted = sorted(metric_var_files)
            metric_check_file = settings.CRUCIBLE_CHECK_PATH + "/" + str(
                metric_var_files_sorted[0])
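            # check file names start with an epoch timestamp (spin_process
            # splits on the first '.'), so sorting the list processes the
            # oldest check first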

            logger.info('assigning check for processing - %s' %
                        str(metric_var_files_sorted[0]))

            # Reset process_list
            try:
                self.process_list[:] = []
            except:
                logger.error('error :: failed to reset self.process_list')

            # Spawn processes
            pids = []
            spawned_pids = []
            pid_count = 0
            run_timestamp = int(now)
            for i in range(1, settings.CRUCIBLE_PROCESSES + 1):
                p = Process(target=self.spin_process,
                            args=(i, run_timestamp, str(metric_check_file)))
                pids.append(p)
                pid_count += 1
                logger.info('starting %s of %s spin_process/es' %
                            (str(pid_count), str(settings.CRUCIBLE_PROCESSES)))
                p.start()
                spawned_pids.append(p.pid)

            # Send wait signal to zombie processes
            # for p in pids:
            #     p.join()
            # Self monitor processes and terminate if any spin_process has run
            # for longer than CRUCIBLE_TESTS_TIMEOUT
            p_starts = time()
            while time() - p_starts <= settings.CRUCIBLE_TESTS_TIMEOUT:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info(
                        '%s :: %s spin_process/es completed in %.2f seconds' %
                        (skyline_app, str(
                            settings.CRUCIBLE_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info(
                    '%s :: timed out, killing all spin_process processes' %
                    (skyline_app))
                for p in pids:
                    p.terminate()
                    # p.join()

            for p in pids:
                if p.is_alive():
                    logger.info('%s :: stopping spin_process - %s' %
                                (skyline_app, str(p.is_alive())))
                    p.join()

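            # spin_process removes the check file when it completes (or moves
            # it on failure), so wait for it to disappear before assigning the
            # next check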
            while os.path.isfile(metric_check_file):
                sleep(1)
Example #5
0
    def spin_process(self, i, run_timestamp, metric_check_file):
        """
        Assign a metric for a process to analyze.

        :param i: python process id
        :param run_timestamp: the epoch timestamp at which this process was called
        :param metric_check_file: full path to the metric check file

        :return: returns True

        """

        child_process_pid = os.getpid()
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('child_process_pid - %s' % str(child_process_pid))

        self.process_list.append(child_process_pid)

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('processing metric check - %s' % metric_check_file)

        if not os.path.isfile(str(metric_check_file)):
            logger.error('error :: file not found - metric_check_file - %s' %
                         (str(metric_check_file)))
            return

        check_file_name = os.path.basename(str(metric_check_file))
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_name - %s' % check_file_name)
        check_file_timestamp = check_file_name.split('.', 1)[0]
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_timestamp - %s' %
                        str(check_file_timestamp))
        check_file_metricname_txt = check_file_name.split('.', 1)[1]
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname_txt - %s' %
                        check_file_metricname_txt)
        check_file_metricname = check_file_metricname_txt.replace('.txt', '')
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname - %s' % check_file_metricname)
        check_file_metricname_dir = check_file_metricname.replace('.', '/')
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname_dir - %s' %
                        check_file_metricname_dir)

        metric_failed_check_dir = failed_checks_dir + '/' + check_file_metricname_dir + '/' + check_file_timestamp
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric_failed_check_dir - %s' %
                        metric_failed_check_dir)

        # failed_check_file = failed_checks_dir + '/' + check_file_name
        failed_check_file = metric_failed_check_dir + '/' + check_file_name

        # Load and validate metric variables
        try:
            metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
        except:
            logger.error(
                'error :: failed to import metric variables from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            # TBD - a failed check Panorama update will go here, perhaps txt
            #       files are not the only "queue" that will be used, both, but
            #       Panorama, may be just a part of Skyline Flux, the flux DB
            #       would allow for a very nice, distributed "queue" and a
            #       distributed Skyline workforce...
            #       Any Skyline node could just have one role, e.g. lots of
            #       Skyline nodes running crucible only and instead of reading
            #       the local filesystem for input, they could read the Flux DB
            #       queue or both...
            return

        # Test metric variables
        # We use a pythonic methodology to test if the variables are defined,
        # this ensures that if any of the variables are not set for some reason
        # we can handle unexpected data or situations gracefully and try and
        # ensure that the process does not hang.
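        # For illustration only (hypothetical values, not taken from a real
        # check file), a check file defines variables along these lines:
        #   metric = 'stats.server-1.cpu.user'
        #   value = '5.0'
        #   from_timestamp = '1578916800'
        #   metric_timestamp = '1580353200'
        #   algorithms = ['all']
        #   anomaly_dir = '/opt/skyline/crucible/data/stats/server-1/cpu/user'
        #   graphite_metric = True
        #   run_crucible_tests = False
        #   added_by = 'analyzer'
        #   run_script = False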

        # if len(str(metric_vars.metric)) == 0:
        # if not metric_vars.metric:
        try:
            metric_vars.metric
        except:
            logger.error(
                'error :: failed to read metric variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            metric = str(metric_vars.metric)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - metric - %s' % metric)

        # if len(metric_vars.value) == 0:
        # if not metric_vars.value:
        try:
            metric_vars.value
        except:
            logger.error(
                'error :: failed to read value variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            value = str(metric_vars.value)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - value - %s' % (value))

        # if len(metric_vars.from_timestamp) == 0:
        # if not metric_vars.from_timestamp:
        try:
            metric_vars.from_timestamp
        except:
            logger.error(
                'error :: failed to read from_timestamp variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            from_timestamp = str(metric_vars.from_timestamp)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - from_timestamp - %s' %
                            from_timestamp)

        # if len(metric_vars.metric_timestamp) == 0:
        # if not metric_vars.metric_timestamp:
        try:
            metric_vars.metric_timestamp
        except:
            logger.error(
                'error :: failed to read metric_timestamp variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            metric_timestamp = str(metric_vars.metric_timestamp)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - metric_timestamp - %s' %
                            metric_timestamp)

        # if len(metric_vars.algorithms) == 0:
        # if not metric_vars.algorithms:
        algorithms = []
        try:
            metric_vars.algorithms
        except:
            logger.error(
                'error :: failed to read algorithms variable from check file setting to all'
            )
            algorithms = ['all']
#        if not algorithms:
#            algorithms = []
#            for i_algorithm in metric_vars.algorithms:
#                algorithms.append(i_algorithm)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric variable - algorithms - %s' % str(algorithms))

        # if len(metric_vars.anomaly_dir) == 0:
        # if not metric_vars.anomaly_dir:
        try:
            metric_vars.anomaly_dir
        except:
            logger.error(
                'error :: failed to read anomaly_dir variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            anomaly_dir = str(metric_vars.anomaly_dir)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - anomaly_dir - %s' % anomaly_dir)

        # if len(str(metric_vars.graphite_metric)) == 0:
        try:
            metric_vars.graphite_metric
        except:
            logger.info(
                'failed to read graphite_metric variable from check file setting to False'
            )
            # yes this is a string
            graphite_metric = 'False'
        else:
            graphite_metric = str(metric_vars.graphite_metric)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - graphite_metric - %s' %
                            graphite_metric)

        # if len(str(metric_vars.run_crucible_tests)) == 0:
        try:
            metric_vars.run_crucible_tests
        except:
            logger.info(
                'failed to read run_crucible_tests variable from check file setting to False'
            )
            # yes this is a string
            run_crucible_tests = 'False'
        else:
            run_crucible_tests = str(metric_vars.run_crucible_tests)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_crucible_tests - %s' %
                            run_crucible_tests)

        try:
            metric_vars.added_by
        except:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info(
                    'failed to read added_by variable from check file - set to crucible'
                )
            added_by = 'crucible'
        else:
            added_by = str(metric_vars.added_by)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - added_by - %s' % added_by)

        try:
            metric_vars.run_script
        except:
            run_script = False
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info(
                    'metric variable - run_script - not present set to False')
        else:
            run_script = str(metric_vars.run_script)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_script - %s' % run_script)

        # Only check if the metric does not have an EXPIRATION_TIME key set,
        # crucible uses the alert EXPIRATION_TIME for the relevant alert setting
        # contexts, whether that be analyzer, mirage, boundary, etc, and sets its
        # own cache_keys in redis.  This prevents large amounts of data being
        # added in terms of timeseries json and image files, crucible samples at
        # the same EXPIRATION_TIME as alerts.

        source_app = 'crucible'
        expiration_timeout = 1800
        remove_all_anomaly_files = False
        check_expired = False
        check_time = time()

        if added_by == 'analyzer' or added_by == 'mirage':
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Will check %s ALERTS' % added_by)
            if settings.ENABLE_ALERTS:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('Checking %s ALERTS' % added_by)
                for alert in settings.ALERTS:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        source_app = added_by
                        expiration_timeout = alert[2]
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info(
                                'matched - %s - %s - EXPIRATION_TIME is %s' %
                                (source_app, metric, str(expiration_timeout)))
                        check_age = int(check_time) - int(metric_timestamp)
                        if int(check_age) > int(expiration_timeout):
                            check_expired = True
                            if settings.ENABLE_CRUCIBLE_DEBUG:
                                logger.info(
                                    'the check is older than EXPIRATION_TIME for the metric - not checking - check_expired'
                                )

        if added_by == 'boundary':
            if settings.BOUNDARY_ENABLE_ALERTS:
                for alert in settings.BOUNDARY_METRICS:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        source_app = 'boundary'
                        expiration_timeout = alert[2]
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info(
                                'matched - %s - %s - EXPIRATION_TIME is %s' %
                                (source_app, metric, str(expiration_timeout)))
                        check_age = int(check_time) - int(metric_timestamp)
                        if int(check_age) > int(expiration_timeout):
                            check_expired = True
                            if settings.ENABLE_CRUCIBLE_DEBUG:
                                logger.info(
                                    'the check is older than EXPIRATION_TIME for the metric - not checking - check_expired'
                                )

        cache_key = 'crucible.last_check.%s.%s' % (source_app, metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('cache_key - crucible.last_check.%s.%s' %
                        (source_app, metric))

        # Only use the cache_key EXPIRATION_TIME if this is not a request to
        # run_crucible_tests on a timeseries
        if run_crucible_tests == 'False':
            if check_expired:
                logger.info('check_expired - not checking Redis key')
                last_check = True
            else:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('Checking if cache_key exists')
                try:
                    last_check = self.redis_conn.get(cache_key)
                except Exception as e:
                    logger.error(
                        'error :: could not query cache_key for %s - %s - %s' %
                        (source_app, metric, e))
                    logger.info('all anomaly files will be removed')
                    remove_all_anomaly_files = True

            if not last_check:
                try:
                    self.redis_conn.setex(cache_key, expiration_timeout,
                                          packb(value))
                    logger.info(
                        'set cache_key for %s - %s with timeout of %s' %
                        (source_app, metric, str(expiration_timeout)))
                except Exception as e:
                    logger.error(
                        'error :: could not query cache_key for %s - %s - %s' %
                        (source_app, metric, e))
                    logger.info('all anomaly files will be removed')
                    remove_all_anomaly_files = True
            else:
                if check_expired:
                    logger.info(
                        'check_expired - all anomaly files will be removed')
                    remove_all_anomaly_files = True
                else:
                    logger.info(
                        'cache_key is set and not expired for %s - %s - all anomaly files will be removed'
                        % (source_app, metric))
                    remove_all_anomaly_files = True

        # anomaly dir
        if not os.path.exists(str(anomaly_dir)):
            try:
                # mkdir_p(skyline_app, str(anomaly_dir))
                mkdir_p(anomaly_dir)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('created anomaly dir - %s' % str(anomaly_dir))
            except:
                logger.error('error :: failed to create anomaly_dir - %s' %
                             str(anomaly_dir))

        if not os.path.exists(str(anomaly_dir)):
            logger.error('error :: anomaly_dir does not exist')
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly dir exists - %s' % str(anomaly_dir))

        failed_check_file = anomaly_dir + '/' + metric_timestamp + '.failed.check.' + metric + '.txt'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('failed_check_file - %s' % str(failed_check_file))

        # Retrieve data from graphite if necessary
        anomaly_graph = anomaly_dir + '/' + metric + '.png'
        anomaly_json = anomaly_dir + '/' + metric + '.json'
        anomaly_json_gz = anomaly_json + '.gz'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly_graph - %s' % str(anomaly_graph))
            logger.info('anomaly_json - %s' % str(anomaly_json))
            logger.info('anomaly_json_gz - %s' % str(anomaly_json_gz))
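        # three artifacts are kept per check - the rendered graph png, the raw
        # timeseries json and a gzipped copy of that json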

        # Some things added to crucible may not be added by a skyline app per se
        # and if run_crucible_tests is string True then no anomaly files should
        # be removed.
        if run_crucible_tests == 'True':
            remove_all_anomaly_files = False

        # Remove check and anomaly files if the metric has a EXPIRATION_TIME
        # cache_key set
        if remove_all_anomaly_files:
            if os.path.isfile(anomaly_graph):
                try:
                    os.remove(anomaly_graph)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_graph removed - %s' %
                                    str(anomaly_graph))
                except OSError:
                    pass
            if os.path.isfile(anomaly_json):
                try:
                    os.remove(anomaly_json)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_json removed - %s' %
                                    str(anomaly_json))
                except OSError:
                    pass
            if os.path.isfile(anomaly_json_gz):
                try:
                    os.remove(anomaly_json_gz)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_json_gz removed - %s' %
                                    str(anomaly_json_gz))
                except OSError:
                    pass

            anomaly_txt_file = anomaly_dir + '/' + metric + '.txt'
            if os.path.isfile(anomaly_txt_file):
                try:
                    os.remove(anomaly_txt_file)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_txt_file removed - %s' %
                                    str(anomaly_txt_file))
                except OSError:
                    pass

            # TBD - this data would have to be added to the Panorama DB before
            #       it is removed
            if os.path.isfile(str(metric_check_file)):
                try:
                    os.remove(str(metric_check_file))
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('metric_check_file removed - %s' %
                                    str(metric_check_file))
                except OSError:
                    pass

            if os.path.exists(str(anomaly_dir)):
                try:
                    os.rmdir(str(anomaly_dir))
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_dir removed - %s' %
                                    str(anomaly_dir))
                except OSError:
                    pass

            logger.info('check and anomaly files removed')
            return

        # Check if the image exists
        if graphite_metric == 'True':

            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite_metric - %s' % (graphite_metric))

            # Graphite timeouts
            connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('connect_timeout - %s' % str(connect_timeout))

            read_timeout = int(settings.GRAPHITE_READ_TIMEOUT)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('read_timeout - %s' % str(read_timeout))

            graphite_until = datetime.datetime.fromtimestamp(
                int(metric_timestamp)).strftime('%H:%M_%Y%m%d')
            graphite_from = datetime.datetime.fromtimestamp(
                int(from_timestamp)).strftime('%H:%M_%Y%m%d')

            # graphite URL
            if settings.GRAPHITE_PORT != '':
                url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + settings.GRAPHITE_PORT + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
            else:
                url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite url - %s' % (url))

            if not os.path.isfile(anomaly_graph):
                logger.info(
                    'retrieving png - surfacing %s graph from graphite from %s to %s'
                    % (metric, graphite_from, graphite_until))

                image_url = url.replace('&format=json', '')
                graphite_image_file = anomaly_dir + '/' + metric + '.png'
                if 'width' not in image_url:
                    image_url += '&width=586'
                if 'height' not in image_url:
                    image_url += '&height=308'
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('graphite image url - %s' % (image_url))
                image_url_timeout = int(connect_timeout)

                image_data = None

                try:
                    # @modified 20170913 - Task #2160: Test skyline with bandit
                    # Added nosec to exclude from bandit tests
                    image_data = urllib2.urlopen(
                        image_url, timeout=image_url_timeout).read()  # nosec
                    logger.info('url OK - %s' % (image_url))
                except urllib2.URLError:
                    image_data = None
                    logger.error('error :: url bad - %s' % (image_url))

                if image_data is not None:
                    with open(graphite_image_file, 'wb') as f:
                        f.write(image_data)
                    logger.info('retrieved - %s' % (anomaly_graph))
                    if python_version == 2:
                        os.chmod(graphite_image_file, 0o644)
                    if python_version == 3:
                        os.chmod(graphite_image_file, mode=0o644)
                else:
                    logger.error('error :: failed to retrieve - %s' %
                                 (anomaly_graph))
            else:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_graph file exists - %s' %
                                str(anomaly_graph))

            if not os.path.isfile(anomaly_graph):
                logger.error(
                    'error :: retrieve failed to surface %s graph from graphite'
                    % (metric))
            else:
                logger.info('graph image exists - %s' % (anomaly_graph))

            # Check if the json exists
            if not os.path.isfile(anomaly_json_gz):
                if not os.path.isfile(anomaly_json):
                    logger.info(
                        'surfacing timeseries data for %s from graphite from %s to %s'
                        % (metric, graphite_from, graphite_until))
                    if requests.__version__ >= '2.4.0':
                        use_timeout = (int(connect_timeout), int(read_timeout))
                    else:
                        use_timeout = int(connect_timeout)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('use_timeout - %s' % (str(use_timeout)))

                    try:
                        r = requests.get(url, timeout=use_timeout)
                        js = r.json()
                        datapoints = js[0]['datapoints']
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('data retrieved OK')
                    except:
                        # graphite_until is a HH:MM_YYYYMMDD string, use the
                        # epoch metric_timestamp for the null placeholder
                        datapoints = [[None, int(metric_timestamp)]]
                        logger.error('error :: data retrieval failed')

                    converted = []
                    for datapoint in datapoints:
                        try:
                            new_datapoint = [
                                float(datapoint[1]),
                                float(datapoint[0])
                            ]
                            converted.append(new_datapoint)
                        # @modified 20170913 - Task #2160: Test skyline with bandit
                        # Added nosec to exclude from bandit tests
                        except:  # nosec
                            continue

                    with open(anomaly_json, 'w') as f:
                        f.write(json.dumps(converted))
                    if python_version == 2:
                        os.chmod(anomaly_json, 0o644)
                    if python_version == 3:
                        os.chmod(anomaly_json, mode=0o644)

                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('json file - %s' % anomaly_json)

                if not os.path.isfile(anomaly_json):
                    logger.error(
                        'error :: failed to surface %s json from graphite' %
                        (metric))
                    # Move metric check file
                    try:
                        shutil.move(metric_check_file, failed_check_file)
                        logger.info('moved check file to - %s' %
                                    failed_check_file)
                    except OSError:
                        logger.error(
                            'error :: failed to move check file to - %s' %
                            failed_check_file)
                        pass
                    return

        # Check timeseries json exists - raw or gz
        if not os.path.isfile(anomaly_json):
            if not os.path.isfile(anomaly_json_gz):
                logger.error('error :: no json data found')
                # Move metric check file
                try:
                    shutil.move(metric_check_file, failed_check_file)
                    logger.info('moved check file to - %s' % failed_check_file)
                except OSError:
                    logger.error('error :: failed to move check file to - %s' %
                                 failed_check_file)
                    pass
                return
            else:
                logger.info('timeseries json gzip exists - %s' %
                            (anomaly_json_gz))
        else:
            logger.info('timeseries json exists - %s' % (anomaly_json))

        # If timeseries json and run_crucible_tests is str(False) gzip and
        # return here as there is nothing further to do
        if run_crucible_tests == 'False':
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('run_crucible_tests - %s' % run_crucible_tests)
            # gzip the json timeseries data
            if os.path.isfile(anomaly_json):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzipping - %s' % anomaly_json)
                try:
                    f_in = open(anomaly_json)
                    f_out = gzip.open(anomaly_json_gz, 'wb')
                    f_out.writelines(f_in)
                    f_out.close()
                    f_in.close()
                    os.remove(anomaly_json)
                    if python_version == 2:
                        os.chmod(anomaly_json_gz, 0644)
                    if python_version == 3:
                        os.chmod(anomaly_json_gz, mode=0o644)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('gzipped - %s' % anomaly_json_gz)
                    try:
                        os.remove(metric_check_file)
                        logger.info('removed check file - %s' %
                                    metric_check_file)
                    except OSError:
                        pass
                    return
                except:
                    logger.error('error :: Failed to gzip data file - %s' %
                                 traceback.format_exc())
                    try:
                        os.remove(metric_check_file)
                        logger.info('removed check file - %s' %
                                    metric_check_file)
                    except OSError:
                        pass
                    return

            if os.path.isfile(anomaly_json_gz):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzip exists - %s' % anomaly_json_gz)
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' %
                                metric_check_file)
                except OSError:
                    pass
                return
            nothing_to_do = 'true - for debug only'

        # self.check_if_parent_is_alive()
        # Run crucible algorithms
        logger.info('running crucible tests - %s' % (metric))

        if os.path.isfile(anomaly_json_gz):
            if not os.path.isfile(anomaly_json):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('ungzipping - %s' % anomaly_json_gz)
                try:
                    # with gzip.open(anomaly_json_gz, 'rb') as fr:
                    fr = gzip.open(anomaly_json_gz, 'rb')
                    raw_timeseries = fr.read()
                    fr.close()
                except Exception as e:
                    logger.error('error :: could not ungzip %s - %s' %
                                 (anomaly_json_gz, e))
                    traceback.print_exc()
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('ungzipped')
                    logger.info('writing to - %s' % anomaly_json)
                with open(anomaly_json, 'w') as fw:
                    fw.write(raw_timeseries)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json done')
                if python_version == 2:
                    os.chmod(anomaly_json, 0644)
                if python_version == 3:
                    os.chmod(anomaly_json, mode=0o644)
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('No gzip - %s' % anomaly_json_gz)
            nothing_to_do = 'true - for debug only'

        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_json exists - %s' % anomaly_json)

        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('loading timeseries from - %s' % anomaly_json)
            timeseries = None
            try:
                with open(anomaly_json, 'r') as f:
                    timeseries = json.loads(f.read())
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('loaded time series from - %s' % anomaly_json)
            except:
                # logger.error(traceback.format_exc())
                logger.info(
                    'failed to load with JSON, literal_eval will be tried - %s'
                    % anomaly_json)
            # @added 20180715 - Task #2444: Evaluate CAD
            #                   Task #2446: Optimize Ionosphere
            #                   Branch #2270: luminosity
            # If the json.loads fails use literal_eval
            if not timeseries:
                try:
                    with open(anomaly_json, 'r') as f:
                        raw_timeseries = f.read()
                        timeseries_array_str = str(raw_timeseries).replace(
                            '(', '[').replace(')', ']')
                        timeseries = literal_eval(timeseries_array_str)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info(
                            'loaded time series with literal_eval from - %s' %
                            anomaly_json)
                except:
                    logger.info(traceback.format_exc())
                    logger.error('error :: failed to load JSON - %s' %
                                 anomaly_json)
        else:
            try:
                logger.error('error :: file not found - %s' % anomaly_json)
                shutil.move(metric_check_file, failed_check_file)
                if python_version == 2:
                    os.chmod(failed_check_file, 0644)
                if python_version == 3:
                    os.chmod(failed_check_file, mode=0o644)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' %
                             failed_check_file)
                pass
            return

        start_timestamp = int(timeseries[0][0])
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('start_timestamp - %s' % str(start_timestamp))
        end_timestamp = int(timeseries[-1][0])
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('end_timestamp - %s' % str(end_timestamp))

        full_duration = end_timestamp - start_timestamp
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('full_duration - %s' % str(full_duration))

        self.check_if_parent_is_alive()

        run_algorithms_start_timestamp = int(time())
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_algorithms_start_timestamp - %s' %
                        str(run_algorithms_start_timestamp))

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_algorithms - %s,%s,%s,%s,%s,%s' %
                        (metric, str(end_timestamp), str(full_duration),
                         anomaly_json, skyline_app, str(algorithms)))
        try:
            anomalous, ensemble = run_algorithms(timeseries, str(metric),
                                                 end_timestamp, full_duration,
                                                 str(anomaly_json),
                                                 skyline_app, algorithms)
        except:
            logger.error('error :: run_algorithms failed - %s' %
                         traceback.format_exc())

        run_algorithms_end_timestamp = int(time())
        run_algorithms_seconds = run_algorithms_end_timestamp - run_algorithms_start_timestamp

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomalous, ensemble - %s, %s' %
                        (anomalous, str(ensemble)))

        if anomalous:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomalous - %s' % (anomalous))
            nothing_to_do = 'true - for debug only'

        logger.info('run_algorithms took %s seconds' %
                    str(run_algorithms_seconds))

        # Update anomaly file
        crucible_data = 'crucible_tests_run = "%s"\n' \
                        'crucible_triggered_algorithms = %s\n' \
                        'tested_by = "%s"\n' \
            % (str(run_timestamp), str(ensemble), str(this_host))
        crucible_anomaly_file = '%s/%s.txt' % (anomaly_dir, metric)
        with open(crucible_anomaly_file, 'a') as fh:
            fh.write(crucible_data)
        if python_version == 2:
            os.chmod(crucible_anomaly_file, 0644)
        if python_version == 3:
            os.chmod(crucible_anomaly_file, mode=0o644)
        logger.info('updated crucible anomaly file - %s/%s.txt' %
                    (anomaly_dir, metric))

        # gzip the json timeseries data after analysis
        if os.path.isfile(anomaly_json):
            if not os.path.isfile(anomaly_json_gz):
                try:
                    f_in = open(anomaly_json)
                    f_out = gzip.open(anomaly_json_gz, 'wb')
                    f_out.writelines(f_in)
                    f_out.close()
                    f_in.close()
                    os.remove(anomaly_json)
                    if python_version == 2:
                        os.chmod(anomaly_json_gz, 0644)
                    if python_version == 3:
                        os.chmod(anomaly_json_gz, mode=0o644)
                    logger.info('gzipped - %s' % (anomaly_json_gz))
                except:
                    logger.error('error :: Failed to gzip data file - %s' %
                                 traceback.format_exc())
            else:
                os.remove(anomaly_json)

        if run_script:
            if os.path.isfile(run_script):
                logger.info('running - %s' % (run_script))
                # @modified 20170913 - Task #2160: Test skyline with bandit
                # Added nosec to exclude from bandit tests
                os.system(
                    '%s %s' %
                    (str(run_script), str(crucible_anomaly_file)))  # nosec

        # Remove metric check file
        nothing_to_do = ''

        try:
            os.remove(metric_check_file)
            logger.info('complete removed check file - %s %s' %
                        (metric_check_file, nothing_to_do))
        except OSError:
            pass
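
A minimal, self-contained sketch of the load-with-fallback pattern used above: try json.loads first and, if that fails, convert the '(timestamp, value)' tuple syntax to list syntax and use literal_eval. The load_timeseries helper name and file handling here are illustrative only, not part of the project.

import json
from ast import literal_eval

def load_timeseries(anomaly_json):
    # Plain JSON first - Graphite surfaced data is written with json.dumps
    try:
        with open(anomaly_json, 'r') as f:
            return json.loads(f.read())
    except ValueError:
        pass
    # Fall back to literal_eval after converting the tuple style
    # '(timestamp, value)' pairs to list syntax
    with open(anomaly_json, 'r') as f:
        raw_timeseries = f.read()
    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
    return literal_eval(timeseries_array_str)
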
Example #6
0
File: alerters.py Project: 40a/skyline
def alert_smtp(alert, metric, context):
    """
    Called by :func:`~trigger_alert` and sends an alert via smtp to the
    recipients that are configured for the metric.

    """
    LOCAL_DEBUG = False
    logger = logging.getLogger(skyline_app_logger)
    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - sending smtp alert')
        logger.info('debug :: alert_smtp - Memory usage at start: %s (kb)' %
                    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    # FULL_DURATION to hours so that analyzer surfaces the relevant timeseries data
    # in the graph
    full_duration_in_hours = int(settings.FULL_DURATION) / 3600

    # @added 20161229 - Feature #1830: Ionosphere alerts
    # Added Ionosphere variables
    base_name = str(metric[1]).replace(settings.FULL_NAMESPACE, '', 1)
    if settings.IONOSPHERE_ENABLED:
        timeseries_dir = base_name.replace('.', '/')
        training_data_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                          str(int(metric[2])), timeseries_dir)
        graphite_image_file = '%s/%s.%s.graphite.%sh.png' % (
            training_data_dir, base_name, skyline_app,
            str(int(full_duration_in_hours)))
        json_file = '%s/%s.%s.redis.%sh.json' % (
            training_data_dir, base_name, skyline_app,
            str(int(full_duration_in_hours)))
        training_data_redis_image = '%s/%s.%s.redis.plot.%sh.png' % (
            training_data_dir, base_name, skyline_app,
            str(int(full_duration_in_hours)))

    # For backwards compatibility
    if '@' in alert[1]:
        sender = settings.ALERT_SENDER
        recipient = alert[1]
    else:
        sender = settings.SMTP_OPTS['sender']
        # @modified 20160806 - Added default_recipient
        try:
            recipients = settings.SMTP_OPTS['recipients'][alert[0]]
            use_default_recipient = False
        except:
            use_default_recipient = True
        if use_default_recipient:
            try:
                recipients = settings.SMTP_OPTS['default_recipient']
                logger.info(
                    'alert_smtp - using default_recipient as no recipients are configured for %s'
                    % str(alert[0]))
            except:
                logger.error(
                    'error :: alert_smtp - no known recipient for %s' %
                    str(alert[0]))
                return False

    # Backwards compatibility
    if type(recipients) is str:
        recipients = [recipients]

    # @added 20180524 - Task #2384: Change alerters to cc other recipients
    # The alerters used to send an individual email to each recipient. It is
    # more useful to send one email, with the first smtp recipient as the To
    # recipient and the subsequent recipients added as cc (a minimal sketch of
    # this pattern follows this example).
    if recipients:
        primary_recipient = False
        cc_recipients = False
        for i_recipient in recipients:
            if not primary_recipient:
                primary_recipient = str(i_recipient)
            if primary_recipient != i_recipient:
                if not cc_recipients:
                    cc_recipients = str(i_recipient)
                else:
                    new_cc_recipients = '%s,%s' % (str(cc_recipients),
                                                   str(i_recipient))
                    cc_recipients = str(new_cc_recipients)
        logger.info(
            'alert_smtp - will send to primary_recipient :: %s, cc_recipients :: %s'
            % (str(primary_recipient), str(cc_recipients)))

    # @modified 20161229 - Feature #1830: Ionosphere alerts
    # Ionosphere alerts
    unencoded_graph_title = 'Skyline %s - ALERT at %s hours - %s' % (
        context, str(int(full_duration_in_hours)), str(metric[0]))
    # @modified 20170603 - Feature #2034: analyse_derivatives
    # Added derivative functions to convert the values of strictly monotonically
    # increasing metrics to their derivative products in alert graphs and
    # specify it in the graph_title
    known_derivative_metric = False
    try:
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        if settings.REDIS_PASSWORD:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                password=settings.REDIS_PASSWORD,
                unix_socket_path=settings.REDIS_SOCKET_PATH)
        else:
            REDIS_ALERTER_CONN = redis.StrictRedis(
                unix_socket_path=settings.REDIS_SOCKET_PATH)
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: alert_smtp - redis connection failed')
    try:
        derivative_metrics = list(
            REDIS_ALERTER_CONN.smembers('derivative_metrics'))
    except:
        derivative_metrics = []
    redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(base_name))
    if redis_metric_name in derivative_metrics:
        known_derivative_metric = True
    if known_derivative_metric:
        try:
            non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS
        except:
            non_derivative_monotonic_metrics = []
        skip_derivative = in_list(redis_metric_name,
                                  non_derivative_monotonic_metrics)
        if skip_derivative:
            known_derivative_metric = False
    if known_derivative_metric:
        unencoded_graph_title = 'Skyline %s - ALERT at %s hours - derivative graph - %s' % (
            context, str(int(full_duration_in_hours)), str(metric[0]))

    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - unencoded_graph_title: %s' %
                    unencoded_graph_title)
    graph_title_string = quote(unencoded_graph_title, safe='')
    graph_title = '&title=%s' % graph_title_string

    graphite_port = '80'
    if settings.GRAPHITE_PORT != '':
        graphite_port = str(settings.GRAPHITE_PORT)

    link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(%s)%s%s&colorList=orange' % (
        settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port,
        str(int(full_duration_in_hours)), metric[1],
        settings.GRAPHITE_GRAPH_SETTINGS, graph_title)
    # @added 20170603 - Feature #2034: analyse_derivatives
    if known_derivative_metric:
        link = '%s://%s:%s/render/?from=-%shours&target=cactiStyle(nonNegativeDerivative(%s))%s%s&colorList=orange' % (
            settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port,
            str(int(full_duration_in_hours)), metric[1],
            settings.GRAPHITE_GRAPH_SETTINGS, graph_title)

    content_id = metric[1]
    image_data = None
    if settings.SMTP_OPTS.get('embed-images'):
        # @added 20161229 - Feature #1830: Ionosphere alerts
        # Use existing data if files exist
        if os.path.isfile(graphite_image_file):
            try:
                with open(graphite_image_file, 'r') as f:
                    image_data = f.read()
                logger.info('alert_smtp - using existing png - %s' %
                            graphite_image_file)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: alert_smtp - failed to read image data from existing png - %s'
                    % graphite_image_file)
                logger.error('error :: alert_smtp - %s' % str(link))
                image_data = None

        if image_data is None:
            try:
                # @modified 20170913 - Task #2160: Test skyline with bandit
                # Added nosec to exclude from bandit tests
                image_data = urllib2.urlopen(link).read()  # nosec
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - image data OK')
            except urllib2.URLError:
                logger.error(traceback.format_exc())
                logger.error('error :: alert_smtp - failed to get image graph')
                logger.error('error :: alert_smtp - %s' % str(link))
                image_data = None
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - image data None')

    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage after image_data: %s (kb)' %
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    # If we failed to get the image or if it was explicitly disabled,
    # use the image URL instead of the content.
    if image_data is None:
        img_tag = '<img src="%s"/>' % link
    else:
        img_tag = '<img src="cid:%s"/>' % content_id
        if settings.ENABLE_DEBUG or LOCAL_DEBUG:
            logger.info('debug :: alert_smtp - img_tag: %s' % img_tag)

        if settings.IONOSPHERE_ENABLED:
            # Create Ionosphere Graphite image
            # @modified 20161229 - Feature #1830: Ionosphere alerts
            # Only write the data to the file if it does not exist
            if not os.path.isfile(graphite_image_file):
                try:
                    write_data_to_file(skyline_app, graphite_image_file, 'w',
                                       image_data)
                    logger.info('added %s Ionosphere Graphite image :: %s' %
                                (skyline_app, graphite_image_file))
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: failed to add %s Ionosphere Graphite image - %s' %
                        (skyline_app, graphite_image_file))
            else:
                logger.info(
                    '%s Ionosphere Graphite image already exists :: %s' %
                    (skyline_app, graphite_image_file))

    redis_image_data = None
    try:
        plot_redis_data = settings.PLOT_REDIS_DATA
    except:
        plot_redis_data = False

    if settings.SMTP_OPTS.get('embed-images') and plot_redis_data:
        # Create graph from Redis data
        redis_metric_key = '%s%s' % (settings.FULL_NAMESPACE, metric[1])
        try:
            raw_series = REDIS_ALERTER_CONN.get(redis_metric_key)
            if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                logger.info('debug :: alert_smtp - raw_series: %s' % 'OK')
        except:
            if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                logger.info('debug :: alert_smtp - raw_series: %s' % 'FAIL')

        try:
            if LOCAL_DEBUG:
                logger.info(
                    'debug :: alert_smtp - Memory usage before get Redis timeseries data: %s (kb)'
                    % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
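            # raw_series holds msgpack-packed (timestamp, value) tuples; each
            # Unpacker pass below re-feeds the raw bytes and iterates over the
            # decoded datapoints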
            unpacker = Unpacker(use_list=True)
            unpacker.feed(raw_series)
            timeseries_x = [float(item[0]) for item in unpacker]
            unpacker = Unpacker(use_list=True)
            unpacker.feed(raw_series)
            timeseries_y = [item[1] for item in unpacker]

            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
            if LOCAL_DEBUG:
                logger.info(
                    'debug :: alert_smtp - Memory usage after get Redis timeseries data: %s (kb)'
                    % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        except:
            logger.error('error :: alert_smtp - unpack timeseries failed')
            timeseries = None

        if settings.IONOSPHERE_ENABLED and timeseries:
            '''
            .. todo: this is possibly to be used to allow the user to submit the
                FULL_DURATION duration data set for the features profile to be
                created against IF it is a Mirage metric.  This would allow for
                additional granularity in Mirage metrics, thereby maintaining
                their seasonality, but allow the user and Skyline to analyze the
                anomaly at a FULL_DURATION resolution as well.  Not sure how to
                code that in the Ionosphere context yet, but it could just be an
                additional flag in the Ionosphere record.  In the Ionosphere
                frontend, the
                user would be given an option to either create the features
                profile on the Mirage timeseries or the redis FULL_DURATION
                timeseries.  It is a little complicated, but doable.
                # @modified 20161229 - Feature #1828: ionosphere - mirage Redis data features
                However that ^^ is UNDESIRABLE in the Mirage/Ionosphere context
                at the moment.  Ionosphere must only profile SECOND_ORDER_RESOLUTION_HOURS
                currently so as to not pollute the seasonality aspect of Mirage
            '''
            # Create the Ionosphere redis timeseries json if it does not exist
            # @modified 20161229 - Feature #1830: Ionosphere alerts
            # Only write the data to the file if it does not exist and replace
            # the timeseries object if a json file exists

            # @added 20170920 - Bug #2168: Strange Redis derivative graph
            using_original_redis_json = False

            if not os.path.isfile(json_file):
                timeseries_json = str(timeseries).replace('[', '(').replace(
                    ']', ')')
                try:
                    write_data_to_file(skyline_app, json_file, 'w',
                                       timeseries_json)
                    logger.info(
                        'added %s Ionosphere Redis data timeseries json file :: %s'
                        % (skyline_app, json_file))
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: failed to add %s Ionosphere Redis data timeseries json file - %s'
                        % (skyline_app, json_file))
            else:
                # Replace the timeseries object
                logger.info(
                    '%s Ionosphere Redis data timeseries json file already exists, using :: %s'
                    % (skyline_app, json_file))
                anomaly_json = json_file
                try:
                    # Read the timeseries json file
                    with open(anomaly_json, 'r') as f:
                        raw_timeseries = f.read()
                    timeseries_array_str = str(raw_timeseries).replace(
                        '(', '[').replace(')', ']')
                    timeseries = literal_eval(timeseries_array_str)
                    logger.info(
                        '%s Redis timeseries replaced with timeseries from :: %s'
                        % (skyline_app, anomaly_json))
                    timeseries_x = [float(item[0]) for item in timeseries]
                    timeseries_y = [item[1] for item in timeseries]
                    # @added 20170920 - Bug #2168: Strange Redis derivative graph
                    # This already has nonNegativeDerivative applied to it
                    using_original_redis_json = True
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s failed to read timeseries data from %s' %
                        (skyline_app, anomaly_json))
                    timeseries = None

        # @added 20170603 - Feature #2034: analyse_derivatives
        if known_derivative_metric:

            # @added 20170920 - Bug #2168: Strange Redis derivative graph
            # If this is the Mirage Redis json it already has
            # nonNegativeDerivative applied to it
            if not using_original_redis_json:
                logger.info('alert_smtp - nonNegativeDerivative being applied')

                try:
                    derivative_timeseries = nonNegativeDerivative(timeseries)
                    timeseries = derivative_timeseries
                    # @added 20170920 - Bug #2168: Strange Redis derivative graph
                    logger.info('alert_smtp - nonNegativeDerivative applied')
                except:
                    logger.error(
                        'error :: alert_smtp - nonNegativeDerivative failed')
            else:
                logger.info(
                    'alert_smtp - nonNegativeDerivative not being applied, as it will have been applied in the original json'
                )

        # @added 20170726 - Bug #2068: Analyzer smtp alert error on Redis plot with derivative metrics
        # If the nonNegativeDerivative has been calculated we need to reset the
        # x and y series, as nonNegativeDerivative has to discard the first value
        # (it has no delta for it), so the timeseries is one item shorter.
        timeseries_x = [float(item[0]) for item in timeseries]
        timeseries_y = [item[1] for item in timeseries]

        pd_series_values = None
        if timeseries:
            try:
                if LOCAL_DEBUG:
                    logger.info(
                        'debug :: alert_smtp - Memory usage before pd.Series: %s (kb)'
                        % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                values = pd.Series([x[1] for x in timeseries])
                # Because the truth value of a Series is ambiguous
                pd_series_values = True
                if LOCAL_DEBUG:
                    logger.info(
                        'debug :: alert_smtp - Memory usage after pd.Series: %s (kb)'
                        % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            except:
                logger.error(
                    'error :: alert_smtp - pandas value series on timeseries failed'
                )

        if pd_series_values:
            try:
                array_median = np.median(values)
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - values median: %s' %
                                str(array_median))

                array_amax = np.amax(values)
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - array_amax: %s' %
                                str(array_amax))
                array_amin = np.amin(values)
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - array_amin: %s' %
                                str(array_amin))
                mean = values.mean()
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - mean: %s' % str(mean))
                stdDev = values.std()
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - stdDev: %s' %
                                str(stdDev))

                sigma3 = 3 * stdDev
                if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                    logger.info('debug :: alert_smtp - sigma3: %s' %
                                str(sigma3))

                # sigma3_series = [sigma3] * len(values)

                sigma3_upper_bound = mean + sigma3
                try:
                    sigma3_lower_bound = mean - sigma3
                except:
                    sigma3_lower_bound = 0

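                # Constant series the same length as the data so the min, max,
                # mean and 3-sigma bounds render as horizontal reference lines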
                sigma3_upper_series = [sigma3_upper_bound] * len(values)
                sigma3_lower_series = [sigma3_lower_bound] * len(values)
                amax_series = [array_amax] * len(values)
                amin_series = [array_amin] * len(values)
                mean_series = [mean] * len(values)
            except:
                logger.error(
                    'error :: alert_smtp - numpy ops on series failed')
                mean_series = None

        if mean_series:
            graph_title = 'Skyline %s - ALERT - at %s hours - Redis data\n%s - anomalous value: %s' % (
                context, str(
                    int(full_duration_in_hours)), metric[1], str(metric[0]))
            # @added 20170603 - Feature #2034: analyse_derivatives
            if known_derivative_metric:
                graph_title = 'Skyline %s - ALERT - at %s hours - Redis data (derivative graph)\n%s - anomalous value: %s' % (
                    context, str(int(full_duration_in_hours)), metric[1],
                    str(metric[0]))

            # @modified 20160814 - Bug #1558: Memory leak in Analyzer
            # I think the buf is causing a memory leak, trying a file
            # if python_version == 3:
            #     buf = io.StringIO()
            # else:
            #     buf = io.BytesIO()
            buf = '%s/%s.%s.%s.png' % (settings.SKYLINE_TMP_DIR, skyline_app,
                                       str(int(metric[2])), metric[1])

            if LOCAL_DEBUG:
                logger.info(
                    'debug :: alert_smtp - Memory usage before plot Redis data: %s (kb)'
                    % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

            # Too big
            # rcParams['figure.figsize'] = 12, 6
            rcParams['figure.figsize'] = 8, 4
            try:
                # fig = plt.figure()
                fig = plt.figure(frameon=False)
                ax = fig.add_subplot(111)
                ax.set_title(graph_title, fontsize='small')
                # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - Luminosity
                #                      IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor'
                # ax.set_axis_bgcolor('black')
                if hasattr(ax, 'set_facecolor'):
                    ax.set_facecolor('black')
                else:
                    ax.set_axis_bgcolor('black')

                try:
                    datetimes = [
                        dt.datetime.utcfromtimestamp(ts) for ts in timeseries_x
                    ]
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info('debug :: alert_smtp - datetimes: %s' %
                                    'OK')
                except:
                    logger.error('error :: alert_smtp - datetimes: %s' %
                                 'FAIL')

                plt.xticks(rotation=0, horizontalalignment='center')
                xfmt = DateFormatter('%a %H:%M')
                plt.gca().xaxis.set_major_formatter(xfmt)

                ax.xaxis.set_major_formatter(xfmt)

                ax.plot(datetimes,
                        timeseries_y,
                        color='orange',
                        lw=0.6,
                        zorder=3)
                ax.tick_params(axis='both', labelsize='xx-small')

                max_value_label = 'max - %s' % str(array_amax)
                ax.plot(datetimes,
                        amax_series,
                        lw=1,
                        label=max_value_label,
                        color='m',
                        ls='--',
                        zorder=4)
                min_value_label = 'min - %s' % str(array_amin)
                ax.plot(datetimes,
                        amin_series,
                        lw=1,
                        label=min_value_label,
                        color='b',
                        ls='--',
                        zorder=4)
                mean_value_label = 'mean - %s' % str(mean)
                ax.plot(datetimes,
                        mean_series,
                        lw=1.5,
                        label=mean_value_label,
                        color='g',
                        ls='--',
                        zorder=4)

                sigma3_text = (r'3$\sigma$')
                # sigma3_label = '%s - %s' % (str(sigma3_text), str(sigma3))

                sigma3_upper_label = '%s upper - %s' % (
                    str(sigma3_text), str(sigma3_upper_bound))
                ax.plot(datetimes,
                        sigma3_upper_series,
                        lw=1,
                        label=sigma3_upper_label,
                        color='r',
                        ls='solid',
                        zorder=4)

                if sigma3_lower_bound > 0:
                    sigma3_lower_label = '%s lower - %s' % (
                        str(sigma3_text), str(sigma3_lower_bound))
                    ax.plot(datetimes,
                            sigma3_lower_series,
                            lw=1,
                            label=sigma3_lower_label,
                            color='r',
                            ls='solid',
                            zorder=4)

                ax.get_yaxis().get_major_formatter().set_useOffset(False)
                ax.get_yaxis().get_major_formatter().set_scientific(False)

                # Shrink current axis's height by 10% on the bottom
                box = ax.get_position()
                ax.set_position([
                    box.x0, box.y0 + box.height * 0.1, box.width,
                    box.height * 0.9
                ])

                # Put a legend below current axis
                ax.legend(loc='upper center',
                          bbox_to_anchor=(0.5, -0.05),
                          fancybox=True,
                          shadow=True,
                          ncol=4,
                          fontsize='x-small')
                plt.rc('lines', lw=2, color='w')

                plt.grid(True)

                ax.grid(b=True,
                        which='both',
                        axis='both',
                        color='lightgray',
                        linestyle='solid',
                        alpha=0.5,
                        linewidth=0.6)
                # @modified 20180417 - Bug #2358: set_axis_bgcolor method removed from Matplotlib - Luminosity
                #                      IssueID #49 'AxesSubplot' object has no attribute 'set_axis_bgcolor'
                # ax.set_axis_bgcolor('black')
                if hasattr(ax, 'set_facecolor'):
                    ax.set_facecolor('black')
                else:
                    ax.set_axis_bgcolor('black')

                rcParams['xtick.direction'] = 'out'
                rcParams['ytick.direction'] = 'out'
                ax.margins(y=.02, x=.03)
                # tight_layout removes the legend box
                # fig.tight_layout()
                try:
                    if LOCAL_DEBUG:
                        logger.info(
                            'debug :: alert_smtp - Memory usage before plt.savefig: %s (kb)'
                            %
                            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                    plt.savefig(buf, format='png')

                    if settings.IONOSPHERE_ENABLED:
                        if not os.path.exists(training_data_dir):
                            mkdir_p(training_data_dir)
                            logger.info('created dir - %s' % training_data_dir)

                        if not os.path.isfile(training_data_redis_image):
                            try:
                                plt.savefig(training_data_redis_image,
                                            format='png')
                                logger.info(
                                    'alert_smtp - save Redis training data image - %s'
                                    % (training_data_redis_image))
                            except:
                                logger.info(traceback.format_exc())
                                logger.error(
                                    'error :: alert_smtp - could not save - %s'
                                    % (training_data_redis_image))
                        else:
                            logger.info(
                                'alert_smtp - Redis training data image already exists - %s'
                                % (training_data_redis_image))

                    # @added 20160814 - Bug #1558: Memory leak in Analyzer
                    # As per http://www.mail-archive.com/[email protected]/msg13222.html
                    # savefig in the parent process was causing the memory leak
                    # the below fig.clf() and plt.close() did not resolve this
                    # however spawning a multiprocessing process for alert_smtp
                    # does solve this issue as all memory is freed when the
                    # process terminates.
                    fig.clf()
                    plt.close(fig)
                    redis_graph_content_id = 'redis.%s' % metric[1]
                    redis_image_data = True
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info('debug :: alert_smtp - savefig: %s' % 'OK')
                        logger.info(
                            'debug :: alert_smtp - Memory usage after plt.savefig: %s (kb)'
                            %
                            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                except:
                    logger.info(traceback.format_exc())
                    logger.error('error :: alert_smtp - plt.savefig: %s' %
                                 'FAIL')
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: alert_smtp - could not build plot')

    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage before email: %s (kb)' %
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if redis_image_data:
        redis_img_tag = '<img src="cid:%s"/>' % redis_graph_content_id
        if settings.ENABLE_DEBUG or LOCAL_DEBUG:
            logger.info('debug :: alert_smtp - redis_img_tag: %s' %
                        str(redis_img_tag))
    else:
        # @modified 20161229 - Feature #1830: Ionosphere alerts
        # @modified 20170108 - Feature #1852: Ionosphere - features_profile matched graphite graphs
        # Restored the previous redis_img_tag method as some smtp alerts were
        # coming through without a Redis graph, not all but some, for some reason.
        # I am pretty certain, retrospectively, that it was done that way during
        # testing when I just wanted to try and be cleaner.
        # The redis_img_tag was changed at
        # https://github.com/earthgecko/skyline/commit/31bcacf3f90f0953ebed0d57260cb937e01f887c#diff-520bf2a218f65074ffead4d8184c138dR489
        redis_img_tag = '<img src="%s"/>' % 'none'
        # redis_img_tag = '<img src="none"/>'

    # @added 20170806 - Feature #1830: Ionosphere alerts
    # Show a human date in alerts
    alerted_at = str(dt.datetime.utcfromtimestamp(int(metric[2])))

    try:
        body = '<h3><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> %s alert</font></h3><br>' % context
        body += '<font color="black">metric: <b>%s</b></font><br>' % metric[1]
        body += '<font color="black">Anomalous value: %s</font><br>' % str(
            metric[0])
        body += '<font color="black">Anomaly timestamp: %s</font><br>' % str(
            int(metric[2]))
        # @added 20170806 - Feature #1830: Ionosphere alerts
        # Show a human date in alerts
        body += '<font color="black">Anomalous at: %s</font><br>' % alerted_at
        body += '<font color="black">At hours: %s</font><br>' % str(
            int(full_duration_in_hours))
        body += '<font color="black">Next alert in: %s seconds</font><br>' % str(
            alert[2])
        # @added 20170603 - Feature #2034: analyse_derivatives
        if known_derivative_metric:
            body += '<font color="black">Derivative graph: True</font><br>'

        more_body = ''
        if settings.IONOSPHERE_ENABLED:
            # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls
            # Broke body into body and more_body to workaround the 990 character
            # limit per line for SMTP
            more_body += '<h3><font color="#dd3023">Ionosphere :: </font><font color="#6698FF">training data</font><font color="black"></font></h3>'
            ionosphere_link = '%s/ionosphere?timestamp=%s&metric=%s' % (
                settings.SKYLINE_URL, str(int(metric[2])), str(metric[1]))
            more_body += '<font color="black">To use this timeseries to train Skyline that this is not anomalous manage this training data at:<br>'
            more_body += '<a href="%s">%s</a></font>' % (ionosphere_link,
                                                         ionosphere_link)
        if redis_image_data:
            more_body += '<font color="black">min: %s  | max: %s   | mean: %s <br>' % (
                str(array_amin), str(array_amax), str(mean))
            more_body += '3-sigma: %s <br>' % str(sigma3)
            more_body += '3-sigma upper bound: %s   | 3-sigma lower bound: %s <br></font>' % (
                str(sigma3_upper_bound), str(sigma3_lower_bound))
            more_body += '<h3><font color="black">Redis data at FULL_DURATION</font></h3><br>'
            more_body += '<div dir="ltr">:%s<br></div>' % redis_img_tag
        if image_data:
            more_body += '<h3><font color="black">Graphite data at FULL_DURATION (may be aggregated)</font></h3>'
            more_body += '<div dir="ltr"><a href="%s">%s</a><br></div><br>' % (
                link, img_tag)
            more_body += '<font color="black">Clicking on the above graph will open to the Graphite graph with current data</font><br>'
        if redis_image_data:
            more_body += '<font color="black">To disable the Redis data graph view, set PLOT_REDIS_DATA to False in your settings.py, if the Graphite graph is sufficient for you,<br>'
            more_body += 'however do note that will remove the 3-sigma and mean value too.</font>'
        more_body += '<br>'
        more_body += '<div dir="ltr" align="right"><font color="#dd3023">Sky</font><font color="#6698FF">line</font><font color="black"> version :: %s</font></div><br>' % str(
            skyline_version)
    except:
        logger.error('error :: alert_smtp - could not build body')
        logger.info(traceback.format_exc())

    # @modified 20180524 - Task #2384: Change alerters to cc other recipients
    # Do not send to each recipient, send to primary_recipient and cc the other
    # recipients, thereby sending only one email
    # for recipient in recipients:
    if primary_recipient:
        try:
            # @modified 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls
            # Broke body into body and more_body to workaround the 990 character
            # limit per line for SMTP, using mixed as alternative indicates that
            # the client should select one of the parts for display and ignore
            # the rest (tripleee - https://stackoverflow.com/a/35115938)
            # msg = MIMEMultipart('alternative')
            msg = MIMEMultipart('mixed')

            # @added 20170812 - Bug #2142: 7bit SMTP encoding breaking long urls
            # set email charset and email encodings
            cs_ = charset.Charset('utf-8')
            cs_.header_encoding = charset.QP
            cs_.body_encoding = charset.QP
            msg.set_charset(cs_)

            msg['Subject'] = '[Skyline alert] - %s ALERT - %s' % (context,
                                                                  metric[1])
            msg['From'] = sender
            # @modified 20180524 - Task #2384: Change alerters to cc other recipients
            # msg['To'] = recipient
            msg['To'] = primary_recipient

            # @added 20180524 - Task #2384: Change alerters to cc other recipients
            # Added Cc
            if cc_recipients:
                msg['Cc'] = cc_recipients

            msg.attach(MIMEText(body, 'html'))
            # @added 20170823 - Bug #2142: 7bit SMTP encoding breaking long urls
            # Broke body into body and more_body to workaround the 990 character
            # limit per line for SMTP
            msg.replace_header('content-transfer-encoding', 'quoted-printable')
            msg.attach(MIMEText(more_body, 'html'))

            if redis_image_data:
                try:
                    # @modified 20160814 - Bug #1558: Memory leak in Analyzer
                    # I think the buf is causing a memory leak, trying a file
                    # buf.seek(0)
                    # msg_plot_attachment = MIMEImage(buf.read())
                    # msg_plot_attachment = MIMEImage(buf.read())
                    try:
                        with open(buf, 'r') as f:
                            plot_image_data = f.read()
                        try:
                            os.remove(buf)
                        except OSError:
                            logger.error(
                                'error :: alert_smtp - failed to remove file - %s'
                                % buf)
                            logger.info(traceback.format_exc())
                            pass
                    except:
                        logger.error('error :: failed to read plot file - %s' %
                                     buf)
                        plot_image_data = None

                    # @added 20161124 - Branch #922: ionosphere
                    msg_plot_attachment = MIMEImage(plot_image_data)
                    msg_plot_attachment.add_header(
                        'Content-ID', '<%s>' % redis_graph_content_id)
                    msg.attach(msg_plot_attachment)
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info(
                            'debug :: alert_smtp - msg_plot_attachment - redis data done'
                        )
                except:
                    logger.error('error :: alert_smtp - msg_plot_attachment')
                    logger.info(traceback.format_exc())

            if image_data is not None:
                try:
                    msg_attachment = MIMEImage(image_data)
                    msg_attachment.add_header('Content-ID',
                                              '<%s>' % content_id)
                    msg.attach(msg_attachment)
                    if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                        logger.info(
                            'debug :: alert_smtp - msg_attachment - Graphite img source done'
                        )
                except:
                    logger.error('error :: alert_smtp - msg_attachment')
                    logger.info(traceback.format_exc())
        except:
            logger.error('error :: alert_smtp - could not attach')
            logger.info(traceback.format_exc())

        s = SMTP('127.0.0.1')
        try:
            # @modified 20180524 - Task #2384: Change alerters to cc other recipients
            # Send to primary_recipient and cc_recipients
            # s.sendmail(sender, recipient, msg.as_string())
            if cc_recipients:
                # sendmail needs each cc address as a separate envelope recipient
                s.sendmail(sender,
                           [primary_recipient] + cc_recipients.split(','),
                           msg.as_string())
            else:
                s.sendmail(sender, primary_recipient, msg.as_string())
            if settings.ENABLE_DEBUG or LOCAL_DEBUG:
                # logger.info('debug :: alert_smtp - message sent to %s OK' % str(recipient))
                logger.info(
                    'debug :: alert_smtp - message sent OK to primary_recipient :: %s, cc_recipients :: %s'
                    % (str(primary_recipient), str(cc_recipients)))
        except:
            logger.info(traceback.format_exc())
            # logger.error('error :: alert_smtp - could not send email to %s' % str(recipient))
            logger.error(
                'error :: alert_smtp - could not send email to primary_recipient :: %s, cc_recipients :: %s'
                % (str(primary_recipient), str(cc_recipients)))

        s.quit()

    if LOCAL_DEBUG:
        logger.info('debug :: alert_smtp - Memory usage after email: %s (kb)' %
                    resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if redis_image_data:
        # buf.seek(0)
        # buf.write('none')
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage before del redis_image_data objects: %s (kb)'
                % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        del raw_series
        del unpacker
        del timeseries[:]
        del timeseries_x[:]
        del timeseries_y[:]
        del values
        del datetimes[:]
        del msg_plot_attachment
        del redis_image_data
        # We del all variables that are floats as they become unique objects and
        # can result in what appears to be a memory leak, but is not; it is
        # just the way Python handles floats
        del mean
        del array_amin
        del array_amax
        del stdDev
        del sigma3
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage after del redis_image_data objects: %s (kb)'
                % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage before del fig object: %s (kb)'
                % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        # @added 20160814 - Bug #1558: Memory leak in Analyzer
        #                   Issue #21 Memory leak in Analyzer - https://github.com/earthgecko/skyline/issues/21
        # As per http://www.mail-archive.com/[email protected]/msg13222.html
        fig.clf()
        plt.close(fig)
        del fig
        if LOCAL_DEBUG:
            logger.info(
                'debug :: alert_smtp - Memory usage after del fig object: %s (kb)'
                % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage before del other objects: %s (kb)'
            % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    del recipients[:]
    del body
    del msg
    del image_data
    del msg_attachment
    if LOCAL_DEBUG:
        logger.info(
            'debug :: alert_smtp - Memory usage after del other objects: %s (kb)'
            % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    return
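
A minimal sketch of the single-email to/cc pattern described in the comments of the example above, assuming the first configured recipient becomes the To address and the remaining recipients are Cc'd; send_alert_email and its parameters are illustrative, not the project's API.

from smtplib import SMTP
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

def send_alert_email(sender, recipients, subject, html_body, smtp_host='127.0.0.1'):
    # The first recipient becomes the To address, the rest are Cc, so only
    # one email is sent rather than one per recipient
    primary_recipient = recipients[0]
    cc_recipients = recipients[1:]
    msg = MIMEMultipart('mixed')
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = primary_recipient
    if cc_recipients:
        msg['Cc'] = ','.join(cc_recipients)
    msg.attach(MIMEText(html_body, 'html'))
    s = SMTP(smtp_host)
    try:
        # sendmail needs every envelope recipient, To and Cc alike
        s.sendmail(sender, [primary_recipient] + cc_recipients, msg.as_string())
    finally:
        s.quit()
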
Example #7
0
def m66(current_skyline_app, parent_pid, timeseries, algorithm_parameters):
    """
    A time series' data points are anomalous if the 6th median is 6 standard
    deviations (six-sigma) from the time series' 6th median standard deviation
    and this persists for x_windows, where `x_windows = int(window / 2)`.
    This algorithm finds SIGNIFICANT changepoints in a time series, similar to
    PELT and Bayesian Online Changepoint Detection, however it is more robust to
    instantaneous outliers and more conditionally selective of changepoints.

    :param current_skyline_app: the Skyline app executing the algorithm.  This
        will be passed to the algorithm by Skyline.  This is **required** for
        error handling and logging.  You do not have to worry about handling the
        argument in the scope of the custom algorithm itself,  but the algorithm
        must accept it as the first argument.
    :param parent_pid: the parent pid which is executing the algorithm, this is
        **required** for error handling and logging.  You do not have to worry
        about handling this argument in the scope of algorithm, but the
        algorithm must accept it as the second argument.
    :param timeseries: the time series as a list e.g. ``[[1578916800.0, 29.0],
        [1578920400.0, 55.0], ... [1580353200.0, 55.0]]``
    :param algorithm_parameters: a dictionary of any required parameters for the
        custom_algorithm and algorithm itself, for example:
        ``algorithm_parameters={
            'nth_median': 6,
            'sigma': 6,
            'window': 5,
            'return_anomalies': True,
        }``
    :type current_skyline_app: str
    :type parent_pid: int
    :type timeseries: list
    :type algorithm_parameters: dict
    :return: True, False or None
    :rtype: boolean

    Example CUSTOM_ALGORITHMS configuration:

    'm66': {
        'namespaces': [
            'skyline.analyzer.run_time', 'skyline.analyzer.total_metrics',
            'skyline.analyzer.exceptions'
        ],
        'algorithm_source': '/opt/skyline/github/skyline/skyline/custom_algorithms/m66.py',
        'algorithm_parameters': {
            'nth_median': 6, 'sigma': 6, 'window': 5, 'resolution': 60,
            'minimum_sparsity': 0, 'determine_duration': False,
            'return_anomalies': True, 'save_plots_to': False,
            'save_plots_to_absolute_dir': False, 'filename_prefix': False
        },
        'max_execution_time': 1.0,
        'consensus': 1,
        'algorithms_allowed_in_consensus': ['m66'],
        'run_3sigma_algorithms': False,
        'run_before_3sigma': False,
        'run_only_if_consensus': False,
        'use_with': ['crucible', 'luminosity'],
        'debug_logging': False,
    },

    """

    # You MUST define the algorithm_name
    algorithm_name = 'm66'

    # Define the default state as None and None; anomalous does not default to
    # False as that is not correct, False is only correct if the algorithm
    # determines the data point is not anomalous.  The same is true for the
    # anomalyScore.
    anomalous = None
    anomalyScore = None

    return_anomalies = False
    anomalies = []
    anomalies_dict = {}
    anomalies_dict['algorithm'] = algorithm_name

    realtime_analysis = False

    current_logger = None
    dev_null = None

    # If you want to log, you can, but this should only be done during
    # testing and development
    def get_log(current_skyline_app):
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        return current_logger

    start = timer()

    # Use the algorithm_parameters to determine the sample_period
    debug_logging = None
    try:
        debug_logging = algorithm_parameters['debug_logging']
    except:
        debug_logging = False
    if debug_logging:
        try:
            current_logger = get_log(current_skyline_app)
            current_logger.debug(
                'debug :: %s :: debug_logging enabled with algorithm_parameters - %s'
                % (algorithm_name, str(algorithm_parameters)))
        except Exception as e:
            # This except pattern MUST be used in ALL custom algorithms to
            # facilitate the traceback from any errors.  We want the algorithm
            # to run super fast and without spamming the log with lots of
            # errors.  But we do not want the function returning and not
            # reporting anything to the log, so the pythonic except is used to
            # "sample" any algorithm errors to a tmp file and report once per
            # run rather than spewing tons of errors into the log e.g.
            # analyzer.log
            dev_null = e
            record_algorithm_error(current_skyline_app, parent_pid,
                                   algorithm_name, traceback.format_exc())
            # Return None and None as the algorithm could not determine True or False
            del dev_null
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)

    # Allow the m66 parameters to be passed in the algorithm_parameters
    window = 6
    try:
        window = algorithm_parameters['window']
    except KeyError:
        window = 6
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    nth_median = 6
    try:
        nth_median = algorithm_parameters['nth_median']
    except KeyError:
        nth_median = 6
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    n_sigma = 6
    try:
        n_sigma = algorithm_parameters['sigma']
    except KeyError:
        n_sigma = 6
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    resolution = 0
    try:
        resolution = algorithm_parameters['resolution']
    except KeyError:
        resolution = 0
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    determine_duration = False
    try:
        determine_duration = algorithm_parameters['determine_duration']
    except KeyError:
        determine_duration = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    minimum_sparsity = 0
    try:
        minimum_sparsity = algorithm_parameters['minimum_sparsity']
    except KeyError:
        minimum_sparsity = 0
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    shift_to_start_of_window = True
    try:
        shift_to_start_of_window = algorithm_parameters[
            'shift_to_start_of_window']
    except KeyError:
        shift_to_start_of_window = True
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to = False
    try:
        save_plots_to = algorithm_parameters['save_plots_to']
    except KeyError:
        save_plots_to = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to_absolute_dir = False
    try:
        save_plots_to_absolute_dir = algorithm_parameters[
            'save_plots_to_absolute_dir']
    except KeyError:
        save_plots_to_absolute_dir = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e
    filename_prefix = False
    try:
        filename_prefix = algorithm_parameters['filename_prefix']
    except KeyError:
        filename_prefix = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' %
                             (str(algorithm_parameters)))

    return_anomalies = False
    try:
        return_anomalies = algorithm_parameters['return_anomalies']
    except KeyError:
        return_anomalies = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    try:
        realtime_analysis = algorithm_parameters['realtime_analysis']
    except KeyError:
        realtime_analysis = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to = False
    try:
        save_plots_to = algorithm_parameters['save_plots_to']
    except KeyError:
        save_plots_to = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to_absolute_dir = False
    try:
        save_plots_to_absolute_dir = algorithm_parameters[
            'save_plots_to_absolute_dir']
    except KeyError:
        save_plots_to_absolute_dir = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e
    filename_prefix = False
    try:
        filename_prefix = algorithm_parameters['filename_prefix']
    except KeyError:
        filename_prefix = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    try:
        base_name = algorithm_parameters['base_name']
    except Exception as e:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors.
        # But we do not want the function returning and not reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        dev_null = e
        del dev_null
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (False, None, anomalies)
        return (False, None)
    if debug_logging:
        current_logger.debug('debug :: %s :: base_name - %s' %
                             (algorithm_name, str(base_name)))

    anomalies_dict['metric'] = base_name
    anomalies_dict['anomalies'] = {}

    use_bottleneck = True
    if save_plots_to:
        use_bottleneck = False
    if use_bottleneck:
        import bottleneck as bn

    # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except
    try:
        start_preprocessing = timer()

        # INFO: Sorting time series of 10079 data points took 0.002215 seconds
        timeseries = sorted(timeseries, key=lambda x: x[0])
        if debug_logging:
            current_logger.debug('debug :: %s :: time series of length - %s' %
                                 (algorithm_name, str(len(timeseries))))

        # Testing the data to ensure it meets minimum requirements, in the case
        # of Skyline's use of the m66 algorithm this means that:
        # - the time series must have at least 75% of its full_duration
        do_not_use_sparse_data = False
        if current_skyline_app == 'luminosity':
            do_not_use_sparse_data = True

        if minimum_sparsity == 0:
            do_not_use_sparse_data = False

        total_period = 0
        total_datapoints = 0

        calculate_variables = False
        if do_not_use_sparse_data:
            calculate_variables = True
        if determine_duration:
            calculate_variables = True

        if calculate_variables:
            try:
                start_timestamp = int(timeseries[0][0])
                end_timestamp = int(timeseries[-1][0])
                total_period = end_timestamp - start_timestamp
                total_datapoints = len(timeseries)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug(
                        'debug_logging :: %s :: SystemExit called, exiting - %s'
                        % (algorithm_name, e))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid,
                                       algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error(
                        'error :: debug_logging :: %s :: failed to determine total_period and total_datapoints'
                        % (algorithm_name))
                timeseries = []
            if not timeseries:
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)

            if current_skyline_app == 'analyzer':
                # Default the required period for analyzer to 75% of
                # FULL_DURATION (18 hours at the default 24 hour FULL_DURATION)
                period_required = int(FULL_DURATION * 0.75)
            else:
                # Determine from timeseries
                if total_period < FULL_DURATION:
                    period_required = int(FULL_DURATION * 0.75)
                else:
                    period_required = int(total_period * 0.75)

            if determine_duration:
                period_required = int(total_period * 0.75)

        if do_not_use_sparse_data:
            # If the time series does not have 75% of its full_duration it does
            # not have sufficient data to sample
            try:
                if total_period < period_required:
                    if debug_logging:
                        current_logger.debug(
                            'debug :: %s :: time series does not have sufficient data'
                            % (algorithm_name))
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug(
                        'debug_logging :: %s :: SystemExit called, exiting - %s'
                        % (algorithm_name, e))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid,
                                       algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error(
                        'error :: debug_logging :: %s :: failed to determine if time series has sufficient data'
                        % (algorithm_name))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)

            # If the time series does not have 75% of its full_duration
            # datapoints it does not have sufficient data to sample

            # Determine resolution from the last 30 data points
            # INFO took 0.002060 seconds
            if not resolution:
                resolution_timestamps = []
                metric_resolution = False
                for metric_datapoint in timeseries[-30:]:
                    timestamp = int(metric_datapoint[0])
                    resolution_timestamps.append(timestamp)
                timestamp_resolutions = []
                if resolution_timestamps:
                    last_timestamp = None
                    for timestamp in resolution_timestamps:
                        if last_timestamp:
                            resolution = timestamp - last_timestamp
                            timestamp_resolutions.append(resolution)
                            last_timestamp = timestamp
                        else:
                            last_timestamp = timestamp
                    try:
                        del resolution_timestamps
                    except:
                        pass
                if timestamp_resolutions:
                    try:
                        timestamp_resolutions_count = Counter(
                            timestamp_resolutions)
                        ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common(
                        )
                        metric_resolution = int(
                            ordered_timestamp_resolutions_count[0][0])
                    except SystemExit as e:
                        if debug_logging:
                            current_logger.debug(
                                'debug_logging :: %s :: SystemExit called, exiting - %s'
                                % (algorithm_name, e))
                        if current_skyline_app == 'webapp':
                            return (anomalous, anomalyScore, anomalies,
                                    anomalies_dict)
                        if return_anomalies:
                            return (anomalous, anomalyScore, anomalies)
                        return (anomalous, anomalyScore)
                    except:
                        traceback_msg = traceback.format_exc()
                        record_algorithm_error(current_skyline_app, parent_pid,
                                               algorithm_name, traceback_msg)
                        if debug_logging:
                            current_logger.error(traceback_msg)
                            current_logger.error(
                                'error :: debug_logging :: %s :: failed to determine the metric resolution'
                                % (algorithm_name))
                    try:
                        del timestamp_resolutions
                    except:
                        pass
            else:
                metric_resolution = resolution

            minimum_datapoints = None
            if metric_resolution:
                minimum_datapoints = int(period_required / metric_resolution)
            if minimum_datapoints:
                if total_datapoints < minimum_datapoints:
                    if debug_logging:
                        current_logger.debug(
                            'debug :: %s :: time series does not have sufficient data, minimum_datapoints required is %s and time series has %s'
                            % (algorithm_name, str(minimum_datapoints),
                               str(total_datapoints)))
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)

            # Is the time series fully populated?
            # full_duration_datapoints = int(full_duration / metric_resolution)
            total_period_datapoints = int(total_period / metric_resolution)
            # minimum_percentage_sparsity = 95
            minimum_percentage_sparsity = 90
            sparsity = int(total_datapoints / (total_period_datapoints / 100))
            if sparsity < minimum_percentage_sparsity:
                if debug_logging:
                    current_logger.debug(
                        'debug :: %s :: time series does not have sufficient data, minimum_percentage_sparsity required is %s and time series has %s'
                        % (algorithm_name, str(minimum_percentage_sparsity),
                           str(sparsity)))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)
            if len(set(item[1] for item in timeseries)) == 1:
                if debug_logging:
                    current_logger.debug(
                        'debug :: %s :: time series does not have sufficient variability, all the values are the same'
                        % algorithm_name)
                anomalous = False
                anomalyScore = 0.0
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)

        end_preprocessing = timer()
        preprocessing_runtime = end_preprocessing - start_preprocessing
        if debug_logging:
            current_logger.debug(
                'debug :: %s :: preprocessing took %.6f seconds' %
                (algorithm_name, preprocessing_runtime))

        if not timeseries:
            if debug_logging:
                current_logger.debug('debug :: %s :: m66 not run as no data' %
                                     (algorithm_name))
            anomalies = []
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)
        if debug_logging:
            current_logger.debug('debug :: %s :: timeseries length: %s' %
                                 (algorithm_name, str(len(timeseries))))

        anomalies_dict['timestamp'] = int(timeseries[-1][0])
        anomalies_dict['from_timestamp'] = int(timeseries[0][0])

        start_analysis = timer()
        try:
            # bottleneck is used because it is much faster
            # pd dataframe method (1445 data point - 24hrs): took 0.077915 seconds
            # bottleneck method (1445 data point - 24hrs): took 0.005692 seconds
            # numpy and pandas rolling
            # 2021-07-30 12:37:31 :: 2827897 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 136.93 seconds
            # 2021-07-30 12:44:53 :: 2855884 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 148.82 seconds
            # 2021-07-30 12:48:41 :: 2870822 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 145.62 seconds
            # 2021-07-30 12:55:00 :: 2893634 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 139.00 seconds
            # 2021-07-30 12:59:31 :: 2910443 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 144.80 seconds
            # 2021-07-30 13:02:31 :: 2922928 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 143.35 seconds
            # 2021-07-30 14:12:56 :: 3132457 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 129.25 seconds
            # 2021-07-30 14:22:35 :: 3164370 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 125.72 seconds
            # 2021-07-30 14:28:24 :: 3179687 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 222.43 seconds
            # 2021-07-30 14:33:45 :: 3179687 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 244.00 seconds
            # 2021-07-30 14:36:27 :: 3214047 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 141.10 seconds
            # numpy and bottleneck
            # 2021-07-30 16:41:52 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 73.92 seconds
            # 2021-07-30 16:46:46 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 68.84 seconds
            # 2021-07-30 16:51:48 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 70.55 seconds
            # numpy and bottleneck (passing resolution and not calculating in m66)
            # 2021-07-30 16:57:46 :: 3643253 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 65.59 seconds

            if use_bottleneck:
                if len(timeseries) < 10:
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)

                x_np = np.asarray([x[1] for x in timeseries])
                # Fast Min-Max scaling
                data = (x_np - x_np.min()) / (x_np.max() - x_np.min())

                # m66 - calculate to nth_median
                median_count = 0
                while median_count < nth_median:
                    median_count += 1
                    rolling_median_s = bn.move_median(data, window=window)
                    median = rolling_median_s.tolist()
                    data = median
                    if median_count == nth_median:
                        break

                # m66 - calculate the moving standard deviation for the
                # nth_median array
                rolling_std_s = bn.move_std(data, window=window)
                std_nth_median_array = np.nan_to_num(rolling_std_s,
                                                     copy=False,
                                                     nan=0.0,
                                                     posinf=None,
                                                     neginf=None)
                std_nth_median = std_nth_median_array.tolist()
                if debug_logging:
                    current_logger.debug(
                        'debug :: %s :: std_nth_median calculated with bn' %
                        (algorithm_name))
            else:
                df = pd.DataFrame(timeseries, columns=['date', 'value'])
                df['date'] = pd.to_datetime(df['date'], unit='s')
                datetime_index = pd.DatetimeIndex(df['date'].values)
                df = df.set_index(datetime_index)
                df.drop('date', axis=1, inplace=True)
                original_df = df.copy()
                # MinMax scale
                df = (df - df.min()) / (df.max() - df.min())
                # window = 6
                data = df['value'].tolist()

                if len(data) < 10:
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)

                # m66 - calculate to nth_median
                median_count = 0
                while median_count < nth_median:
                    median_count += 1
                    s = pd.Series(data)
                    rolling_median_s = s.rolling(window).median()
                    median = rolling_median_s.tolist()
                    data = median
                    if median_count == nth_median:
                        break

                # m66 - calculate the moving standard deviation for the
                # nth_median array
                s = pd.Series(data)
                rolling_std_s = s.rolling(window).std()

                nth_median_column = 'std_nth_median_%s' % str(nth_median)
                df[nth_median_column] = rolling_std_s.tolist()
                std_nth_median = df[nth_median_column].fillna(0).tolist()

            # m66 - calculate the standard deviation for the entire nth_median
            # array
            metric_stddev = np.std(std_nth_median)
            std_nth_median_n_sigma = []
            anomalies_found = False

            for value in std_nth_median:
                # m66 - if the value in the nth_median array is > n_sigma of
                # the metric_stddev the datapoint is anomalous
                if value > (metric_stddev * n_sigma):
                    std_nth_median_n_sigma.append(1)
                    anomalies_found = True
                else:
                    std_nth_median_n_sigma.append(0)
            std_nth_median_n_sigma_column = 'std_median_%s_%s_sigma' % (
                str(nth_median), str(n_sigma))
            if not use_bottleneck:
                df[std_nth_median_n_sigma_column] = std_nth_median_n_sigma

            anomalies = []
            # m66 - only label anomalous if the n_sigma triggers are persisted
            # for (window / 2)
            if anomalies_found:
                current_triggers = []
                for index, item in enumerate(timeseries):
                    if std_nth_median_n_sigma[index] == 1:
                        current_triggers.append(index)
                    else:
                        if len(current_triggers) > int(window / 2):
                            for trigger_index in current_triggers:
                                # Shift the anomaly back to the beginning of the
                                # window
                                if shift_to_start_of_window:
                                    anomalies.append(
                                        timeseries[(trigger_index -
                                                    (window * int(
                                                        (nth_median / 2))))])
                                else:
                                    anomalies.append(timeseries[trigger_index])
                        current_triggers = []
                # Process any remaining current_triggers
                if len(current_triggers) > int(window / 2):
                    for trigger_index in current_triggers:
                        # Shift the anomaly back to the beginning of the
                        # window
                        if shift_to_start_of_window:
                            anomalies.append(
                                timeseries[(trigger_index - (window * int(
                                    (nth_median / 2))))])
                        else:
                            anomalies.append(timeseries[trigger_index])
            if not anomalies:
                anomalous = False

            if anomalies:
                anomalous = True
                anomalies_data = []
                anomaly_timestamps = [int(item[0]) for item in anomalies]
                for item in timeseries:
                    if int(item[0]) in anomaly_timestamps:
                        anomalies_data.append(1)
                    else:
                        anomalies_data.append(0)
                if not use_bottleneck:
                    df['anomalies'] = anomalies_data
                anomalies_list = []
                for ts, value in timeseries:
                    if int(ts) in anomaly_timestamps:
                        anomalies_list.append([int(ts), value])
                        anomalies_dict['anomalies'][int(ts)] = value

            if anomalies and save_plots_to:
                try:
                    from adtk.visualization import plot
                    metric_dir = base_name.replace('.', '/')
                    timestamp_dir = str(int(timeseries[-1][0]))
                    save_path = '%s/%s/%s/%s' % (save_plots_to, algorithm_name,
                                                 metric_dir, timestamp_dir)
                    if save_plots_to_absolute_dir:
                        save_path = '%s' % save_plots_to
                    anomalies_dict['file_path'] = save_path
                    save_to_file = '%s/%s.%s.png' % (save_path, algorithm_name,
                                                     base_name)
                    if filename_prefix:
                        save_to_file = '%s/%s.%s.%s.png' % (
                            save_path, filename_prefix, algorithm_name,
                            base_name)
                    save_to_path = os_path_dirname(save_to_file)
                    title = '%s\n%s - median %s %s-sigma persisted (window=%s)' % (
                        base_name, algorithm_name, str(nth_median),
                        str(n_sigma), str(window))

                    if not os_path_exists(save_to_path):
                        try:
                            mkdir_p(save_to_path)
                        except Exception as e:
                            current_logger.error(
                                'error :: %s :: failed to create dir - %s - %s'
                                % (algorithm_name, save_to_path, e))
                    if os_path_exists(save_to_path):
                        try:
                            plot(original_df['value'],
                                 anomaly=df['anomalies'],
                                 anomaly_color='red',
                                 title=title,
                                 save_to_file=save_to_file)
                            if debug_logging:
                                current_logger.debug(
                                    'debug :: %s :: plot saved to - %s' %
                                    (algorithm_name, save_to_file))
                            anomalies_dict['image'] = save_to_file
                        except Exception as e:
                            current_logger.error(
                                'error :: %s :: failed to plot - %s - %s' %
                                (algorithm_name, base_name, e))
                    anomalies_file = '%s/%s.%s.anomalies_list.txt' % (
                        save_path, algorithm_name, base_name)
                    with open(anomalies_file, 'w') as fh:
                        fh.write(str(anomalies_list))
                        # os.chmod(anomalies_file, mode=0o644)
                    data_file = '%s/data.txt' % (save_path)
                    with open(data_file, 'w') as fh:
                        fh.write(str(anomalies_dict))
                except SystemExit as e:
                    if debug_logging:
                        current_logger.debug(
                            'debug_logging :: %s :: SystemExit called during save plot, exiting - %s'
                            % (algorithm_name, e))
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)
                except Exception as e:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid,
                                           algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error(
                            'error :: %s :: failed to plot or save anomalies file - %s - %s'
                            % (algorithm_name, base_name, e))

            try:
                del df
            except:
                pass
        except SystemExit as e:
            if debug_logging:
                current_logger.debug(
                    'debug_logging :: %s :: SystemExit called, during analysis, exiting - %s'
                    % (algorithm_name, e))
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)
        except:
            traceback_msg = traceback.format_exc()
            record_algorithm_error(current_skyline_app, parent_pid,
                                   algorithm_name, traceback_msg)
            if debug_logging:
                current_logger.error(traceback_msg)
                current_logger.error(
                    'error :: debug_logging :: %s :: failed to run on ts' %
                    (algorithm_name))
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)

        end_analysis = timer()
        analysis_runtime = end_analysis - start_analysis

        if debug_logging:
            current_logger.debug(
                'debug :: analysis with %s took %.6f seconds' %
                (algorithm_name, analysis_runtime))

        if anomalous:
            anomalyScore = 1.0
        else:
            anomalyScore = 0.0

        if debug_logging:
            current_logger.info(
                '%s :: anomalous - %s, anomalyScore - %s' %
                (algorithm_name, str(anomalous), str(anomalyScore)))

        if debug_logging:
            end = timer()
            processing_runtime = end - start
            current_logger.info('%s :: completed in %.6f seconds' %
                                (algorithm_name, processing_runtime))
        try:
            del timeseries
        except:
            pass
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        return (anomalous, anomalyScore)
    except SystemExit as e:
        if debug_logging:
            current_logger.debug(
                'debug_logging :: %s :: SystemExit called (before StopIteration), exiting - %s'
                % (algorithm_name, e))
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        return (anomalous, anomalyScore)
    except StopIteration:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors.
        # But we do not want the function returning and not reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (False, None, anomalies)
        return (False, None)
    except:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (False, None, anomalies)
        return (False, None)

    if current_skyline_app == 'webapp':
        return (anomalous, anomalyScore, anomalies, anomalies_dict)
    if return_anomalies:
        return (anomalous, anomalyScore, anomalies)
    return (anomalous, anomalyScore)
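
To make the core m66 calculation easier to follow outside of the Skyline
scaffolding, here is a minimal sketch, assuming only numpy and pandas are
available.  The function name, arguments and return value are illustrative and
are not part of the Skyline API; it also omits the preprocessing,
persistence-window shifting and error handling of the full implementation
above.

import numpy as np
import pandas as pd


def m66_sketch(values, nth_median=6, n_sigma=6, window=5):
    # Min-Max scale the raw values
    s = pd.Series(values, dtype=float)
    s = (s - s.min()) / (s.max() - s.min())
    # Apply the rolling median nth_median times to heavily smooth the series
    for _ in range(nth_median):
        s = s.rolling(window).median()
    # Rolling standard deviation of the nth_median series
    std_nth_median = s.rolling(window).std().fillna(0)
    # Flag any point greater than n_sigma times the overall stddev of that series
    threshold = np.std(std_nth_median) * n_sigma
    return [1 if v > threshold else 0 for v in std_nth_median]
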
Example #8
0
File: echo.py Project: vpiduri/skyline
def ionosphere_echo(base_name, mirage_full_duration):

    logger = logging.getLogger(skyline_app_logger)
    child_process_pid = os.getpid()
    context = 'ionosphere_echo'
    logger.info('ionosphere_echo :: started with child_process_pid - %s for %s' % (str(child_process_pid), base_name))
    full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60)

    try:
        # Allow for 3 seconds
        ionosphere_echo_max_fp_create_time = (settings.IONOSPHERE_ECHO_MAX_FP_CREATE_TIME - 3)
    except:
        ionosphere_echo_max_fp_create_time = 52
    echo_started_at = int(time())

    def echo_get_an_engine():

        try:
            engine, fail_msg, trace = get_engine(skyline_app)
            return engine, fail_msg, trace
        except:
            trace = traceback.format_exc()
            logger.error('%s' % trace)
            fail_msg = 'error :: ionosphere_echo :: get_an_engine :: failed to get MySQL engine'
            logger.error('%s' % fail_msg)
            return None, fail_msg, trace

    def echo_engine_disposal(engine):
        try:
            if engine:
                try:
                    engine.dispose()
                    logger.info('ionosphere_echo :: MySQL engine disposed of')
                    return True
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: ionosphere_echo :: calling engine.dispose()')
            else:
                logger.info('ionosphere_echo :: no MySQL engine to dispose of')
                return True
        except:
            return False
        return False

    # Determine the metric details from the database
    metrics_id = None
    metric_db_object = None
    engine = None

    # Get the metric db object data from memcache if it exists
    metric_db_object = get_memcache_metric_object(skyline_app, base_name)
    if metric_db_object:
        metrics_id = metric_db_object['id']
        logger.info('ionosphere_echo :: determined metric id %s from memcache for %s' % (str(metrics_id), base_name))
    else:
        # Only if no memcache data
        # Get a MySQL engine
        try:
            engine, log_msg, trace = echo_get_an_engine()
            logger.info('ionosphere_echo :: %s' % log_msg)
            logger.info('ionosphere_echo :: determining metric id from DB as not found in memcache for %s' % (base_name))
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: could not get a MySQL engine to get metric_db_object')

        if not engine:
            logger.error('error :: ionosphere_echo :: engine not obtained to get metric_db_object')
            logger.info('ionosphere_echo :: exiting this work but not removing work item, as database may be available again before the work expires')
            return

        try:
            metrics_id, metric_db_object = echo_get_metric_from_metrics(base_name, engine)
            echo_engine_disposal(engine)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: failed to get the metric details from the database')
            logger.info('ionosphere_echo :: exiting this work but not removing work item, as database may be available again before the work expires')

        if not metrics_id:
            logger.error('error :: ionosphere_echo :: failed to get the metrics_id from the database')
            logger.info('ionosphere_echo :: exiting this work but not removing work item, as database may be available again before the work expires')
            echo_engine_disposal(engine)
            return

        if not metric_db_object:
            logger.error('error :: ionosphere_echo :: failed to get the metric_db_object from the database')
            logger.info('ionosphere_echo :: exiting this work but not removing work item, as database may be available again before the work expires')
            echo_engine_disposal(engine)
            return

    # Determine the metric fp ids from the database
    if not engine:
        logger.info('ionosphere_echo :: getting MySQL engine to determine fp ids for metric id %s - %s' % (str(metrics_id), base_name))
        try:
            engine, log_msg, trace = echo_get_an_engine()
            logger.info(log_msg)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: could not get a MySQL engine to get fp_ids')

    if not engine:
        logger.error('error :: ionosphere_echo :: engine not obtained to get fp_ids')
        return

    try:
        ionosphere_table, log_msg, trace = ionosphere_table_meta(skyline_app, engine)
        logger.info(log_msg)
        logger.info('ionosphere_echo :: ionosphere_table OK')
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: ionosphere_echo :: failed to get ionosphere_table meta for %s' % base_name)

    # Determine the fp_ids that exist for the metric
    echo_fp_ids_result = []
    try:
        connection = engine.connect()
        stmt = select([ionosphere_table]).where(ionosphere_table.c.metric_id == metrics_id).order_by(desc(ionosphere_table.c.id))
        echo_fp_ids = connection.execute(stmt)
        echo_fp_ids_result = [{column: value for column, value in rowproxy.items()} for rowproxy in echo_fp_ids]
        connection.close()
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: ionosphere_echo :: could not determine fp ids from DB for %s' % base_name)
        return

    if engine:
        echo_engine_disposal(engine)

    if not echo_fp_ids_result:
        logger.error('error :: ionosphere_echo :: no echo_fp_ids_result - could not determine fp ids from DB for %s' % base_name)
    else:
        logger.info('ionosphere_echo :: echo_fp_ids_result - determined fp ids from DB for %s' % base_name)

    try:
        db_fp_count = len(echo_fp_ids_result)
        logger.info('ionosphere_echo :: %s features profile ids found' % str(db_fp_count))
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: ionosphere_echo :: could not calculate len of echo_fp_ids_result')

    echo_enabled_mirage_fp_ids = []
    for row in echo_fp_ids_result:
        if row['enabled'] != 1:
            continue
        if row['deleted'] == 1:
            continue
        # Only create features profiles at settings.FULL_DURATION for Mirage
        # features profiles if the fp has been validated
        if row['validated'] == 0:
            continue

        if row['full_duration'] == int(mirage_full_duration):
            fp_id = row['id']
            # @added 20190413 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            #                   Feature #1960: ionosphere_layers
            # Any features profiles that were created before the introduction of
            # ionosphere_layers will not have a mirage.redis.24h.json file as the
            # creation of these resources in the training_data dir was only added at
            # that point, so they are excluded.  Less than 20170307 1488844800 excl.
            # @modified 20190508 - Bug #2934: Ionosphere - no mirage.redis.24h.json file
            #                      Feature #1960: ionosphere_layers
            # Increased the date as some on the cusp were still erroring, move
            # forward 2 days to 20170309
            # if int(row['anomaly_timestamp']) < 1488844800:
            if int(row['anomaly_timestamp']) < 1489017600:
                logger.info('ionosphere_echo :: skipping fp id %s as predates having a mirage.redis json file' % str(fp_id))
                continue
            echo_enabled_mirage_fp_ids.append(fp_id)

    echo_enabled_mirage_fp_ids_count = len(echo_enabled_mirage_fp_ids)
    logger.info('ionosphere_echo :: %s Mirage features profile ids found' % str(echo_enabled_mirage_fp_ids_count))

    # Check which Mirage features profiles do not have a
    # settings.FULL_DURATION features profile
    mirage_fd_fp_count = 0
    echo_create_fd_fp_for = []
    for validated_mirage_fp_id in echo_enabled_mirage_fp_ids:
        mirage_fd_fp_exists = False
        for row in echo_fp_ids_result:
            if int(row['parent_id']) != int(validated_mirage_fp_id):
                continue
            if int(row['full_duration']) == int(settings.FULL_DURATION):
                mirage_fd_fp_exists = True
                mirage_fd_fp_count += 1
        if not mirage_fd_fp_exists:
            echo_create_fd_fp_for.append(int(validated_mirage_fp_id))

    del echo_enabled_mirage_fp_ids

    logger.info('ionosphere_echo :: there are %s FULL_DURATION features profiles for %s' % (str(mirage_fd_fp_count), base_name))

    echo_create_fd_fp_for_count = len(echo_create_fd_fp_for)
    logger.info('ionosphere_echo :: %s FULL_DURATION features profiles to be created for %s' % (str(echo_create_fd_fp_for_count), base_name))
    echo_created_fp_count = 0

    # @added 20190404 - Bug #2904: Initial Ionosphere echo load and Ionosphere feedback
    #                   Feature #2484: FULL_DURATION feature profiles
    # Rate limit the creation of ionosphere_echo FULL_DURATION features profiles
    # this only effects managing the creation of lots of features profiles if
    # Ionosphere echo is enabled on a Skyline instance with lots of existing
    # features profiles for Mirage metrics.  Only create 5 FULL_DURATION
    # features profiles from the latest Mirage based features profiles, which
    # takes around 10 seconds
    if echo_create_fd_fp_for_count > 5:
        logger.info('ionosphere_echo :: rate limiting and only creating 5 FULL_DURATION features profiles from the latest Mirage features profiles for %s' % (base_name))
        # Reverse, newest first, using slicing to produce a reversed copy
        reverse_echo_create_fd_fp_for = echo_create_fd_fp_for[::-1]
        rate_limit_echo_create_fd_fp_for = reverse_echo_create_fd_fp_for[0:5]
        echo_create_fd_fp_for = rate_limit_echo_create_fd_fp_for

    last_created_fp = int(time())
    for mirage_fp_id in echo_create_fd_fp_for:
        fp_timestamp = None
        for row in echo_fp_ids_result:
            if int(row['id']) != int(mirage_fp_id):
                continue
            else:
                fp_timestamp = int(row['anomaly_timestamp'])
                fp_generation = int(row['generation'])
        if not fp_timestamp:
            continue
        if not fp_generation:
            fp_generation = 0

        time_check_now = int(time())
        echo_runtime = time_check_now - echo_started_at
        if echo_runtime >= ionosphere_echo_max_fp_create_time:
            logger.info('ionosphere_echo :: ionosphere_echo running for %s seconds, exiting before IONOSPHERE_ECHO_MAX_FP_CREATE_TIME of %s seconds is breached' % (str(echo_runtime), str(ionosphere_echo_max_fp_create_time)))
            break

        logger.info('ionosphere_echo :: creating FULL_DURATION features profile based on data from fp id %s - %s' % (str(mirage_fp_id), base_name))

        context = 'ionosphere_echo'
        ionosphere_job = 'learn_fp_human'
        generation = fp_generation + 1
        fp_learn = False

        # What is the path of the features profile files
        metric_timeseries_dir = base_name.replace('.', '/')
        metric_fp_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir,
            str(fp_timestamp))

        # What is the path of the new training data dir to copy the files to
        # and create it
        created_ts = int(time())
        # Ensure features profile creation timestamps do not overlap
        if last_created_fp == created_ts:
            sleep(1)
            created_ts = int(time())
            last_created_fp = created_ts

        metric_training_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_DATA_FOLDER, str(created_ts), metric_timeseries_dir)
        if not os.path.exists(metric_training_data_dir):
            try:
                mkdir_p(metric_training_data_dir)
                logger.info('ionosphere_echo :: training data dir created - %s' % metric_training_data_dir)
            except:
                logger.error('error :: ionosphere_echo :: failed to create training data dir - %s' % metric_training_data_dir)
                continue

        if not os.path.isdir(metric_fp_data_dir):
            logger.error('error :: ionosphere_echo :: features profile data dir does not exist - %s' % metric_fp_data_dir)
            continue

        data_files = []
        try:
            glob_path = '%s/*.*' % metric_fp_data_dir
            data_files = glob.glob(glob_path)
        except:
            trace = traceback.format_exc()
            logger.error('%s' % trace)
            logger.error('error :: ionosphere_echo :: glob could not read - %s' % metric_fp_data_dir)

        # Make a list of the files to copy
        copy_files = []
        for i_file in data_files:
            # Exclude all the file resources related to the Mirage
            # full_duration features profile
            if 'matched.fp_id' in i_file:
                continue
            if 'fp.created.txt' in i_file:
                continue
            if 'fp.details.txt' in i_file:
                continue
            if 'csv.features.transposed.csv' in i_file:
                continue
            # Specifically include the required files
            if 'graphite_now' in i_file:
                copy_files.append(i_file)
            echo_metric_txt = '%s.txt' % base_name
            if echo_metric_txt in i_file:
                copy_files.append(i_file)
            echo_metric_json = '%s.json' % base_name
            if echo_metric_json in i_file:
                copy_files.append(i_file)
            if 'mirage.graphite' in i_file:
                copy_files.append(i_file)
            if 'mirage.redis' in i_file:
                copy_files.append(i_file)

        # Copy the required files to the new training_data dir
        for i_file in copy_files:
            copying_filename = os.path.basename(i_file)
            dest_file = '%s/%s' % (metric_training_data_dir, copying_filename)
            if not os.path.isfile(dest_file):
                try:
                    shutil.copy(i_file, metric_training_data_dir)
                    # logger.info('ionosphere_echo ::  training data copied - %s' % (i_file))
                except shutil.Error as e:
                    trace = traceback.format_exc()
                    logger.error('%s' % trace)
                    logger.error('error :: ionosphere_echo ::  shutil error - training data not copied to %s' % metric_training_data_dir)
                    logger.error('error :: ionosphere_echo ::  %s' % (e))
                # Any error saying that the directory doesn't exist
                except OSError as e:
                    trace = traceback.format_exc()
                    logger.error('%s' % trace)
                    logger.error('error :: ionosphere_echo ::  OSError error - training data not copied to %s' % metric_training_data_dir)
                    logger.error('error :: %s' % (e))

        calculated_feature_file = '%s/%s.tsfresh.input.csv.features.transposed.csv' % (metric_training_data_dir, base_name)
        calculated_feature_file_found = False
        fp_csv = None
        if os.path.isfile(calculated_feature_file):
            calculated_feature_file_found = True
            fp_csv = calculated_feature_file
            logger.info('ionosphere_echo :: calculated features file is available - %s' % (calculated_feature_file))

        echo_json_file = '%s.mirage.redis.%sh.json' % (base_name, str(full_duration_in_hours))
        if not calculated_feature_file_found:
            logger.info('ionosphere_echo :: calculating features from mirage.redis data ts json - %s' % (echo_json_file))
            str_created_ts = str(created_ts)
            try:
                fp_csv, successful, fp_exists, fp_id, log_msg, traceback_format_exc, f_calc = calculate_features_profile(skyline_app, str_created_ts, base_name, context)
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: ionosphere_echo :: failed to calculate features')
                continue
        else:
            logger.info('ionosphere_echo :: using available calculated features file')

        if os.path.isfile(calculated_feature_file):
            logger.info('ionosphere_echo :: calculated features - %s' % (calculated_feature_file))
        else:
            logger.error('error :: ionosphere_echo :: failed to calculate features no file found - %s' % calculated_feature_file)
            continue

        # Create the new settings.FULL_DURATION features profile
        ionosphere_job = 'learn_fp_human'

        # @added 20190503 - Branch #2646: slack
        # Added slack_ionosphere_job
        slack_ionosphere_job = ionosphere_job

        fp_learn = False
        try:
            # @modified 20190503 - Branch #2646: slack
            # Added slack_ionosphere_job
            # fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc = create_features_profile(skyline_app, created_ts, base_name, context, ionosphere_job, mirage_fp_id, generation, fp_learn)
            # @modified 20190922 - Feature #3230: users DB table
            #                      Ideas #2476: Label and relate anomalies
            #                      Feature #2516: Add label to features profile
            # Added user_id
            # fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc = create_features_profile(skyline_app, created_ts, base_name, context, ionosphere_job, mirage_fp_id, generation, fp_learn, slack_ionosphere_job, user_id)
            user_id = 1
            # @modified 20191030 - Feature #2516: Add label to features profile
            # Added missing label
            label = 'echo'

            fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc = create_features_profile(skyline_app, created_ts, base_name, context, ionosphere_job, mirage_fp_id, generation, fp_learn, slack_ionosphere_job, user_id, label)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: failed to create a settings.FULL_DURATION features profile from fp_id %s for %s' % (str(mirage_fp_id), base_name))
            continue
        if not fp_in_successful:
            logger.error(traceback.format_exc())
            logger.error('error :: ionosphere_echo :: create_features_profile failed')
            continue
        else:
            echo_created_fp_count += 1
            logger.info(
                'ionosphere_echo :: new generation %s features profile with id %s at settings.FULL_DURATION created from parent features profile with id %s' % (
                    str(generation), str(fp_id), str(mirage_fp_id)))

    del echo_create_fd_fp_for
    if engine:
        echo_engine_disposal(engine)
    logger.info('ionosphere_echo :: created %s of the %s FULL_DURATION features profiles that were to be created for %s' % (str(echo_created_fp_count), str(echo_create_fd_fp_for_count), base_name))

    return
Example #9
0
    def run(self):
        """
        Called when the process intializes.

        Determine if what is known in the Skyline DB
        blah

        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                logger.info('removing %s' % skyline_app_logwait)
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error :: bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        # See if I am known in the DB, if so, what are my variables
        # self.populate mysql
        # What is my host id in the Skyline panorama DB?
        #   - if not known - INSERT hostname INTO hosts
        # What are the known apps?
        #   - if returned make a dictionary
        # What are the known algorithms?
        #   - if returned make a dictionary
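        # A hedged sketch (not part of the original) of how the known apps
        # could be loaded into a dictionary, assuming the Panorama schema has
        # an apps table with id and app columns:
        # known_apps = {}
        # results = self.mysql_select('SELECT id, app FROM apps')  # nosec
        # if results:
        #     known_apps = {str(row[1]): int(row[0]) for row in results}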

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
                if ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: connected to Redis')
            except:
                logger.error('error :: cannot connect to redis at socket path %s' % (
                    settings.REDIS_SOCKET_PATH))
                sleep(30)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            try:
                self.redis_conn.setex(skyline_app, 120, now)
                logger.info('updated Redis key for %s up' % skyline_app)
            except:
                logger.error('error :: failed to update Redis key for %s up' % skyline_app)

            if ENABLE_PANORAMA_DEBUG:
                # Make sure mysql is available
                mysql_down = True
                while mysql_down:

                    query = 'SHOW TABLES'
                    results = self.mysql_select(query)

                    if results:
                        mysql_down = False
                        logger.info('debug :: tested database query - OK')
                    else:
                        logger.error('error :: failed to query database')
                        sleep(30)

            if ENABLE_PANORAMA_DEBUG:
                try:
                    query = 'SELECT id, test FROM test'
                    result = self.mysql_select(query)
                    logger.info('debug :: tested mysql SELECT query - OK')
                    logger.info('debug :: result: %s' % str(result))
                    logger.info('debug :: result[0]: %s' % str(result[0]))
                    logger.info('debug :: result[1]: %s' % str(result[1]))
# Works
# 2016-06-10 19:07:23 :: 4707 :: result: [(1, u'test1')]
                except:
                    logger.error(
                        'error :: mysql error - %s' %
                        traceback.print_exc())
                    logger.error('error :: failed to SELECT')

            # self.populate the database metadata tables
            # What is my host id in the Skyline panorama DB?
            host_id = False
            # @modified 20170913 - Task #2160: Test skyline with bandit
            # Added nosec to exclude from bandit tests
            query = 'select id FROM hosts WHERE host=\'%s\'' % this_host  # nosec
            results = self.mysql_select(query)
            if results:
                host_id = results[0][0]
                logger.info('host_id: %s' % str(host_id))
            else:
                logger.info('failed to determine host id of %s' % this_host)

            #   - if not known - INSERT hostname INTO hosts
            if not host_id:
                logger.info('inserting %s into hosts table' % this_host)
                # @modified 20170913 - Task #2160: Test skyline with bandit
                # Added nosec to exclude from bandit tests
                query = 'insert into hosts (host) VALUES (\'%s\')' % this_host  # nosec
                host_id = self.mysql_insert(query)
                if host_id:
                    logger.info('new host_id: %s' % str(host_id))

            if not host_id:
                logger.error(
                    'error :: failed to determine or populate the host id for %s in the hosts table' %
                    this_host)
                sleep(30)
                continue

            # Loop through the panorama check dir and see if any work has been
            # left to process, etc
            # Make sure check_dir exists and has not been removed
            try:
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: checking check dir exists - %s' % settings.PANORAMA_CHECK_PATH)
                # os.path.exists does not raise if the path is missing, so the
                # return value must be checked to know whether to create the dir
                if not os.path.exists(settings.PANORAMA_CHECK_PATH):
                    logger.error('error :: check dir did not exist - %s' % settings.PANORAMA_CHECK_PATH)
                    mkdir_p(settings.PANORAMA_CHECK_PATH)
                    logger.info('check dir created - %s' % settings.PANORAMA_CHECK_PATH)
            except:
                logger.error('error :: failed to create check dir - %s' % settings.PANORAMA_CHECK_PATH)
                logger.info(traceback.format_exc())
                # continue

            """
            Determine if any metric check files have been added to process
            """
            while True:
                metric_var_files = False
                try:
                    metric_var_files = [f for f in listdir(settings.PANORAMA_CHECK_PATH) if isfile(join(settings.PANORAMA_CHECK_PATH, f))]
                except:
                    logger.error('error :: failed to list files in check dir')
                    logger.info(traceback.format_exc())

                if not metric_var_files:
                    logger.info('sleeping 20 seconds - no metric check files')
                    sleep(20)

                # Discover metric anomalies to insert
                metric_var_files = False
                try:
                    metric_var_files = [f for f in listdir(settings.PANORAMA_CHECK_PATH) if isfile(join(settings.PANORAMA_CHECK_PATH, f))]
                except:
                    logger.error('error :: failed to list files in check dir')
                    logger.info(traceback.format_exc())

                if metric_var_files:
                    break

            metric_var_files_sorted = sorted(metric_var_files)
            metric_check_file = '%s/%s' % (settings.PANORAMA_CHECK_PATH, str(metric_var_files_sorted[0]))

            logger.info('assigning anomaly for insertion - %s' % str(metric_var_files_sorted[0]))

            # Spawn processes
            pids = []
            spawned_pids = []
            pid_count = 0
            now = time()
            for i in range(1, settings.PANORAMA_PROCESSES + 1):
                try:
                    p = Process(target=self.spin_process, args=(i, metric_check_file))
                    pids.append(p)
                    pid_count += 1
                    logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.PANORAMA_PROCESSES)))
                    p.start()
                    spawned_pids.append(p.pid)
                except:
                    logger.error('error :: failed to start spin_process')
                    logger.info(traceback.format_exc())
                    continue

            # Send wait signal to zombie processes
            # for p in pids:
            #     p.join()
            # Self monitor processes and terminate if any spin_process has run
            # for longer than CRUCIBLE_TESTS_TIMEOUT
            p_starts = time()
            while time() - p_starts <= 20:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info(
                        '%s :: %s spin_process/es completed in %.2f seconds' % (
                            skyline_app, str(settings.PANORAMA_PROCESSES),
                            time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
                for p in pids:
                    p.terminate()
                    # p.join()

                check_file_name = os.path.basename(str(metric_check_file))
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_name - %s' % check_file_name)
                check_file_timestamp = check_file_name.split('.', 1)[0]
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_timestamp - %s' % str(check_file_timestamp))
                check_file_metricname_txt = check_file_name.split('.', 1)[1]
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_metricname_txt - %s' % check_file_metricname_txt)
                check_file_metricname = check_file_metricname_txt.replace('.txt', '')
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_metricname - %s' % check_file_metricname)
                check_file_metricname_dir = check_file_metricname.replace('.', '/')
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_metricname_dir - %s' % check_file_metricname_dir)

                metric_failed_check_dir = '%s/%s/%s' % (failed_checks_dir, check_file_metricname_dir, check_file_timestamp)

                fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))

            for p in pids:
                if p.is_alive():
                    logger.info('%s :: stopping spin_process - %s' % (skyline_app, str(p.is_alive())))
                    p.join()
Example #10
0
def create_features_profile(current_skyline_app, requested_timestamp, data_for_metric, context, ionosphere_job, fp_parent_id, fp_generation, fp_learn):
    """
    Add a features_profile to the Skyline ionosphere database table.

    :param current_skyline_app: Skyline app name
    :param requested_timestamp: The timestamp of the dir that the features
        profile data is in
    :param data_for_metric: The base_name of the metric
    :param context: The context of the caller
    :param ionosphere_job: The ionosphere_job name related to creation request
        valid jobs are ``learn_fp_human``, ``learn_fp_generation``,
        ``learn_fp_learnt`` and ``learn_fp_automatic``.
    :param fp_parent_id: The id of the parent features profile that this was
        learnt from, 0 being an original human generated features profile
    :param fp_generation: The number of generations away for the original
        human generated features profile, 0 being an original human generated
        features profile.
    :param fp_learn: Whether Ionosphere should learn at use_full_duration_days
    :type current_skyline_app: str
    :type requested_timestamp: int
    :type data_for_metric: str
    :type context: str
    :type ionosphere_job: str
    :type fp_parent_id: int
    :type fp_generation: int
    :type fp_learn: boolean
    :return: fp_id, fp_in_successful, fp_exists, fail_msg, traceback_format_exc
    :rtype: str, boolean, boolean, str, str

    """

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    base_name = data_for_metric.replace(settings.FULL_NAMESPACE, '', 1)

    if context == 'training_data':
        log_context = 'training data'
        ionosphere_learn_job = 'learn_fp_human'
    if context == 'features_profiles':
        log_context = 'features profile data'

    # @added 20170113 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        log_context = 'learn'

    current_logger.info('create_features_profile :: %s :: requested for %s at %s' % (
        context, str(base_name), str(requested_timestamp)))

    metric_timeseries_dir = base_name.replace('.', '/')
    if context == 'training_data':
        metric_training_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_DATA_FOLDER, str(requested_timestamp),
            metric_timeseries_dir)
    if context == 'features_profiles':
        metric_training_data_dir = '%s/%s/%s' % (
            settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir,
            str(requested_timestamp))

    # @added 20170113 - Feature #1854: Ionosphere learn
    if context == 'ionosphere_learn':
        # @modified 20170116 - Feature #1854: Ionosphere learn
        # Allowing ionosphere_learn to create a features profile for a training
        # data set that it has learnt is not anomalous
        if ionosphere_job != 'learn_fp_automatic':
            metric_training_data_dir = '%s/%s/%s' % (
                settings.IONOSPHERE_LEARN_FOLDER, str(requested_timestamp),
                metric_timeseries_dir)
        else:
            metric_training_data_dir = '%s/%s/%s' % (
                settings.IONOSPHERE_DATA_FOLDER, str(requested_timestamp),
                metric_timeseries_dir)

    features_file = '%s/%s.tsfresh.input.csv.features.transposed.csv' % (
        metric_training_data_dir, base_name)

    features_profile_dir = '%s/%s' % (
        settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir)

    ts_features_profile_dir = '%s/%s/%s' % (
        settings.IONOSPHERE_PROFILES_FOLDER, metric_timeseries_dir,
        str(requested_timestamp))

    features_profile_created_file = '%s/%s.%s.fp.created.txt' % (
        metric_training_data_dir, str(requested_timestamp), base_name)

    features_profile_details_file = '%s/%s.%s.fp.details.txt' % (
        metric_training_data_dir, str(requested_timestamp), base_name)

    anomaly_check_file = '%s/%s.txt' % (metric_training_data_dir, base_name)

    trace = 'none'
    fail_msg = 'none'
    new_fp_id = False
    calculated_with_tsfresh = False
    calculated_time = False
    fcount = None
    fsum = None
    # @added 20170104 - Feature #1842: Ionosphere - Graphite now graphs
    # Added the ts_full_duration parameter so that the appropriate graphs can be
    # embedded for the user in the training data page
    ts_full_duration = '0'

    if context == 'ionosphere_learn':
        if not path.isfile(features_profile_details_file):
            current_logger.error('error :: create_features_profile :: no features_profile_details_file - %s' % features_profile_details_file)
            return 'none', False, False, fail_msg, trace

    if path.isfile(features_profile_details_file):
        current_logger.info('create_features_profile :: getting features profile details from - %s' % features_profile_details_file)
        # Read the details file
        with open(features_profile_details_file, 'r') as f:
            fp_details_str = f.read()
        fp_details = literal_eval(fp_details_str)
        calculated_with_tsfresh = fp_details[1]
        calculated_time = str(fp_details[2])
        fcount = str(fp_details[3])
        fsum = str(fp_details[4])
        try:
            ts_full_duration = str(fp_details[5])
        except:
            current_logger.error('error :: create_features_profile :: could not determine the full duration from - %s' % features_profile_details_file)
            ts_full_duration = '0'

        if context != 'ionosphere_learn':
            if ts_full_duration == '0':
                if path.isfile(anomaly_check_file):
                    current_logger.info('create_features_profile :: determining the full duration from anomaly_check_file - %s' % anomaly_check_file)
                    # Read the details file
                    with open(anomaly_check_file, 'r') as f:
                        anomaly_details = f.readlines()
                        for i, line in enumerate(anomaly_details):
                            if 'full_duration' in line:
                                _ts_full_duration = '%s' % str(line).split("'", 2)
                                full_duration_array = literal_eval(_ts_full_duration)
                                ts_full_duration = str(int(full_duration_array[1]))
                                current_logger.info('create_features_profile :: determined the full duration as - %s' % str(ts_full_duration))

    if path.isfile(features_profile_created_file):
        # Read the created file
        with open(features_profile_created_file, 'r') as f:
            fp_created_str = f.read()
        fp_created = literal_eval(fp_created_str)
        new_fp_id = fp_created[0]

        return str(new_fp_id), True, True, fail_msg, trace

    # Have data
    if path.isfile(features_file):
        current_logger.info('create_features_profile :: features_file exists: %s' % features_file)
    else:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error :: create_features_profile :: features_file does not exist: %s' % features_file
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            return False, False, False, fail_msg, trace

    features_data = []
    with open(features_file, 'r') as fr:
        reader = csv.reader(fr, delimiter=',')
        for i, line in enumerate(reader):
            fname_id = False
            f_value = False
            feature_name = str(line[0])
            # Match the feature name to its TSFRESH_FEATURES id.  A list
            # comprehension is used rather than filter() so the match can be
            # indexed on both Python 2 and Python 3
            feature_name_item = [
                item for item in TSFRESH_FEATURES if item[1] == feature_name]
            if feature_name_item:
                fname_id = int(feature_name_item[0][0])
            f_value = str(line[1])
            if fname_id and f_value:
                features_data.append([fname_id, f_value])

    # @added 20170113 - Feature #1854: Ionosphere learn - generations
    # Set the learn generations variables with the IONOSPHERE_LEARN_DEFAULT_ and any
    # settings.IONOSPHERE_LEARN_NAMESPACE_CONFIG values.  These will later be
    # overridden by any database values determined for the specific metric if
    # they exist.
    # Set defaults
    use_full_duration_days = int(settings.IONOSPHERE_LEARN_DEFAULT_FULL_DURATION_DAYS)
    valid_learning_duration = int(settings.IONOSPHERE_LEARN_DEFAULT_VALID_TIMESERIES_OLDER_THAN_SECONDS)
    max_generations = int(settings.IONOSPHERE_LEARN_DEFAULT_MAX_GENERATIONS)
    max_percent_diff_from_origin = float(settings.IONOSPHERE_LEARN_DEFAULT_MAX_PERCENT_DIFF_FROM_ORIGIN)
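    # An illustrative, hedged example (not from the source) of what a
    # per-namespace override tuple in settings.IONOSPHERE_LEARN_NAMESPACE_CONFIG
    # could look like - (namespace, learn_full_duration_days,
    # valid_learning_duration, max_generations, max_percent_diff_from_origin) -
    # the values below are hypothetical:
    # IONOSPHERE_LEARN_NAMESPACE_CONFIG = (
    #     ('stats\..*', 30, 3661, 16, 7.0),
    # )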
    # Default learn_full_duration_days so it is always defined, even if
    # get_ionosphere_learn_details fails
    learn_full_duration_days = use_full_duration_days
    try:
        use_full_duration, valid_learning_duration, use_full_duration_days, max_generations, max_percent_diff_from_origin = get_ionosphere_learn_details(current_skyline_app, base_name)
        learn_full_duration_days = use_full_duration_days
    except:
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: create_features_profile :: failed to get_ionosphere_learn_details')

    current_logger.info('create_features_profile :: learn_full_duration_days     :: %s days' % (str(learn_full_duration_days)))
    current_logger.info('create_features_profile :: valid_learning_duration      :: %s seconds' % (str(valid_learning_duration)))
    current_logger.info('create_features_profile :: max_generations              :: %s' % (str(max_generations)))
    current_logger.info('create_features_profile :: max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin)))

    current_logger.info('create_features_profile :: getting MySQL engine')
    try:
        engine, fail_msg, trace = fp_create_get_an_engine(current_skyline_app)
        current_logger.info(fail_msg)
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error :: create_features_profile :: could not get a MySQL engine'
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            return False, False, False, fail_msg, trace

    if not engine:
        trace = 'none'
        fail_msg = 'error :: create_features_profile :: engine not obtained'
        current_logger.error(fail_msg)
        if context == 'training' or context == 'features_profile':
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            return False, False, False, fail_msg, trace

    # Get metric details from the database
    metrics_id = False

    # Use the learn details as per config
    metric_learn_full_duration_days = int(use_full_duration_days)
    metric_learn_valid_ts_older_than = int(valid_learning_duration)
    metric_max_generations = int(max_generations)
    metric_max_percent_diff_from_origin = int(max_percent_diff_from_origin)

    metrics_table = None
    try:
        metrics_table, fail_msg, trace = metrics_table_meta(current_skyline_app, engine)
        current_logger.info(fail_msg)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to get metrics_table meta for %s' % base_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: disposing of any engine')
            fp_create_engine_disposal(current_skyline_app, engine)
            return False, False, False, fail_msg, trace

    current_logger.info('create_features_profile :: metrics_table OK')

    metric_db_object = None
    try:
        connection = engine.connect()
        # @modified 20161209 -  - Branch #922: ionosphere
        #                        Task #1658: Patterning Skyline Ionosphere
        # result = connection.execute('select id from metrics where metric=\'%s\'' % base_name)
#        for row in result:
#            while not metrics_id:
#                metrics_id = row['id']
        stmt = select([metrics_table]).where(metrics_table.c.metric == base_name)
        result = connection.execute(stmt)
        for row in result:
            metrics_id = row['id']
            # @added 20170113 - Feature #1854: Ionosphere learn - generations
            # Added Ionosphere LEARN generation related variables
            try:
                metric_learn_full_duration_days = int(row['learn_full_duration_days'])
                metric_learn_valid_ts_older_than = int(row['learn_valid_ts_older_than'])
                metric_max_generations = int(row['max_generations'])
                metric_max_percent_diff_from_origin = float(row['max_percent_diff_from_origin'])
            except:
                current_logger.error('error :: create_features_profile :: failed to determine learn related values from DB for %s' % base_name)
        row = result.fetchone()
        # metric_db_object = row
        connection.close()
        current_logger.info('create_features_profile :: determined db metric id: %s' % str(metrics_id))
        current_logger.info('create_features_profile :: determined db metric learn_full_duration_days: %s' % str(metric_learn_full_duration_days))
        current_logger.info('create_features_profile :: determined db metric learn_valid_ts_older_than: %s' % str(metric_learn_valid_ts_older_than))
        current_logger.info('create_features_profile :: determined db metric max_generations: %s' % str(metric_max_generations))
        current_logger.info('create_features_profile :: determined db metric max_percent_diff_from_origin: %s' % str(metric_max_percent_diff_from_origin))
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error :: create_features_profile :: could not determine id of metric from DB: %s' % base_name
        current_logger.error('%s' % fail_msg)

    if metric_learn_full_duration_days:
        learn_full_duration_days = metric_learn_full_duration_days
        # learn_full_duration = int(learn_full_duration_days) * 86400
    # Default learn_valid_ts_older_than so it is always defined
    learn_valid_ts_older_than = valid_learning_duration
    if metric_learn_valid_ts_older_than:
        learn_valid_ts_older_than = metric_learn_valid_ts_older_than
    if metric_max_generations:
        max_generations = metric_max_generations
    if metric_max_percent_diff_from_origin:
        max_percent_diff_from_origin = metric_max_percent_diff_from_origin
    current_logger.info('create_features_profile :: generation info - learn_full_duration_days     :: %s' % (str(learn_full_duration_days)))
    current_logger.info('create_features_profile :: generation info - learn_valid_ts_older_than    :: %s' % (str(learn_valid_ts_older_than)))
    current_logger.info('create_features_profile :: generation info - max_generations              :: %s' % (str(max_generations)))
    current_logger.info('create_features_profile :: generation info - max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin)))

    # @added 20170120 - Feature #1854: Ionosphere learn
    # Always use the timestamp from the anomaly file
    use_anomaly_timestamp = int(requested_timestamp)
    if context == 'ionosphere_learn':
        if path.isfile(anomaly_check_file):
            current_logger.info('create_features_profile :: determining the full duration from anomaly_check_file - %s' % anomaly_check_file)
            # Read the details file
            with open(anomaly_check_file, 'r') as f:
                anomaly_details = f.readlines()
                for i, line in enumerate(anomaly_details):
                    if 'metric_timestamp' in line:
                        _metric_timestamp = '%s' % str(line).split("'", 2)
                        metric_timestamp_array = literal_eval(_metric_timestamp)
                        use_anomaly_timestamp = (int(metric_timestamp_array[1]))
                        current_logger.info('create_features_profile :: determined the anomaly metric_timestamp as - %s' % str(use_anomaly_timestamp))

    ionosphere_table = None
    try:
        ionosphere_table, fail_msg, trace = ionosphere_table_meta(current_skyline_app, engine)
        current_logger.info(fail_msg)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to get ionosphere_table meta for %s' % base_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # Raise to webapp to provide traceback to user in UI
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            raise
        else:
            current_logger.info('create_features_profile :: disposing of any engine')
            fp_create_engine_disposal(current_skyline_app, engine)
            return False, False, False, fail_msg, trace

    current_logger.info('create_features_profile :: ionosphere_table OK')

    # @added 20170403 - Feature #2000: Ionosphere - validated
    # Set all learn_fp_human features profiles to validated.
    fp_validated = 0
    if ionosphere_job == 'learn_fp_human':
        fp_validated = 1

    # @added 20170424 - Feature #2000: Ionosphere - validated
    # Set all generation 0 and 1 as validated
    if int(fp_generation) <= 1:
        fp_validated = 1

    new_fp_id = False
    try:
        connection = engine.connect()
        # @added 20170113 - Feature #1854: Ionosphere learn
        # Added learn values parent_id, generation
        # @modified 20170120 - Feature #1854: Ionosphere learn
        # Added anomaly_timestamp
        # @modified 20170403 - Feature #2000: Ionosphere - validated
        ins = ionosphere_table.insert().values(
            metric_id=int(metrics_id), full_duration=int(ts_full_duration),
            anomaly_timestamp=int(use_anomaly_timestamp),
            enabled=1, tsfresh_version=str(tsfresh_version),
            calc_time=calculated_time, features_count=fcount,
            features_sum=fsum, parent_id=fp_parent_id,
            generation=fp_generation, validated=fp_validated)
        result = connection.execute(ins)
        connection.close()
        new_fp_id = result.inserted_primary_key[0]
        current_logger.info('create_features_profile :: new ionosphere fp_id: %s' % str(new_fp_id))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to insert a new record into the ionosphere table for %s' % base_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: disposing of any engine')
            fp_create_engine_disposal(current_skyline_app, engine)
            return False, False, False, fail_msg, trace

    if not RepresentsInt(new_fp_id):
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: unknown new ionosphere new_fp_id for %s' % base_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: disposing of any engine')
            fp_create_engine_disposal(current_skyline_app, engine)
            return False, False, False, fail_msg, trace

    # Create z_fp_<metric_id> table
    fp_table_created = False
    fp_table_name = 'z_fp_%s' % str(metrics_id)
    try:
        fp_meta = MetaData()
        # @modified 20161222 - Task #1812: z_fp table type
        # Changed to InnoDB from MyISAM as no files open issues and MyISAM clean
        # up, there can be LOTS of file_per_table z_fp_ tables/files without
        # the MyISAM issues.  z_fp_ tables are mostly read and will be shuffled
        # in the table cache as required.
        fp_metric_table = Table(
            fp_table_name, fp_meta,
            Column('id', Integer, primary_key=True),
            Column('fp_id', Integer, nullable=False, key='fp_id'),
            Column('feature_id', Integer, nullable=False),
            Column('value', DOUBLE(), nullable=True),
            mysql_charset='utf8',
            mysql_key_block_size='255',
            mysql_engine='InnoDB')
        fp_metric_table.create(engine, checkfirst=True)
        fp_table_created = True
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to create table - %s' % fp_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should already exist, continuing' % context)

    if not fp_table_created:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to determine True for create table - %s' % fp_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should already exist, continuing' % context)

    # Insert features and values
    insert_statement = []
    for fname_id, f_value in features_data:
        insert_statement.append({'fp_id': new_fp_id, 'feature_id': fname_id, 'value': f_value},)
    if insert_statement == []:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: empty insert_statement for %s inserts' % fp_table_name
        current_logger.error('%s' % fail_msg)
        # raise
    # else:
        # feature_count = sum(1 for x in a if isinstance(x, insert_statement))
        # current_logger.info(
        #     'fp_id - %s - %s feature values in insert_statement for %s ' %
        #     (str(feature_count), str(new_fp_id), fp_table_name))
        # feature_count = sum(1 for x in a if isinstance(x, insert_statement))
        # current_logger.info(
        #     'fp_id - %s - feature values in insert_statement for %s ' %
        #     (str(new_fp_id), fp_table_name))

    try:
        connection = engine.connect()
        connection.execute(fp_metric_table.insert(), insert_statement)
        connection.close()
        current_logger.info('create_features_profile :: fp_id - %s - feature values inserted into %s' % (str(new_fp_id), fp_table_name))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to insert a feature values into %s' % fp_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should already exist, continuing' % context)

    # Create metric ts table if not exists ts_<metric_id>
    # Create z_ts_<metric_id> table
    # @modified 20170121 - Feature #1854: Ionosphere learn - generations
    # TODO Adding the option to not save timeseries to DB, as default?
    # ts_table_created = False
    ts_table_name = 'z_ts_%s' % str(metrics_id)
    try:
        ts_meta = MetaData()
        # @modified 20161222 - Task #1812: z_fp table type
        # Changed to InnoDB from MyISAM as no files open issues and MyISAM clean
        # up, there can be LOTS of file_per_table z_fp_ tables/files without
        # the MyISAM issues.  z_fp_ tables are mostly read and will be shuffled
        # in the table cache as required.
        ts_metric_table = Table(
            ts_table_name, ts_meta,
            Column('id', Integer, primary_key=True),
            Column('fp_id', Integer, nullable=False, key='fp_id'),
            Column('timestamp', Integer, nullable=False),
            Column('value', DOUBLE(), nullable=True),
            mysql_charset='utf8',
            mysql_key_block_size='255',
            mysql_engine='InnoDB')
        ts_metric_table.create(engine, checkfirst=True)
        # ts_table_created = True
        current_logger.info('create_features_profile :: metric ts table created OK - %s' % (ts_table_name))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to create table - %s' % ts_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            # Raise to webapp to provide traceback to user in UI
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should already exist, continuing' % context)

    # Insert timeseries that the features profile was created from
    raw_timeseries = []
    anomaly_json = '%s/%s.json' % (metric_training_data_dir, base_name)
    if path.isfile(anomaly_json):
        current_logger.info('create_features_profile :: metric anomaly json found OK - %s' % (anomaly_json))
        try:
            # Read the timeseries json file
            with open(anomaly_json, 'r') as f:
                raw_timeseries = f.read()
        except:
            trace = traceback.format_exc()
            current_logger.error(trace)
            fail_msg = 'error :: create_features_profile :: failed to read timeseries data from %s' % anomaly_json
            current_logger.error('%s' % (fail_msg))
            fail_msg = 'error: failed to read timeseries data from %s' % anomaly_json
            # end = timer()
            if context == 'training' or context == 'features_profile':
                # @added 20170806 - Bug #2130: MySQL - Aborted_clients
                # Added missing disposal
                if engine:
                    fp_create_engine_disposal(current_skyline_app, engine)
                # Raise to webapp to provide traceback to user in UI
                raise
    else:
        trace = 'none'
        fail_msg = 'error: file not found - %s' % (anomaly_json)
        current_logger.error(fail_msg)
        # raise

    # Convert the timeseries to csv
    timeseries_array_str = str(raw_timeseries).replace('(', '[').replace(')', ']')
    timeseries = literal_eval(timeseries_array_str)

    datapoints = timeseries
    validated_timeseries = []
    for datapoint in datapoints:
        try:
            new_datapoint = [str(int(datapoint[0])), float(datapoint[1])]
            validated_timeseries.append(new_datapoint)
        # @modified 20170913 - Task #2160: Test skyline with bandit
        # Added nosec to exclude from bandit tests
        except:  # nosec
            continue

    insert_statement = []
    for ts, value in validated_timeseries:
        insert_statement.append({'fp_id': new_fp_id, 'timestamp': ts, 'value': value},)
    try:
        connection = engine.connect()
        connection.execute(ts_metric_table.insert(), insert_statement)
        connection.close()
        current_logger.info('create_features_profile :: fp_id - %s - timeseries inserted into %s' % (str(new_fp_id), ts_table_name))
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to insert the timeseries into %s' % ts_table_name
        current_logger.error('%s' % fail_msg)
        if context == 'training' or context == 'features_profile':
            # @added 20170806 - Bug #2130: MySQL - Aborted_clients
            # Added missing disposal
            if engine:
                fp_create_engine_disposal(current_skyline_app, engine)
            raise
        else:
            current_logger.info('create_features_profile :: %s - automated so the table should already exist, continuing' % context)

    # Create a created features profile file
    try:
        # data = '[%s, %s, ]' % (new_fp_id, str(int(time.time())))
        # write_data_to_file(skyline_app, features_profile_created_file, 'w', data)
        # @modified 20170115 -  Feature #1854: Ionosphere learn - generations
        # Added parent_id and generation
        data = '[%s, %s, \'%s\', %s, %s, %s, %s, %s, %s]' % (
            new_fp_id, str(int(time.time())), str(tsfresh_version),
            str(calculated_time), str(fcount), str(fsum), str(ts_full_duration),
            str(fp_parent_id), str(fp_generation))
        write_data_to_file(current_skyline_app, features_profile_created_file, 'w', data)
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        fail_msg = 'error :: create_features_profile :: failed to write fp.created file'
        current_logger.error('%s' % fail_msg)

    # Set ionosphere_enabled for the metric
    try:
        # update_statement = 'UPDATE metrics SET ionosphere_enabled=1 WHERE id=%s' % str(metrics_id)
        connection = engine.connect()
        # result = connection.execute('UPDATE metrics SET ionosphere_enabled=1 WHERE id=%s' % str(metrics_id))
        # connection.execute(ts_metric_table.insert(), insert_statement)
        connection.execute(
            metrics_table.update(
                metrics_table.c.id == metrics_id).values(ionosphere_enabled=1))
        connection.close()
        current_logger.info('create_features_profile :: ionosphere_enabled set on metric id: %s' % str(metrics_id))
    except:
        trace = traceback.format_exc()
        current_logger.error(trace)
        fail_msg = 'error :: create_features_profile :: could not update metrics table and set ionosphere_enabled on id %s' % str(metrics_id)
        current_logger.error('%s' % fail_msg)
        # raise

    # Copy data from training data dir to features_profiles dir
    if not path.isdir(ts_features_profile_dir):
        mkdir_p(ts_features_profile_dir)

    if path.isdir(ts_features_profile_dir):
        current_logger.info('create_features_profile :: fp_id - %s - features profile dir created - %s' % (str(new_fp_id), ts_features_profile_dir))
        # src_files = os.listdir(src)
        # for file_name in src_files:
        #    full_file_name = path.join(src, file_name)
        #    if (path.isfile(full_file_name)):
        #        shutil.copy(full_file_name, dest)

        data_files = []
        try:
            glob_path = '%s/*.*' % metric_training_data_dir
            data_files = glob.glob(glob_path)
        except:
            trace = traceback.format_exc()
            current_logger.error('%s' % trace)
            current_logger.error('error :: create_features_profile :: glob - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))

        for i_file in data_files:
            try:
                shutil.copy(i_file, ts_features_profile_dir)
                current_logger.info('create_features_profile :: fp_id - %s - training data copied - %s' % (str(new_fp_id), i_file))
            except shutil.Error as e:
                trace = traceback.format_exc()
                current_logger.error('%s' % trace)
                current_logger.error('error :: create_features_profile :: shutil error - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))
                current_logger.error('error :: create_features_profile :: %s' % (e))
            # Any error saying that the directory doesn't exist
            except OSError as e:
                trace = traceback.format_exc()
                current_logger.error('%s' % trace)
                current_logger.error('error :: create_features_profile :: OSError error - fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))
                current_logger.error('error :: create_features_profile :: %s' % (e))
        current_logger.info('create_features_profile :: fp_id - %s - training data copied to %s' % (str(new_fp_id), ts_features_profile_dir))
    else:
        current_logger.error('error :: create_features_profile :: fp_id - %s - training data not copied to %s' % (str(new_fp_id), ts_features_profile_dir))

    current_logger.info('create_features_profile :: disposing of any engine')
    try:
        if engine:
            fp_create_engine_disposal(current_skyline_app, engine)
        else:
            current_logger.info('create_features_profile :: no engine to dispose of')
    except:
        trace = traceback.format_exc()
        current_logger.error('%s' % trace)
        current_logger.error('error :: create_features_profile :: fp_id - %s - failed to dispose of the engine' % (str(new_fp_id)))

    # @added 20170113 - Feature #1854: Ionosphere learn - Redis ionosphere.learn.work namespace
    # Ionosphere learn needs Redis works sets
    # When a features profile is created there needs to be work added to a Redis
    # set. When a human makes a features profile, we want Ionosphere to make a
    # use_full_duration_days features profile valid_learning_duration (e.g.
    # 3361) later.
    if settings.IONOSPHERE_LEARN and new_fp_id:
        create_redis_work_item = False
        if context == 'training_data' and ionosphere_job == 'learn_fp_human':
            create_redis_work_item = True
            # @modified 20170120 -  Feature #1854: Ionosphere learn - generations
            # Added fp_learn parameter to allow the user to not learn the
            # use_full_duration_days
            if not fp_learn:
                create_redis_work_item = False
                current_logger.info('fp_learn is False not adding an item to Redis ionosphere.learn.work set')

        if ionosphere_job == 'learn_fp_automatic':
            create_redis_work_item = True
            # @added 20170131 - Feature #1886 Ionosphere learn - child like parent with evolutionary maturity
            # TODO: here a check may be required to evaluate whether the origin_fp_id
            #       had a use_full_duration features profile created, however
            #       due to the fact that it is in learn, suggests that it did
            #       have, not 100% sure.
            origin_fp_id_was_allowed_to_learn = False
            child_use_full_duration_count_of_origin_fp_id = 1
            # TODO: Determine the state
            # child_use_full_duration_count_of_origin_fp_id = SELECT COUNT(id) FROM ionosphere WHERE parent_id=origin_fp_id AND full_duration=use_full_duration
            if child_use_full_duration_count_of_origin_fp_id == 0:
                current_logger.info('the origin parent was not allowed to learn not adding to Redis ionosphere.learn.work set')
                create_redis_work_item = False

        if create_redis_work_item:
            try:
                current_logger.info(
                    'adding work to Redis ionosphere.learn.work set - [\'Soft\', \'%s\', %s, \'%s\', %s, %s] to make a learn features profile later' % (
                        str(ionosphere_job), str(requested_timestamp), base_name,
                        str(new_fp_id), str(fp_generation)))
                redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                redis_conn.sadd('ionosphere.learn.work', ['Soft', str(ionosphere_job), int(requested_timestamp), base_name, int(new_fp_id), int(fp_generation)])
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: failed adding work to Redis ionosphere.learn.work set - [\'Soft\', \'%s\', %s, \'%s\', %s, %s] to make a learn features profile later' % (
                        str(ionosphere_job), str(requested_timestamp), base_name,
                        str(new_fp_id), str(fp_generation)))

        # @added 20170806 - Bug #2130: MySQL - Aborted_clients
        # Added missing disposal
        if engine:
            fp_create_engine_disposal(current_skyline_app, engine)

    return str(new_fp_id), True, False, fail_msg, trace
Example #11
0
def submit_crucible_job(from_timestamp, until_timestamp, metrics_list,
                        namespaces_list, source, alert_interval, user_id, user,
                        add_to_panorama, pad_timeseries, training_data_json,
                        run_algorithms):
    """
    Get a list of all the metrics passed and generate Crucible check files for
    each

    :param from_timestamp: the timestamp at which to start the time series
    :param until_timestamp: the timestamp at which to end the time series
    :param metrics_list: a list of metric names to analyse
    :param namespaces_list: a list of metric namespaces to analyse
    :param source: the source webapp making the request
    :param alert_interval: the alert_interval at which Crucible should trigger
        anomalies
    :param user_id: the user id of the user making the request
    :param user: the username making the request
    :param add_to_panorama: whether Crucible should add Skyline CONSENSUS
        anomalies to Panorama
    :param pad_timeseries: the amount of data to pad the time series with
    :param training_data_json: the full path to the training_data json file if
        source is training_data
    :param run_algorithms: list of algorithms to run
    :type from_timestamp: int
    :type until_timestamp: int
    :type metrics_list: list
    :type namespaces_list: list
    :type source: str
    :type alert_interval: int
    :type user_id: int
    :type user: str
    :type add_to_panorama: boolean
    :type pad_timeseries: str
    :type training_data_json: str
    :type run_algorithms: list
    :return: tuple of lists
    :rtype: (str, list, str, str)

    Returns (crucible_job_id, metrics_submitted_to_process, fail_msg, trace)

    """

    fail_msg = None
    trace = None
    crucible_job_id = None
    metrics_submitted_to_process = 0

    # Generate a job id based on the YMDHMS.user_id and a job directory
    try:
        jobid_timestamp = int(time())
        jobid_datetimestamp = dt.datetime.fromtimestamp(
            jobid_timestamp).strftime('%Y%m%d%H%M%S')
        crucible_job_id = '%s.%s' % (str(jobid_datetimestamp), str(user_id))
    except:
        logger.error(traceback.format_exc())
        logger.error('error :: failed to determine a crucible_job_id')
        raise  # to webapp to return in the UI

    # Create the crucible job directory for the job id
    try:
        crucible_path = os.path.dirname(settings.CRUCIBLE_DATA_FOLDER)
        crucible_job_dir = '%s/jobs/%s' % (crucible_path, crucible_job_id)
        if not path.exists(crucible_job_dir):
            logger.info('creating crucible job directory - %s' %
                        (str(crucible_job_dir)))
            mkdir_p(crucible_job_dir)
    except:
        trace = traceback.format_exc()
        fail_msg = 'error :: failed to create the crucible job directory'
        logger.error(trace)
        logger.error(fail_msg)
        raise  # to webapp to return in the UI

    # TODO: add checks of the metric names
    metric_names = []
    if metrics_list:
        logger.info('submit_crucible_job :: %s metrics passed' %
                    str(len(metrics_list)))
        for metric in metrics_list:
            metric_names.append(metric)

    # TODO: add checks of the metric namespaces, harder to do, but so that the
    # UI errors to the user rather than sending a bad or non-existent metric to
    # Crucible
    if namespaces_list:
        logger.info('submit_crucible_job :: %s namespaces passed' %
                    str(len(namespaces_list)))
        logger.info(
            'submit_crucible_job :: determine metrics for submit_crucible_job between %s and %s'
            % (str(from_timestamp), str(until_timestamp)))
        logger.info('getting MySQL engine')
        try:
            engine, fail_msg, trace = get_an_engine()
            logger.info(fail_msg)
        except:
            trace = traceback.format_exc()
            logger.error(trace)
            logger.error('%s' % fail_msg)
            logger.error(
                'error :: could not get a MySQL engine to get metric names')
            raise  # to webapp to return in the UI

        if not engine:
            trace = 'none'
            fail_msg = 'error :: engine not obtained'
            logger.error(fail_msg)
            raise

        try:
            metrics_table, log_msg, trace = metrics_table_meta(
                skyline_app, engine)
            logger.info(log_msg)
            logger.info('metrics_table OK')
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to get metrics_table meta')
            if engine:
                engine_disposal(engine)
            raise  # to webapp to return in the UI

        metrics_like_query = text(
            """SELECT metric FROM metrics WHERE metric LIKE :like_string""")
        for namespace in namespaces_list:
            try:
                connection = engine.connect()
                results = connection.execute(metrics_like_query,
                                             like_string=str(namespace))
                connection.close()
                for row in results:
                    metric_name = str(row[0])
                    metric_names.append(metric_name)
            except:
                trace = traceback.format_exc()
                logger.error(trace)
                logger.error(
                    'error :: could not determine metrics from metrics table')
                if engine:
                    engine_disposal(engine)
                raise
        logger.info(
            'submit_crucible_job :: %s metrics determined from passed namespaces'
            % str(len(metric_names)))

    logger.info('submit_crucible_job :: %s metrics to process' %
                str(len(metric_names)))
    metrics_submitted_to_process = []
    datapoint = 0
    triggered_algorithms = [
        'histogram_bins', 'first_hour_average', 'stddev_from_average',
        'grubbs', 'ks_test', 'mean_subtraction_cumulation',
        'median_absolute_deviation', 'stddev_from_moving_average',
        'least_squares'
    ]
    added_at = int(time())
    for base_name in metric_names:
        sane_metricname = filesafe_metricname(str(base_name))
        derivative_metric = is_derivative_metric(skyline_app, base_name)
        if derivative_metric:
            target = 'nonNegativeDerivative(%s)' % base_name
        else:
            target = base_name
        # Generate a metric job directory
        crucible_anomaly_dir = '%s/%s' % (crucible_job_dir, sane_metricname)
        try:
            if not path.exists(crucible_anomaly_dir):
                logger.info('creating crucible metric job directory - %s' %
                            (str(crucible_anomaly_dir)))
                mkdir_p(crucible_anomaly_dir)
        except:
            trace = traceback.format_exc()
            fail_msg = 'error :: failed to create the crucible metric job directory'
            logger.error(trace)
            logger.error(fail_msg)
            raise  # to webapp to return in the UI
        if source == 'graphite':
            graphite_metric = True
        else:
            graphite_metric = False

        # @added 20200422 - Feature #3500: webapp - crucible_process_metrics
        #                   Feature #1448: Crucible web UI
        # In order for metrics to be analysed in Crucible like the Analyzer or
        # Mirage analysis, the time series data needs to be padded
        # Added pad_timeseries
        graphite_override_uri_parameters = 'from=%s&until=%s&target=%s' % (
            str(from_timestamp), str(until_timestamp), target)
        timeseries_full_duration = int(until_timestamp) - int(from_timestamp)
        pad_timeseries_with = 0
        if pad_timeseries == 'auto':
            if timeseries_full_duration > 3600:
                pad_timeseries_with = 3600
            if timeseries_full_duration > 86400:
                pad_timeseries_with = 86400
        if pad_timeseries == '86400':
            pad_timeseries_with = 86400
        if pad_timeseries == '604800':
            pad_timeseries_with = 604800
        if pad_timeseries == '0':
            pad_timeseries_with = 0
        if pad_timeseries_with:
            try:
                padded_from_timestamp = int(
                    from_timestamp) - pad_timeseries_with
                graphite_override_uri_parameters = 'from=%s&until=%s&target=%s' % (
                    str(padded_from_timestamp), str(until_timestamp), target)
                logger.info('padding time series with %s seconds - %s' %
                            (str(pad_timeseries_with),
                             str(graphite_override_uri_parameters)))
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: failed to construct graphite_override_uri_parameters with pad_timeseries_with %s'
                    % str(pad_timeseries_with))
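        # Illustrative padding arithmetic (assumed values): with
        # pad_timeseries='auto', from_timestamp=1600000000 and
        # until_timestamp=1600604800 (a 7 day period), the duration exceeds
        # 86400 so pad_timeseries_with=86400 and the padded from timestamp
        # becomes 1600000000 - 86400 = 1599913600 in the Graphite URI
        # parameters.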

        # @added 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
        # Allow the user to pass algorithms to run
        algorithms = settings.ALGORITHMS
        if run_algorithms:
            algorithms = run_algorithms
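        # For example (hypothetical values), passing
        # run_algorithms=['ks_test', 'grubbs'] would restrict the Crucible run
        # to those two algorithms instead of the full settings.ALGORITHMS list.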

        # @modified 20200421 - Feature #3500: webapp - crucible_process_metrics
        #                      Feature #1448: Crucible web UI
        # Added add_to_panorama
        # @added 20200607 - Feature #3630: webapp - crucible_process_training_data
        # Added training_data_json
        crucible_anomaly_data = 'metric = \'%s\'\n' \
                                'value = \'%s\'\n' \
                                'from_timestamp = \'%s\'\n' \
                                'metric_timestamp = \'%s\'\n' \
                                'algorithms = %s\n' \
                                'triggered_algorithms = %s\n' \
                                'anomaly_dir = \'%s\'\n' \
                                'graphite_metric = %s\n' \
                                'run_crucible_tests = True\n' \
                                'added_by = \'%s\'\n' \
                                'added_at = \'%s\'\n' \
                                'graphite_override_uri_parameters = \'%s\'\n' \
                                'alert_interval = \'%s\'\n' \
                                'add_to_panorama = %s\n' \
                                'training_data_json = %s\n' \
            % (base_name, str(datapoint), str(from_timestamp),
               # @modified 20200817 - Feature #3682: SNAB - webapp - crucible_process - run_algorithms
               # str(until_timestamp), str(settings.ALGORITHMS),
               str(until_timestamp), str(algorithms),
               triggered_algorithms, crucible_anomaly_dir, str(graphite_metric),
               skyline_app, str(added_at), str(graphite_override_uri_parameters),
               str(alert_interval), str(add_to_panorama), str(training_data_json))

        # Create an anomaly file with details about the anomaly
        crucible_anomaly_file = '%s/%s.txt' % (crucible_anomaly_dir,
                                               sane_metricname)
        try:
            write_data_to_file(skyline_app, crucible_anomaly_file, 'w',
                               crucible_anomaly_data)
            logger.info('added crucible anomaly file :: %s' %
                        (crucible_anomaly_file))
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to add crucible anomaly file :: %s' %
                         (crucible_anomaly_file))
        # Create a crucible check file
        crucible_check_file = '%s/%s.%s.txt' % (settings.CRUCIBLE_CHECK_PATH,
                                                str(added_at), sane_metricname)
        try:
            write_data_to_file(skyline_app, crucible_check_file, 'w',
                               crucible_anomaly_data)
            logger.info('added crucible check :: %s,%s' %
                        (base_name, str(added_at)))
            metrics_submitted_to_process.append(base_name)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to add crucible check file :: %s' %
                         (crucible_check_file))

    return (crucible_job_id, metrics_submitted_to_process, fail_msg, trace)
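
# A minimal, illustrative sketch (not the actual Skyline loader) of reading the
# simple key = 'value' lines written to the crucible anomaly/check files above
# back into a dict.  'check_file' is a hypothetical path argument.
def read_crucible_check(check_file):
    check_vars = {}
    with open(check_file, 'r') as f:
        for line in f:
            if ' = ' not in line:
                continue
            key, _, value = line.rstrip('\n').partition(' = ')
            check_vars[key.strip()] = value.strip().strip("'")
    return check_vars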
Example #12
0
def on_demand_motif_analysis(metric, timestamp, similarity, batch_size,
                             top_matches, max_distance, range_padding,
                             max_area_percent_diff):
    """
    Process a motif similarity search on demand
    """
    import numpy as np
    import mass_ts as mts

    logger = logging.getLogger(skyline_app_logger)
    dev_null = None
    function_str = 'on_demand_motif_analysis'
    logger.info(
        '%s :: with parameters :: metric: %s, timestamp: %s, similarity: %s, batch_size: %s, top_matches: %s, max_distance: %s, range_padding: %s, max_area_percent_diff: %s'
        % (function_str, str(metric), str(timestamp), str(similarity),
           str(batch_size), str(top_matches), str(max_distance),
           str(range_padding), str(max_area_percent_diff)))
    trace = 'none'
    fail_msg = 'none'

    start = time.time()
    start_timer = timer()
    metric_vars_dict = {}
    metric_id = 0
    fp_ids = []
    timeseries = []
    not_similar_enough_sample = 0
    not_similar_motifs = 0
    similar_motifs = 0
    exact_motifs = 0
    distance_motifs = 0
    motifs_found = []
    find_exact_matches_run = False
    exact_matches_found = []
    fps_timeseries = {}
    # A motif_analysis dict to add to and return
    motif_analysis = {}
    motif_analysis[metric] = {}
    motif_analysis[metric]['timestamp'] = int(timestamp)
    motif_analysis[metric]['started'] = start
    motif_analysis[metric]['motifs'] = {}
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric][
        'not_similar_enough_sample'] = not_similar_enough_sample
    # @added 20210417 - Feature #4014: Ionosphere - inference
    # Allow the user to define the batch_size per similarity search
    motif_analysis[metric]['batch_size'] = int(batch_size)
    motif_analysis[metric]['top_matches'] = int(top_matches)
    motif_analysis[metric]['max_distance'] = float(max_distance)
    # @added 20210425 - Feature #4014: Ionosphere - inference
    # Added max_area_percent_diff for computing the area under the curve
    motif_analysis[metric]['max_area_percent_diff'] = float(
        max_area_percent_diff)

    fps_checked_for_motifs = []

    metric_dir = metric.replace('.', '/')
    metric_timeseries_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                          str(timestamp), metric_dir)

    # @added 20210418 - Feature #4014: Ionosphere - inference
    # Allow for the similarity search on saved_training_data
    if 'saved_training_data' in request.args:
        saved_training_data_str = request.args.get('saved_training_data',
                                                   'false')
        if saved_training_data_str == 'true':
            saved_metric_timeseries_dir = '%s_saved/%s/%s' % (
                settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir)
            if path.exists(saved_metric_timeseries_dir):
                metric_timeseries_dir = saved_metric_timeseries_dir
                logger.info('%s :: using saved training_data dir - %s' %
                            (function_str, saved_metric_timeseries_dir))

    metric_vars_file = '%s/%s.txt' % (metric_timeseries_dir, metric)
    timeseries_json = '%s/%s.json' % (metric_timeseries_dir, metric)
    full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60)
    full_duration_timeseries_json = '%s/%s.mirage.redis.%sh.json' % (
        metric_timeseries_dir, metric, str(full_duration_in_hours))
    try:
        metric_vars_dict = mirage_load_metric_vars(skyline_app,
                                                   metric_vars_file, True)
    except Exception as e:
        logger.error(
            'error :: inference :: failed to load metric variables from check file - %s - %s'
            % (metric_vars_file, e))
    if not metric_vars_dict:
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric][
            'reason'] = 'could not load training data variables'
        return motif_analysis, fail_msg, trace

    full_duration = metric_vars_dict['metric_vars']['full_duration']

    # Determine the metric details from the database
    metric_id = 0
    metric_db_object = {}
    try:
        metric_db_object = get_metrics_db_object(metric)
    except Exception as e:
        logger.error('error :: %s :: failed to get_metrics_db_object - %s' %
                     (function_str, e))
    try:
        metric_id = int(metric_db_object['id'])
    except Exception as e:
        logger.error(
            'error :: %s :: failed to determine metric_id from metric_db_object %s - %s'
            % (function_str, str(metric_db_object), e))
        metric_id = 0
    if not metric_id:
        logger.error(
            'error :: %s :: failed to get metric id for %s from the database' %
            (function_str, str(metric)))
        fail_msg = 'failed to get metric id'
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric]['reason'] = 'could not determine metric id'
        return motif_analysis, fail_msg, trace

    # @modified 20210419 - Feature #4014: Ionosphere - inference
    # Create a unique dir for each batch_size max_distance
    # motif_images_dir = '%s/motifs' % metric_timeseries_dir
    motif_images_dir = '%s/motifs/batch_size.%s/top_matches.%s/max_distance.%s' % (
        metric_timeseries_dir, str(batch_size), str(top_matches),
        str(max_distance))

    if not path.exists(motif_images_dir):
        # provision motifs image resources
        mkdir_p(motif_images_dir)

    full_durations = [full_duration]
    if path.isfile(full_duration_timeseries_json):
        full_durations = [full_duration, settings.FULL_DURATION]
    logger.info('%s :: full_durations - %s' %
                (function_str, str(full_durations)))

    # Loop through analysis per full_duration
    for full_duration in full_durations:
        start_full_duration = timer()
        fp_ids = []
        try:
            query = 'SELECT id,last_matched from ionosphere WHERE metric_id=%s AND full_duration=%s AND enabled=1 ORDER BY last_matched DESC' % (
                str(metric_id), str(full_duration))
            results = mysql_select(skyline_app, query)
            for row in results:
                fp_ids.append(int(row[0]))
        except Exception as e:
            logger.error(
                'error :: %s :: failed to get fp ids via mysql_select from %s - %s'
                % (function_str, metric, e))

        logger.info('%s :: metric_id: %s, full_duration: %s, fp_ids: %s' %
                    (function_str,
                     str(metric_id), str(full_duration), str(fp_ids)))

        if not fp_ids:
            continue

        # Now there are known fps, load the timeseries
        if full_duration == settings.FULL_DURATION:
            timeseries_json_file = full_duration_timeseries_json
        else:
            timeseries_json_file = timeseries_json
        try:
            with open((timeseries_json_file), 'r') as f:
                raw_timeseries = f.read()
            timeseries_array_str = str(raw_timeseries).replace('(',
                                                               '[').replace(
                                                                   ')', ']')
            del raw_timeseries
            timeseries = literal_eval(timeseries_array_str)
            del timeseries_array_str
        except Exception as e:
            logger.error(
                'error :: %s :: failed to load timeseries for %s from %s - %s'
                % (function_str, metric, timeseries_json_file, e))
            continue

        anomalous_timeseries_subsequence = []
        for timestamp_float, value in timeseries[-int(batch_size):]:
            anomalous_timeseries_subsequence.append(
                [int(timestamp_float), value])

        logger.info(
            '%s :: looking for motif in trained fps of full_duration: %s' %
            (function_str, str(full_duration)))
        dataset = [float(item[1]) for item in anomalous_timeseries_subsequence]

        max_y = max(dataset)
        min_y = min(dataset)

        # full_y_range = max_y - min_y

        # range_padding_percent = range_padding
        # This was just a test that did not have the desired results
        # if full_y_range < 10:
        #     range_padding_percent = 35
        # if full_y_range < 5:
        #     range_padding_percent = 75
        # if full_y_range < 2:
        #    range_padding_percent = 100

        use_range_padding = ((max_y - min_y) / 100) * range_padding
        if min_y > 0 and (min_y - use_range_padding) > 0:
            min_y_padded = min_y - use_range_padding
        else:
            min_y_padded = min_y
        max_y_padded = max_y + use_range_padding
        if min_y_padded == max_y_padded:
            min_y_padded = min_y_padded - (
                (min_y_padded / 100) * range_padding)
            max_y_padded = max_y_padded + (
                (max_y_padded / 100) * range_padding)
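        # Worked example (assumed values): with max_y=100.0, min_y=50.0 and
        # range_padding=10, use_range_padding = ((100.0 - 50.0) / 100) * 10 = 5.0,
        # so the acceptable range becomes min_y_padded=45.0 to max_y_padded=105.0.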

        # anomalous_ts = np.array(dataset)
        anomalous_ts = dataset

        mass2_batch_times = []
        exact_match_times = []

        nan = np.array([np.nan])
        nanj = complex(0.0, float('nan'))
        empty_dists = np.array(nan + nanj)

        # plotted = False
        count = 0

        # fp_ids = [fp_id for index, fp_id in enumerate(fp_ids) if index == 0]

        # motifs_found = []
        # exact_matches_found = []
        # fps_timeseries = {}

        for fp_id in fp_ids:
            if (time.time() - start) >= 20:
                break
            # Attempt to surface the fp timeseries from memcache and/or db
            # @modified 20210424 - Feature #4014: Ionosphere - inference
            #                      Task #4030: refactoring
            fp_timeseries = None
            try:
                fp_timeseries = get_fp_timeseries(skyline_app, metric_id,
                                                  fp_id)
            except Exception as e:
                logger.error(
                    'inference :: did not get fp timeseries with get_fp_timeseries(%s, %s, %s) - %s'
                    % (skyline_app, str(metric_id), str(fp_id), e))
            if not fp_timeseries:
                continue

            relate_dataset = [float(item[1]) for item in fp_timeseries]

            fps_timeseries[fp_id] = fp_timeseries

            current_best_indices = []
            current_best_dists = []
            best_indices = None
            best_dists = None

            try:
                logger.info(
                    '%s :: running mts.mass2_batch fp_id: %s, full_duration: %s, batch_size: %s, top_matches: %s, max_distance: %s, motif_size: %s'
                    % (function_str, str(fp_id), str(full_duration),
                       str(batch_size), str(top_matches), str(max_distance),
                       str(len(anomalous_ts))))

                # @added 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than possible kth that can be found
                # mts.mass2_batch error: kth(=50) out of bounds (16)
                use_top_matches = int(top_matches)
                if (len(fp_timeseries) / int(batch_size)) <= int(top_matches):
                    use_top_matches = round(
                        len(fp_timeseries) / int(batch_size)) - 1
                    if use_top_matches == 2:
                        use_top_matches = 1
                    logger.info(
                        '%s :: adjusting top_matches to %s (the maximum possible top - 1) as kth(=%s) will be out of bounds mts.mass2_batch'
                        %
                        (function_str, str(use_top_matches), str(top_matches)))
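                # Worked example (assumed values): with len(fp_timeseries)=1008,
                # batch_size=72 and top_matches=50, 1008 / 72 = 14 which is
                # <= 50, so use_top_matches becomes round(14) - 1 = 13 to keep
                # kth within bounds for mts.mass2_batch.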

                start_mass2_batch = timer()
                # @modified 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than possible kth that can be found
                # best_indices, best_dists = mts.mass2_batch(relate_dataset, anomalous_ts, batch_size=int(batch_size), top_matches=int(top_matches))
                best_indices, best_dists = mts.mass2_batch(
                    relate_dataset,
                    anomalous_ts,
                    batch_size=int(batch_size),
                    top_matches=int(use_top_matches))
                end_mass2_batch = timer()
                mass2_batch_times.append((end_mass2_batch - start_mass2_batch))
                current_best_indices = best_indices.tolist()
                current_best_dists = best_dists.tolist()

                # @added 20210412 - Feature #4014: Ionosphere - inference
                #                   Branch #3590: inference
                # Add fp_id to fps_checked_for_motifs to enable ionosphere to update the
                # motif related columns in the ionosphere database table
                fps_checked_for_motifs.append(fp_id)
            except Exception as e:
                logger.error('error :: %s :: %s mts.mass2_batch error: %s' %
                             (function_str, str(fp_id), str(e)))
                continue
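            # Note: best_indices and best_dists are arrays of the top
            # use_top_matches candidate subsequence start indices and their
            # (complex) distances returned by mts.mass2_batch; an exact match
            # surfaces as a nan+nanj distance (see the note below).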

            try:
                if str(list(best_dists)) == str(list(empty_dists)):
                    logger.info(
                        '%s :: mts.mass2_batch no similar motif from fp id %s - best_dists: %s'
                        % (function_str, str(fp_id), str(list(best_dists))))
                    continue
            except Exception as e:
                dev_null = e

            if not current_best_indices[0]:
                continue
            # if list(best_indices)[0] != anomalous_index:
            #     continue
            # If the best_dists is > 1 they are not very similar
            # if list(best_dists)[0].real > 1.0:
            #     continue
            # if list(best_indices)[0] and best_dists:
            for index, best_dist in enumerate(current_best_dists):
                try:
                    motif_added = False
                    """
                    Note: mass_ts finds similar motifs NOT the same motif, the same motif
                    will result in the best_dists being a nan+nanj
                    So it is DIYed
                    """
                    try:
                        # @modified 20210414 - Feature #4014: Ionosphere - inference
                        #                      Branch #3590: inference
                        # Store the not anomalous motifs
                        # motif = [fp_id, current_best_indices[index], best_dist.real]
                        motif = [
                            fp_id, current_best_indices[index], best_dist.real,
                            anomalous_timeseries_subsequence, full_duration
                        ]
                    except Exception as e:
                        dev_null = e
                        motif = []

                    # if list(best_indices)[0] and best_dists:
                    # If it is greater than 1.0 it is not similar
                    # if best_dist.real > 1.0:
                    # if best_dist.real > IONOSPHERE_INFERENCE_MASS_TS_MAX_DISTANCE:
                    if best_dist.real > float(max_distance):
                        continue
                    else:
                        if motif:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                    if not motif_added:
                        if best_dist == nanj:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                    if not motif_added:
                        if str(best_dist) == 'nan+nanj':
                            count += 1
                            motifs_found.append([
                                fp_id, current_best_indices[index], 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motif_added = True
                    if not motif_added:
                        if best_dist == empty_dists:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine if fp id %s timeseries at index %s was a match - %s'
                        % (function_str, str(fp_id),
                           str(current_best_indices[index]), e))
                    continue

            # FIND EXACT MATCHES
            # Seeing as I cannot reproduce finding nan+nanj which represents an
            # exact match with mts.mass2_batch, do it DIY style - iterate the
            # timeseries and create a batch_size subsequence for every index and
            # compare the values to the anomalous_ts for an exact match.
            # This takes ~0.024850 seconds on a timeseries with 10079 datapoints
            # @modified 20210418 - Feature #4014: Ionosphere - inference
            # However finding exact matches can add ~2.5 seconds on a 90 minute
            # batch_size and, with a proportionally scaled max_distance of say 15,
            # finding an exact match in a longer sequence is less important: the
            # greater the batch_size, the greater the likely variability and the
            # lower the chance of an exact match.  So save the 2.5 seconds.
            # UNLESS
            # At a 5 (to 10) batch_size and a max_distance of 1.0 an exact match
            # can be found.  Exact matches are quite frequent and, with such
            # little variability, similar matches may not be found.  Therefore
            # find_exact_matches has its place.
            # A CAVEAT here is that boring metrics that change and have low
            # variability even at a larger batch_size could also benefit and
            # possibly achieve better accuracy from the use of find_exact_matches
            # as they can be shapelets that resemble a batch_size 5 shapelet.
            # It would perhaps be possible to use one or more of the features
            # profile tsfresh values to identify these types of shapelets, if
            # you knew which feature/s were most descriptive of this type of
            # shapelet, e.g. 'value__skewness': 3.079477685394873, etc (maybe)
            # However I predict that this method will perform worst on these
            # types of shapelets.
            # find_exact_matches = False
            # Exact matches can be found at batch sizes of 500 where similar
            # matches are not, so always run it.
            find_exact_matches = True
            find_exact_matches_run = True

            if int(batch_size) < 10:
                find_exact_matches = True
                find_exact_matches_run = True
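            # Illustrative example (assumed values): with batch_size=5 the loop
            # below compares every 5 value window of the fp timeseries, e.g. the
            # values of indexed_relate_dataset[3:8], against anomalous_ts and
            # records an exact match with a 0.0 distance when they are identical.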

            if find_exact_matches:
                try:
                    start_exact_match = timer()
                    indexed_relate_dataset = []
                    for index, item in enumerate(relate_dataset):
                        indexed_relate_dataset.append([index, item])
                    last_index = indexed_relate_dataset[-1][0]
                    current_index = 0
                    while current_index < last_index:
                        subsequence = [
                            value for index, value in
                            indexed_relate_dataset[current_index:(
                                current_index + int(batch_size))]
                        ]
                        if subsequence == anomalous_ts:
                            exact_matches_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motifs_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                        current_index += 1
                    end_exact_match = timer()
                    exact_match_times.append(
                        (end_exact_match - start_exact_match))
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine if any exact matches could be found in fp id %s timeseries - %s'
                        % (function_str, str(fp_id), e))

        logger.info(
            '%s :: mts.mass2_batch runs on %s fps of full_duration %s in %.6f seconds'
            % (function_str, str(len(mass2_batch_times)), str(full_duration),
               sum(mass2_batch_times)))
        if find_exact_matches_run:
            logger.info(
                '%s :: exact_match runs on %s fps of full_duration %s in %.6f seconds'
                % (function_str, str(len(exact_match_times)),
                   str(full_duration), sum(exact_match_times)))
        end_full_duration = timer()
        logger.info(
            '%s :: analysed %s fps of full_duration %s in %.6f seconds' %
            (function_str, str(len(fp_ids)), str(full_duration),
             (end_full_duration - start_full_duration)))

        # Patterns are sorted
        sorted_motifs = []
        motifs_found_in_fps = []
        if motifs_found:
            sorted_motifs = sorted(motifs_found, key=lambda x: x[2])
            for item in sorted_motifs:
                motifs_found_in_fps.append(item[0])
        logger.info('%s :: %s motifs found' %
                    (function_str, str(len(sorted_motifs))))

        for motif in sorted_motifs:
            if (time.time() - start) >= 25:
                break
            try:
                add_match = False
                all_in_range = False

                fp_id = motif[0]
                best_index = motif[1]
                best_dist = motif[2]

                # @added 20210414 - Feature #4014: Ionosphere - inference
                #                   Branch #3590: inference
                # Store the not anomalous motifs
                motif_sequence = motif[3]

                motif_full_duration = motif[4]

                match_type = 'not_similar_enough'

                if motif in exact_matches_found:
                    add_match = True
                    match_type = 'exact'
                    all_in_range = True
                    exact_motifs += 1
                full_relate_timeseries = fps_timeseries[fp_id]
                # full_relate_dataset = [float(item[1]) for item in full_relate_timeseries]
                relate_timeseries = [
                    item for index, item in enumerate(full_relate_timeseries)
                    if index >= best_index and index < (best_index +
                                                        int(batch_size))
                ]
                relate_dataset = [item[1] for item in relate_timeseries]

                if not add_match:
                    all_in_range = True
                    for value in relate_dataset:
                        if value < min_y_padded:
                            all_in_range = False
                            break
                        if value > max_y_padded:
                            all_in_range = False
                            break
                    if all_in_range:
                        related_max_y = max(relate_dataset)
                        if related_max_y < (max_y - range_padding):
                            all_in_range = False
                        if related_max_y > (max_y + range_padding):
                            all_in_range = False
                        related_min_y = min(relate_dataset)
                        if related_min_y < (min_y - range_padding):
                            all_in_range = False
                        if related_min_y > (min_y + range_padding):
                            all_in_range = False
                    if all_in_range:
                        logger.info(
                            '%s :: ALL IN RANGE - all_in_range: %s, motif: %s'
                            % (function_str, str(all_in_range),
                               str(relate_dataset[0:2])))
                        add_match = True
                        match_type = 'all_in_range'
                        similar_motifs += 1

                    # @added 20210425 - Feature #4014: Ionosphere - inference
                    # Compute the area using the composite trapezoidal rule.
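                    # Illustrative example: np.trapz([2.0, 4.0, 3.0], dx=1)
                    # returns 6.5, i.e. 0.5*(2+4) + 0.5*(4+3).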
                    motif_area = None
                    fp_motif_area = None
                    percent_different = None
                    try:
                        batch_size_dataset = [
                            float(item[1]) for item in motif_sequence
                        ]
                        y_motif = np.array(batch_size_dataset)
                        motif_area = np.trapz(y_motif, dx=1)
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get motif_area with np.trapz - %s'
                            % (function_str, e))
                    try:
                        y_fp_motif = np.array(relate_dataset)
                        fp_motif_area = np.trapz(y_fp_motif, dx=1)
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get fp_motif_area with np.trapz - %s'
                            % (function_str, e))
                    # Determine the percentage difference (as a
                    # positive value) of the areas under the
                    # curves.
                    if motif_area and fp_motif_area:
                        percent_different = get_percent_different(
                            fp_motif_area, motif_area, True)
                        if percent_different > max_area_percent_diff:
                            if add_match:
                                logger.info(
                                    '%s :: AREA TOO DIFFERENT - not adding all_in_range match'
                                    % (function_str))
                                add_match = False
                            # BUT ...
                            if best_dist < 3 and not add_match:
                                logger.info(
                                    '%s :: DISTANCE VERY SIMILAR - adding match even though area_percent_diff is greater than max_area_percent_diff because best_dist: %s'
                                    % (function_str, str(best_dist)))
                                add_match = True
                                match_type = 'distance'
                                distance_motifs += 1

                if similarity == 'all':
                    if not add_match:
                        not_similar_motifs += 1
                        if not_similar_enough_sample >= 10:
                            continue
                        not_similar_enough_sample += 1
                        add_match = True
                        match_type = 'not_similar_enough'

                if add_match:
                    generation = 0
                    fp_id_row = None
                    try:
                        fp_id_row = get_ionosphere_fp_db_row(
                            skyline_app, int(fp_id))
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get_ionosphere_fp_db_row for fp_id %s - %s'
                            % (function_str, str(fp_id), e))
                    if fp_id_row:
                        try:
                            generation = fp_id_row['generation']
                        except Exception as e:
                            logger.error(
                                'error :: %s :: failed to get generation from fp_id_row for fp_id %s - %s'
                                % (function_str, str(fp_id), e))
                    if generation == 0:
                        generation_str = 'trained'
                    else:
                        generation_str = 'LEARNT'
                    motif_match_types = motif_match_types_dict()
                    type_id = motif_match_types[match_type]

                    motif_id = '%s-%s' % (str(fp_id), str(best_index))
                    motif_analysis[metric]['motifs'][motif_id] = {}
                    motif_analysis[metric]['motifs'][motif_id][
                        'metric_id'] = metric_id
                    motif_analysis[metric]['motifs'][motif_id]['fp_id'] = fp_id
                    motif_analysis[metric]['motifs'][motif_id][
                        'generation'] = generation
                    motif_analysis[metric]['motifs'][motif_id][
                        'index'] = best_index
                    motif_analysis[metric]['motifs'][motif_id][
                        'distance'] = best_dist
                    motif_analysis[metric]['motifs'][motif_id]['size'] = int(
                        batch_size)
                    motif_analysis[metric]['motifs'][motif_id][
                        'max_distance'] = float(max_distance)
                    motif_analysis[metric]['motifs'][motif_id][
                        'timestamp'] = timestamp
                    motif_analysis[metric]['motifs'][motif_id][
                        'type_id'] = type_id
                    motif_analysis[metric]['motifs'][motif_id][
                        'type'] = match_type
                    motif_analysis[metric]['motifs'][motif_id][
                        'full_duration'] = motif_full_duration
                    # @added 20210414 - Feature #4014: Ionosphere - inference
                    #                   Branch #3590: inference
                    # Store the not anomalous motifs
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_timeseries'] = anomalous_timeseries_subsequence
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_sequence'] = motif_sequence
                    not_anomalous_timestamp = int(
                        anomalous_timeseries_subsequence[-1][0])
                    graph_period_seconds = not_anomalous_timestamp - int(
                        anomalous_timeseries_subsequence[0][0])
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_period_seconds'] = graph_period_seconds
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_period_minutes'] = round(graph_period_seconds /
                                                        60)

                    motif_analysis[metric]['motifs'][motif_id]['image'] = None

                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_area'] = motif_area
                    motif_analysis[metric]['motifs'][motif_id][
                        'fp_motif_area'] = fp_motif_area
                    motif_analysis[metric]['motifs'][motif_id][
                        'area_percent_diff'] = percent_different
                    motif_analysis[metric]['motifs'][motif_id][
                        'max_area_percent_diff'] = max_area_percent_diff

                    if (time.time() - start) >= 25:
                        continue

                    graph_image_file = '%s/motif.%s.%s.%s.with_max_distance.%s.png' % (
                        motif_images_dir, motif_id, match_type,
                        str(batch_size), str(max_distance))
                    plotted_image = False
                    on_demand_motif_analysis = True
                    if not path.isfile(graph_image_file):
                        plotted_image, plotted_image_file = plot_motif_match(
                            skyline_app, metric, timestamp, fp_id,
                            full_duration, generation_str, motif_id,
                            best_index, int(batch_size), best_dist, type_id,
                            relate_dataset, anomalous_timeseries_subsequence,
                            graph_image_file, on_demand_motif_analysis)
                    else:
                        plotted_image = True
                        logger.info('%s :: plot already exists - %s' %
                                    (function_str, str(graph_image_file)))
                    if plotted_image:
                        motif_analysis[metric]['motifs'][motif_id][
                            'image'] = graph_image_file
                    else:
                        logger.error('failed to plot motif match')
                        graph_image_file = None
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: inference :: with fp id %s processing motif at index: %s - %s'
                    % (str(fp_id), str(motif[1]), str(e)))
                continue
    end_timer = timer()
    motif_analysis[metric]['fps_checked'] = fps_checked_for_motifs
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['distance_motifs'] = distance_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric][
        'not_similar_enough_sample'] = not_similar_enough_sample

    motif_analysis_file = '%s/motif.analysis.similarity_%s.batch_size_%s.top_matches_%s.max_distance_%s.dict' % (
        motif_images_dir, similarity, str(batch_size), str(top_matches),
        str(max_distance))
    try:
        write_data_to_file(skyline_app, motif_analysis_file, 'w',
                           str(motif_analysis))
    except Exception as e:
        trace = traceback.format_exc()
        logger.error('%s' % trace)
        fail_msg = '%s :: error :: failed to write motif_analysis_file - %s' % (
            function_str, motif_analysis_file)
        logger.error('%s' % fail_msg)
        dev_null = e

    motif_ids = list(motif_analysis[metric]['motifs'].keys())
    logger.info(
        '%s :: %s motif matches found, %s fps were checked and motifs plotted in %.6f seconds for %s'
        % (function_str, str(len(motif_ids)), str(len(fps_checked_for_motifs)),
           (end_timer - start_timer), metric))
    if dev_null:
        del dev_null
    return motif_analysis, fail_msg, trace
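
# Hypothetical usage sketch - the metric name, timestamp and parameter values
# below are illustrative assumptions, and in practice this function is invoked
# from a webapp request handler because it reads request.args.
if __name__ == '__main__':
    example_metric = 'telegraf.host-1.cpu.usage_user'
    motif_analysis, fail_msg, trace = on_demand_motif_analysis(
        example_metric, 1618833600, 'all', 180, 50, 15.0, 10, 20)
    print('%s motif matches found for %s' % (
        str(len(motif_analysis[example_metric]['motifs'])), example_metric))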