Example #1
def significant_change_window_percent_sustained(current_skyline_app,
                                                parent_pid, timeseries,
                                                algorithm_parameters):
    """
    A data point is anomalous if it is x percent different from the median of
    the windows (resampled to ``window`` seconds) of the last ``period``
    seconds.  A few examples:

    If the value is 10% different from the median value of the 10min windows of
    the last hour.
    algorithm_parameters: {'window': 600, 'percent': 10.0, 'period': 3600}

    If the value is 50% different from the median value of the 10min windows of
    the last day.
    algorithm_parameters: {'window': 600, 'percent': 50.0, 'period': 86400}

    :param current_skyline_app: the Skyline app executing the algorithm.  This
        will be passed to the algorithm by Skyline.  This is **required** for
        error handling and logging.  You do not have to worry about handling
        the argument in the scope of the custom algorithm itself, but the
        algorithm must accept it as the first argument.
    :param parent_pid: the parent pid which is executing the algorithm, this is
        **required** for error handling and logging.  You do not have to worry
        about handling this argument in the scope of the algorithm, but the
        algorithm must accept it as the second argument.
    :param timeseries: the time series as a list e.g. ``[[1578916800.0, 29.0],
        [1578920400.0, 55.0], ... [1580353200.0, 55.0]]``
    :param algorithm_parameters: a dictionary of any parameters and their
        arguments you wish to pass to the algorithm.
    :type current_skyline_app: str
    :type parent_pid: int
    :type timeseries: list
    :type algorithm_parameters: dict
    :return: a tuple of (anomalous, anomalyScore), where anomalous is True,
        False or None
    :rtype: tuple

    """

    # You MUST define the algorithm_name
    algorithm_name = 'significant_change_window_percent_sustained'

    # If you want to log you can, but logging should only be done during
    # testing and development
    def get_log(current_skyline_app):
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        return current_logger

    # Define the default state of None and None.  anomalous does not default
    # to False as that would not be correct; False is only correct if the
    # algorithm determines the data point is not anomalous.  The same is true
    # for the anomalyScore.
    anomalous = None
    anomalyScore = None

    # Use the algorithm_parameters to determine whether debug_logging is enabled
    debug_logging = None
    try:
        debug_logging = algorithm_parameters['debug_logging']
    except:
        debug_logging = False
    if debug_logging:
        try:
            current_logger = get_log(current_skyline_app)
            current_logger.debug(
                'debug :: %s :: debug_logging enabled with algorithm_parameters - %s'
                % (algorithm_name, str(algorithm_parameters)))
        except:
            # This except pattern MUST be used in ALL custom algorithms to
            # facilitate the traceback from any errors.  We want the algorithm
            # to run super fast and without spamming the log with lots of
            # errors.  But we do not want the function returning without
            # reporting anything to the log, so the pythonic except is used to
            # "sample" any algorithm errors to a tmp file and report once per
            # run rather than spewing tons of errors into the log e.g.
            # analyzer.log
            record_algorithm_error(current_skyline_app, parent_pid,
                                   algorithm_name, traceback.format_exc())
            # Return None and None as the algorithm could not determine True or False
            return (anomalous, anomalyScore)

    # window in seconds
    window = 600
    try:
        window = algorithm_parameters['window']
    except:
        pass
    resample_window = '%sS' % str(window)

    # Allow the percent parameter to be passed in the
    # algorithm_parameters
    percent = 10.0
    try:
        percent = algorithm_parameters['percent']
    except:
        pass

    period = 3600
    try:
        period = algorithm_parameters['period']
    except:
        pass

    # Optionally return the calculated percent different as the anomalyScore
    # rather than the binary 1.0 or 0.0 anomalyScore
    return_percent_as_anomalyScore = False
    try:
        return_percent_as_anomalyScore = algorithm_parameters[
            'return_percent_as_anomalyScore']
    except:
        pass

    # Optionally require the last times_in_a_row data points to all breach
    # the percent threshold before the data point is classed as anomalous
    times_in_a_row = 0
    try:
        times_in_a_row = algorithm_parameters['times_in_a_row']
    except:
        pass

    # To test max_execution_time enable a sleep
    # sleep(1)

    # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except
    try:
        timestamp = int(timeseries[-1][0])
        value = timeseries[-1][1]
        values = []
        if times_in_a_row:
            values = [item[1] for item in timeseries[-times_in_a_row:]]
        from_timestamp = timestamp - period
        applicable_timeseries = [
            item for item in timeseries if int(item[0]) >= from_timestamp
        ]
        del timeseries
        df = pd.DataFrame(applicable_timeseries, columns=['date', 'value'])
        df['date'] = pd.to_datetime(df['date'], unit='s')
        datetime_index = pd.DatetimeIndex(df['date'].values)
        df = df.set_index(datetime_index)
        df.drop('date', axis=1, inplace=True)
        resampled_df = df.resample(resample_window, origin='epoch').median()
        del df
        resampled_values = resampled_df['value'].values.tolist()
        series = pd.Series(resampled_values)
        del resampled_df
        median = series.median()
        percent_different = get_percent_different(median, value, False)
        if percent_different > percent:
            anomalous = True
            anomalyScore = 1.0
        else:
            anomalous = False
            anomalyScore = 0.0
        if return_percent_as_anomalyScore:
            anomalyScore = percent_different
        anomalous_values = []
        anomalous_count = 0
        if values:
            for v in values:
                percent_different = get_percent_different(median, v, False)
                if percent_different > percent:
                    anomalous = True
                    anomalyScore = 1.0
                    anomalous_count += 1
                else:
                    anomalous = False
                    anomalyScore = 0.0
                if return_percent_as_anomalyScore:
                    anomalyScore = percent_different
                anomalous_values.append([anomalous, anomalyScore])
        if anomalous_values and anomalous_count:
            if anomalous_count == times_in_a_row:
                anomalous = True
                anomalyScores = [item[1] for item in anomalous_values]
                anomalyScore = sum(anomalyScores) / len(anomalyScores)
    except StopIteration:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors.
        # But we do not want the function returning without reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        return (None, None)
    except:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        return (None, None)
    return (anomalous, anomalyScore)
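
A minimal usage sketch (the app name and pid are illustrative values, and the
module-level imports and helpers noted above are assumed to be available):

import time

now = int(time.time())
# Two hours of flat 10.0 values at 60 second intervals, then a final value of
# 25.0, which is 150 percent different from the median of the last hour
timeseries = [[float(now - (i * 60)), 10.0] for i in range(120, 0, -1)]
timeseries.append([float(now), 25.0])
anomalous, anomalyScore = significant_change_window_percent_sustained(
    'analyzer', 12345, timeseries,
    {'window': 600, 'percent': 10.0, 'period': 3600})
# anomalous -> True, anomalyScore -> 1.0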
Example #2
def minmax_scale_check(fp_id_metric_ts, anomalous_timeseries, range_tolerance,
                       range_tolerance_percentage, fp_id, base_name,
                       metric_timestamp, features_percentage_diff):
    """
    Called by nothing yet.  Used to run a minmax scaling check and determine if
    the features_sum of the 2 minmax scaled timeseries match.

    :param fp_id_metric_ts:
    :param anomalous_timeseries:
    :param range_tolerance:
    :param range_tolerance_percentage:
    :param fp_id:
    :param base_name:
    :param metric_timestamp:
    :param features_percentage_diff:
    :type fp_id_metric_ts: list
    :return: (minmax_not_anomalous, minmax_fp_features_sum, minmax_fp_features_count, minmax_anomalous_features_sum, minmax_anomalous_features_count)
    :rtype: tuple

    """

    # @modified 20191115 - Branch #3262: py3
    # not_anomalous = False

    # Initialise the result variables so they are always defined, even when
    # the conditional branches below do not set them
    minmax_not_anomalous = False
    lower_range_similar = False
    upper_range_similar = False
    range_similar = False
    minmax_fp_features_count = 0
    minmax_anomalous_features_count = 0

    try:
        minmax_fp_values = [x[1] for x in fp_id_metric_ts]
        min_fp_value = min(minmax_fp_values)
        max_fp_value = max(minmax_fp_values)
    except:
        min_fp_value = False
        max_fp_value = False
    try:
        minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
        min_anomalous_value = min(minmax_anomalous_values)
        max_anomalous_value = max(minmax_anomalous_values)
    except:
        min_anomalous_value = False
        max_anomalous_value = False
    lower_range_not_same = True
    try:
        if int(min_fp_value) == int(min_anomalous_value):
            lower_range_not_same = False
            lower_range_similar = True
            logger.info(
                'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                % (str(min_fp_value), str(min_anomalous_value)))
    except:
        lower_range_not_same = True
    if min_fp_value and min_anomalous_value and lower_range_not_same:
        if int(min_fp_value) == int(min_anomalous_value):
            lower_range_similar = True
            logger.info(
                'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                % (str(min_fp_value), str(min_anomalous_value)))
        else:
            lower_min_fp_value = int(min_fp_value -
                                     (min_fp_value * range_tolerance))
            upper_min_fp_value = int(min_fp_value +
                                     (min_fp_value * range_tolerance))
            if int(min_anomalous_value) in range(lower_min_fp_value,
                                                 upper_min_fp_value):
                lower_range_similar = True
                logger.info(
                    'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other'
                    % (str(min_fp_value), str(min_anomalous_value),
                       str(range_tolerance_percentage)))
    if not lower_range_similar:
        logger.info(
            'lower range of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar'
            % (str(min_fp_value), str(min_anomalous_value)))
    upper_range_not_same = True
    try:
        if int(max_fp_value) == int(max_anomalous_value):
            upper_range_not_same = False
            upper_range_similar = True
            logger.info(
                'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                % (str(max_fp_value), str(max_anomalous_value)))
    except:
        upper_range_not_same = True
    if max_fp_value and max_anomalous_value and lower_range_similar and upper_range_not_same:
        # @added 20180717 - Task #2446: Optimize Ionosphere
        #                   Feature #2404: Ionosphere - fluid approximation
        # On low values such as 1 and 2, the range_tolerance
        # should be adjusted to account for the very small
        # range. TODO
        lower_max_fp_value = int(max_fp_value -
                                 (max_fp_value * range_tolerance))
        upper_max_fp_value = int(max_fp_value +
                                 (max_fp_value * range_tolerance))
        if int(max_anomalous_value) in range(lower_max_fp_value,
                                             upper_max_fp_value):
            upper_range_similar = True
            logger.info(
                'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other'
                % (str(max_fp_value), str(max_anomalous_value),
                   str(range_tolerance_percentage)))
        else:
            logger.info(
                'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar'
                % (str(max_fp_value), str(max_anomalous_value)))

    if lower_range_similar and upper_range_similar:
        range_similar = True
    else:
        logger.info(
            'the ranges of fp_id_metric_ts and anomalous_timeseries differ significantly - Min-Max scaling will be skipped'
        )

    minmax_fp_ts = []
    # if fp_id_metric_ts:
    if range_similar:
        if LOCAL_DEBUG:
            logger.debug(
                'debug :: creating minmax_fp_ts from minmax scaled fp_id_metric_ts'
            )
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            x_np = np.asarray(minmax_fp_values)
            # Min-Max scaling
            np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
            for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                minmax_fp_ts.append([ts[0], v])
            logger.info(
                'minmax_fp_ts list populated with the minmax scaled time series with %s data points'
                % str(len(minmax_fp_ts)))
        except:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: could not minmax scale fp id %s time series for %s' %
                (str(fp_id), str(base_name)))
        if not minmax_fp_ts:
            logger.error('error :: minmax_fp_ts list not populated')

    minmax_anomalous_ts = []
    anomalous_ts_values_count = len(anomalous_timeseries)
    if minmax_fp_ts:
        # Only process if they are approximately the same length
        minmax_fp_ts_values_count = len(minmax_fp_ts)
        if minmax_fp_ts_values_count - anomalous_ts_values_count in range(
                -14, 14):
            try:
                minmax_anomalous_values = [
                    x2[1] for x2 in anomalous_timeseries
                ]
                x_np = np.asarray(minmax_anomalous_values)
                # Min-Max scaling
                np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
                # NOTE: the scaled anomalous values are paired with the
                # fp_id_metric_ts timestamps here; the two series were
                # checked above to be approximately the same length
                for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                    minmax_anomalous_ts.append([ts[0], v])
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: could not minmax scale current time series anomalous_timeseries for fp id %s for %s'
                    % (str(fp_id), str(base_name)))
            if len(minmax_anomalous_ts) > 0:
                logger.info(
                    'minmax_anomalous_ts is populated with %s data points' %
                    str(len(minmax_anomalous_ts)))
            else:
                logger.error('error :: minmax_anomalous_ts is not populated')
        else:
            logger.info(
                'minmax scaled check will be skipped - anomalous_ts_values_count is %s and minmax_fp_ts is %s'
                % (str(anomalous_ts_values_count),
                   str(minmax_fp_ts_values_count)))

    minmax_fp_ts_csv = '%s/fpid.%s.%s.minmax_fp_ts.tsfresh.input.std.csv' % (
        settings.SKYLINE_TMP_DIR, str(fp_id), base_name)
    minmax_fp_fname_out = minmax_fp_ts_csv + '.transposed.csv'
    anomalous_ts_csv = '%s/%s.%s.minmax_anomalous_ts.tsfresh.std.csv' % (
        settings.SKYLINE_TMP_DIR, metric_timestamp, base_name)
    anomalous_fp_fname_out = anomalous_ts_csv + '.transposed.csv'

    # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
    # tsf_settings = ReasonableFeatureExtractionSettings()
    # tsf_settings.disable_progressbar = True

    minmax_fp_features_sum = None
    minmax_anomalous_features_sum = None
    if minmax_anomalous_ts and minmax_fp_ts:
        if LOCAL_DEBUG:
            logger.debug(
                'debug :: analyzing minmax_fp_ts and minmax_anomalous_ts')
        if not os.path.isfile(minmax_fp_ts_csv):
            if LOCAL_DEBUG:
                logger.debug('debug :: creating %s from minmax_fp_ts' %
                             minmax_fp_ts_csv)
            datapoints = minmax_fp_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            if LOCAL_DEBUG:
                if len(converted) > 0:
                    logger.debug('debug :: converted is populated')
                else:
                    logger.debug(
                        'debug :: error :: converted is not populated')
            for ts, value in converted:
                try:
                    utc_ts_line = '%s,%s,%s\n' % (base_name, str(
                        int(ts)), str(value))
                    with open(minmax_fp_ts_csv, 'a') as fh:
                        fh.write(utc_ts_line)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: could not write to file %s' %
                                 (str(minmax_fp_ts_csv)))
        else:
            logger.info('file found %s, using for data' % minmax_fp_ts_csv)

        if not os.path.isfile(minmax_fp_ts_csv):
            logger.error('error :: file not found %s' % minmax_fp_ts_csv)
        else:
            logger.info(
                'file exists to create the minmax_fp_ts data frame from - %s' %
                minmax_fp_ts_csv)

        try:
            df = pd.read_csv(minmax_fp_ts_csv,
                             delimiter=',',
                             header=None,
                             names=['metric', 'timestamp', 'value'])
            df.columns = ['metric', 'timestamp', 'value']
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to create data frame from %s' %
                         (str(minmax_fp_ts_csv)))
        try:
            df_features = extract_features(
                # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
                # df, column_id='metric', column_sort='timestamp', column_kind=None,
                # column_value=None, feature_extraction_settings=tsf_settings)
                df,
                default_fc_parameters=EfficientFCParameters(),
                column_id='metric',
                column_sort='timestamp',
                column_kind=None,
                column_value=None,
                disable_progressbar=True)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to create df_features from %s' %
                         (str(minmax_fp_ts_csv)))
        # Create transposed features csv
        if not os.path.isfile(minmax_fp_fname_out):
            # Transpose
            df_t = df_features.transpose()
            df_t.to_csv(minmax_fp_fname_out)
        else:
            if LOCAL_DEBUG:
                logger.debug('debug :: file exists - %s' % minmax_fp_fname_out)
        try:
            # Calculate the count and sum of the features values
            df_sum = pd.read_csv(minmax_fp_fname_out,
                                 delimiter=',',
                                 header=0,
                                 names=['feature_name', 'value'])
            df_sum.columns = ['feature_name', 'value']
            df_sum['feature_name'] = df_sum['feature_name'].astype(str)
            df_sum['value'] = df_sum['value'].astype(float)
            minmax_fp_features_count = len(df_sum['value'])
            minmax_fp_features_sum = df_sum['value'].sum()
            logger.info(
                'minmax_fp_ts - features_count: %s, features_sum: %s' %
                (str(minmax_fp_features_count), str(minmax_fp_features_sum)))
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: failed to create df_sum from %s' %
                         (str(minmax_fp_fname_out)))

        if minmax_fp_features_count > 0:
            if LOCAL_DEBUG:
                logger.debug(
                    'debug :: minmax_fp_features_count of the minmax_fp_ts is %s'
                    % str(minmax_fp_features_count))
        else:
            logger.error('error :: minmax_fp_features_count is %s' %
                         str(minmax_fp_features_count))

        if not os.path.isfile(anomalous_ts_csv):
            datapoints = minmax_anomalous_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                utc_ts_line = '%s,%s,%s\n' % (base_name, str(
                    int(ts)), str(value))
                with open(anomalous_ts_csv, 'a') as fh:
                    fh.write(utc_ts_line)

        df = pd.read_csv(anomalous_ts_csv,
                         delimiter=',',
                         header=None,
                         names=['metric', 'timestamp', 'value'])
        df.columns = ['metric', 'timestamp', 'value']
        df_features_current = extract_features(
            # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
            # df, column_id='metric', column_sort='timestamp', column_kind=None,
            # column_value=None, feature_extraction_settings=tsf_settings)
            df,
            default_fc_parameters=EfficientFCParameters(),
            column_id='metric',
            column_sort='timestamp',
            column_kind=None,
            column_value=None,
            disable_progressbar=True)

        # Create transposed features csv
        if not os.path.isfile(anomalous_fp_fname_out):
            # Transpose
            df_t = df_features_current.transpose()
            df_t.to_csv(anomalous_fp_fname_out)
        # Calculate the count and sum of the features values
        df_sum_2 = pd.read_csv(anomalous_fp_fname_out,
                               delimiter=',',
                               header=0,
                               names=['feature_name', 'value'])
        df_sum_2.columns = ['feature_name', 'value']
        df_sum_2['feature_name'] = df_sum_2['feature_name'].astype(str)
        df_sum_2['value'] = df_sum_2['value'].astype(float)
        minmax_anomalous_features_count = len(df_sum_2['value'])
        minmax_anomalous_features_sum = df_sum_2['value'].sum()
        logger.info(
            'minmax_anomalous_ts - minmax_anomalous_features_count: %s, minmax_anomalous_features_sum: %s'
            % (str(minmax_anomalous_features_count),
               str(minmax_anomalous_features_sum)))

    if minmax_fp_features_sum and minmax_anomalous_features_sum:
        percent_different = None
        # @modified 20210425 - Task #4030: refactoring
        #                      Feature #4014: Ionosphere - inference
        # Use the common function added
        # try:
        #     fp_sum_array = [minmax_fp_features_sum]
        #     calc_sum_array = [minmax_anomalous_features_sum]
        #     percent_different = 100
        #     sums_array = np.array([minmax_fp_features_sum, minmax_anomalous_features_sum], dtype=float)
        #     calc_percent_different = np.diff(sums_array) / sums_array[:-1] * 100.
        #     percent_different = calc_percent_different[0]
        #     logger.info('percent_different between minmax scaled features sums - %s' % str(percent_different))
        # except:
        #     logger.error(traceback.format_exc())
        #     logger.error('error :: failed to calculate percent_different from minmax scaled features sums')
        try:
            percent_different = get_percent_different(
                minmax_fp_features_sum, minmax_anomalous_features_sum, True)
            logger.info(
                'percent_different between minmax scaled features sums - %s' %
                str(percent_different))
        except Exception as e:
            logger.error(
                'error :: failed to calculate percent_different between minmax scaled features sums - %s'
                % e)

        if percent_different:
            almost_equal = None
            try:
                # np.testing.assert_array_almost_equal(fp_sum_array, calc_sum_array)
                np.testing.assert_array_almost_equal(
                    minmax_fp_features_sum, minmax_anomalous_features_sum)
                almost_equal = True
            except:
                almost_equal = False

            if almost_equal:
                minmax_not_anomalous = True
                logger.info(
                    'minmax scaled common features sums are almost equal, not anomalous'
                )

            # if diff_in_sums <= 1%:
            if percent_different < 0:
                new_pdiff = percent_different * -1
                percent_different = new_pdiff

            if percent_different < float(features_percentage_diff):
                minmax_not_anomalous = True
                # log
                logger.info(
                    'not anomalous - minmax scaled features profile match - %s - %s'
                    % (base_name, str(minmax_not_anomalous)))
                logger.info(
                    'minmax scaled calculated features sum are within %s percent of fp_id %s with %s, not anomalous'
                    % (str(features_percentage_diff), str(fp_id),
                       str(percent_different)))

            # @modified 20191115 - Branch #3262: py3
            # if minmax_not_anomalous:
            #     not_anomalous = True
            #     minmax = 1

            # Created time series resources for graphing in
            # the matched page

    return (minmax_not_anomalous, minmax_fp_features_sum,
            minmax_fp_features_count, minmax_anomalous_features_sum,
            minmax_anomalous_features_count)
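
For reference, the Min-Max scaling step used throughout these examples can be
illustrated in isolation (a minimal sketch, assuming only numpy):

import numpy as np

# Scale a small series into the 0..1 range, exactly as done above
values = [10.0, 12.0, 15.0, 11.0, 30.0]
x_np = np.asarray(values)
np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
print(np_minmax)  # [0.   0.1  0.25 0.05 1.  ]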
Example #3
def calculate_features_other_minmax(use_file, i_json_file, metric):

    fp_id = 'testing.feature2484'
    base_name = metric
    metric_timestamp = 'none'

    not_anomalous = False
    minmax_not_anomalous = False
    minmax = 0
    minmax_check = True

    with open(use_file, 'r') as f:
        raw_timeseries = f.read()
    # Convert the string of (timestamp, value) tuples into a list
    # representation that literal_eval can parse
    timeseries_array_str = str(raw_timeseries).replace('(',
                                                       '[').replace(')', ']')
    del raw_timeseries
    anomalous_timeseries = literal_eval(timeseries_array_str)
    anomalous_ts_values_count = len(anomalous_timeseries)

    with open(i_json_file, 'r') as f:
        fp_raw_timeseries = f.read()
    # Convert the string of (timestamp, value) tuples into a list
    # representation that literal_eval can parse
    fp_timeseries_array_str = str(fp_raw_timeseries).replace('(', '[').replace(
        ')', ']')
    del fp_raw_timeseries
    fp_id_metric_ts = literal_eval(fp_timeseries_array_str)
    fp_id_metric_ts_values_count = len(fp_id_metric_ts)

    try:
        range_tolerance = settings.IONOSPHERE_MINMAX_SCALING_RANGE_TOLERANCE
    except:
        range_tolerance = 0.15
    range_tolerance_percentage = range_tolerance * 100
    check_range = False
    range_similar = False
    if fp_id_metric_ts:
        if anomalous_ts_values_count > 0:
            check_range = True
    lower_range_similar = False
    upper_range_similar = False

    min_fp_value = None
    min_anomalous_value = None
    max_fp_value = None
    max_anomalous_value = None

    if check_range:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            min_fp_value = min(minmax_fp_values)
            max_fp_value = max(minmax_fp_values)
        except:
            min_fp_value = False
            max_fp_value = False
        try:
            minmax_anomalous_values = [x2[1] for x2 in anomalous_timeseries]
            min_anomalous_value = min(minmax_anomalous_values)
            max_anomalous_value = max(minmax_anomalous_values)
        except:
            min_anomalous_value = False
            max_anomalous_value = False
        lower_range_not_same = True
        try:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_not_same = False
                lower_range_similar = True
                print(
                    'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                    % (str(min_fp_value), str(min_anomalous_value)))
        except:
            lower_range_not_same = True
        if min_fp_value and min_anomalous_value and lower_range_not_same:
            if int(min_fp_value) == int(min_anomalous_value):
                lower_range_similar = True
                print(
                    'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                    % (str(min_fp_value), str(min_anomalous_value)))
            else:
                lower_min_fp_value = int(min_fp_value -
                                         (min_fp_value * range_tolerance))
                upper_min_fp_value = int(min_fp_value +
                                         (min_fp_value * range_tolerance))
                if int(min_anomalous_value) in range(lower_min_fp_value,
                                                     upper_min_fp_value):
                    lower_range_similar = True
                    print(
                        'min value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other'
                        % (str(min_fp_value), str(min_anomalous_value),
                           str(range_tolerance_percentage)))
        if not lower_range_similar:
            print(
                'lower range of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar'
                % (str(min_fp_value), str(min_anomalous_value)))
        upper_range_not_same = True
        try:
            if int(max_fp_value) == int(max_anomalous_value):
                upper_range_not_same = False
                upper_range_similar = True
                print(
                    'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are the same'
                    % (str(max_fp_value), str(max_anomalous_value)))
        except:
            upper_range_not_same = True
        if max_fp_value and max_anomalous_value and lower_range_similar and upper_range_not_same:
            # @added 20180717 - Task #2446: Optimize Ionosphere
            #                   Feature #2404: Ionosphere - fluid approximation
            # On low values such as 1 and 2, the range_tolerance
            # should be adjusted to account for the very small
            # range. TODO
            lower_max_fp_value = int(max_fp_value -
                                     (max_fp_value * range_tolerance))
            upper_max_fp_value = int(max_fp_value +
                                     (max_fp_value * range_tolerance))
            if int(max_anomalous_value) in range(lower_max_fp_value,
                                                 upper_max_fp_value):
                upper_range_similar = True
                print(
                    'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are similar within %s percent of each other'
                    % (str(max_fp_value), str(max_anomalous_value),
                       str(range_tolerance_percentage)))
            else:
                print(
                    'max value of fp_id_metric_ts (%s) and anomalous_timeseries (%s) are not similar'
                    % (str(max_fp_value), str(max_anomalous_value)))
    if lower_range_similar and upper_range_similar:
        range_similar = True
    else:
        print(
            'the ranges of fp_id_metric_ts and anomalous_timeseries differ significantly - Min-Max scaling will be skipped'
        )

    minmax_fp_ts = []
    # if fp_id_metric_ts:
    if range_similar:
        try:
            minmax_fp_values = [x[1] for x in fp_id_metric_ts]
            x_np = np.asarray(minmax_fp_values)
            # Min-Max scaling
            np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
            for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                minmax_fp_ts.append([ts[0], v])
            print(
                'minmax_fp_ts list populated with the minmax scaled time series with %s data points'
                % str(len(minmax_fp_ts)))
            del minmax_fp_values
        except:
            print(
                'error :: could not minmax scale fp id %s time series for %s' %
                (str(fp_id), str(base_name)))
        if not minmax_fp_ts:
            print('error :: minmax_fp_ts list not populated')

    minmax_anomalous_ts = []
    if minmax_fp_ts:
        # Only process if they are approximately the same length
        minmax_fp_ts_values_count = len(minmax_fp_ts)
        if minmax_fp_ts_values_count - anomalous_ts_values_count in range(
                -14, 14):
            try:
                minmax_anomalous_values = [
                    x2[1] for x2 in anomalous_timeseries
                ]
                x_np = np.asarray(minmax_anomalous_values)
                # Min-Max scaling
                np_minmax = (x_np - x_np.min()) / (x_np.max() - x_np.min())
                for (ts, v) in zip(fp_id_metric_ts, np_minmax):
                    minmax_anomalous_ts.append([ts[0], v])
                del anomalous_timeseries
                del minmax_anomalous_values
            except:
                print(
                    'error :: could not minmax scale current time series anomalous_timeseries for fp id %s for %s'
                    % (str(fp_id), str(base_name)))
            if len(minmax_anomalous_ts) > 0:
                print('minmax_anomalous_ts is populated with %s data points' %
                      str(len(minmax_anomalous_ts)))
            else:
                print('error :: minmax_anomalous_ts is not populated')
        else:
            print(
                'minmax scaled check will be skipped - anomalous_ts_values_count is %s and minmax_fp_ts is %s'
                % (str(anomalous_ts_values_count),
                   str(minmax_fp_ts_values_count)))

    minmax_fp_ts_csv = '%s/fpid.%s.%s.minmax_fp_ts.tsfresh.input.std.csv' % (
        settings.SKYLINE_TMP_DIR, str(fp_id), base_name)
    if os.path.isfile(minmax_fp_ts_csv):
        os.remove(minmax_fp_ts_csv)
    minmax_fp_fname_out = minmax_fp_ts_csv + '.transposed.csv'
    if os.path.isfile(minmax_fp_fname_out):
        os.remove(minmax_fp_fname_out)
    anomalous_ts_csv = '%s/%s.%s.minmax_anomalous_ts.tsfresh.std.csv' % (
        settings.SKYLINE_TMP_DIR, metric_timestamp, base_name)
    if os.path.isfile(anomalous_ts_csv):
        os.remove(anomalous_ts_csv)
    anomalous_fp_fname_out = anomalous_ts_csv + '.transposed.csv'
    if os.path.isfile(anomalous_fp_fname_out):
        os.remove(anomalous_fp_fname_out)

    # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
    # tsf_settings = ReasonableFeatureExtractionSettings()
    # tsf_settings.disable_progressbar = True
    minmax_fp_features_sum = None
    minmax_anomalous_features_sum = None
    # Initialise the counts so they are always defined for the checks below
    minmax_fp_features_count = 0
    minmax_anomalous_features_count = 0
    if minmax_anomalous_ts and minmax_fp_ts:
        if not os.path.isfile(minmax_fp_ts_csv):
            datapoints = minmax_fp_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                try:
                    utc_ts_line = '%s,%s,%s\n' % (base_name, str(
                        int(ts)), str(value))
                    with open(minmax_fp_ts_csv, 'a') as fh:
                        fh.write(utc_ts_line)
                except:
                    print('error :: could not write to file %s' %
                          (str(minmax_fp_ts_csv)))
            del converted
        else:
            print('file found %s, using for data' % minmax_fp_ts_csv)

        if not os.path.isfile(minmax_fp_ts_csv):
            print('error :: file not found %s' % minmax_fp_ts_csv)
        else:
            print(
                'file exists to create the minmax_fp_ts data frame from - %s' %
                minmax_fp_ts_csv)

        try:
            df = pd.read_csv(minmax_fp_ts_csv,
                             delimiter=',',
                             header=None,
                             names=['metric', 'timestamp', 'value'])
            df.columns = ['metric', 'timestamp', 'value']
        except:
            print('error :: failed to create data frame from %s' %
                  (str(minmax_fp_ts_csv)))
        try:
            df_features = extract_features(
                # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
                # df, column_id='metric', column_sort='timestamp', column_kind=None,
                # column_value=None, feature_extraction_settings=tsf_settings)
                df,
                default_fc_parameters=EfficientFCParameters(),
                column_id='metric',
                column_sort='timestamp',
                column_kind=None,
                column_value=None,
                disable_progressbar=True)
        except:
            print('error :: failed to create df_features from %s' %
                  (str(minmax_fp_ts_csv)))
        # Create transposed features csv
        if not os.path.isfile(minmax_fp_fname_out):
            # Transpose
            df_t = df_features.transpose()
            df_t.to_csv(minmax_fp_fname_out)

        try:
            # Calculate the count and sum of the features values
            df_sum = pd.read_csv(minmax_fp_fname_out,
                                 delimiter=',',
                                 header=0,
                                 names=['feature_name', 'value'])
            df_sum.columns = ['feature_name', 'value']
            df_sum['feature_name'] = df_sum['feature_name'].astype(str)
            df_sum['value'] = df_sum['value'].astype(float)
            minmax_fp_features_count = len(df_sum['value'])
            minmax_fp_features_sum = df_sum['value'].sum()
            print('minmax_fp_ts - features_count: %s, features_sum: %s' %
                  (str(minmax_fp_features_count), str(minmax_fp_features_sum)))
            del df_sum
        except:
            print('error :: failed to create df_sum from %s' %
                  (str(minmax_fp_fname_out)))

        if minmax_fp_features_count > 0:
            print(
                'debug :: minmax_fp_features_count of the minmax_fp_ts is %s' %
                str(minmax_fp_features_count))
        else:
            print('error :: minmax_fp_features_count is %s' %
                  str(minmax_fp_features_count))

        if not os.path.isfile(anomalous_ts_csv):
            datapoints = minmax_anomalous_ts
            converted = []
            for datapoint in datapoints:
                try:
                    new_datapoint = [float(datapoint[0]), float(datapoint[1])]
                    converted.append(new_datapoint)
                except:  # nosec
                    continue
            for ts, value in converted:
                utc_ts_line = '%s,%s,%s\n' % (base_name, str(
                    int(ts)), str(value))
                with open(anomalous_ts_csv, 'a') as fh:
                    fh.write(utc_ts_line)
            del converted

        df = pd.read_csv(anomalous_ts_csv,
                         delimiter=',',
                         header=None,
                         names=['metric', 'timestamp', 'value'])
        df.columns = ['metric', 'timestamp', 'value']
        df_features_current = extract_features(
            # @modified 20210101 - Task #3928: Update Skyline to use new tsfresh feature extraction method
            # df, column_id='metric', column_sort='timestamp', column_kind=None,
            # column_value=None, feature_extraction_settings=tsf_settings)
            df,
            default_fc_parameters=EfficientFCParameters(),
            column_id='metric',
            column_sort='timestamp',
            column_kind=None,
            column_value=None,
            disable_progressbar=True)
        del df

        # Create transposed features csv
        if not os.path.isfile(anomalous_fp_fname_out):
            # Transpose
            df_t = df_features_current.transpose()
            df_t.to_csv(anomalous_fp_fname_out)
            del df_t
            del df_features_current
        # Calculate the count and sum of the features values
        df_sum_2 = pd.read_csv(anomalous_fp_fname_out,
                               delimiter=',',
                               header=0,
                               names=['feature_name', 'value'])
        df_sum_2.columns = ['feature_name', 'value']
        df_sum_2['feature_name'] = df_sum_2['feature_name'].astype(str)
        df_sum_2['value'] = df_sum_2['value'].astype(float)
        minmax_anomalous_features_count = len(df_sum_2['value'])
        minmax_anomalous_features_sum = df_sum_2['value'].sum()
        print(
            'minmax_anomalous_ts - minmax_anomalous_features_count: %s, minmax_anomalous_features_sum: %s'
            % (str(minmax_anomalous_features_count),
               str(minmax_anomalous_features_sum)))
        del df_sum_2
        del minmax_anomalous_ts

    percent_different = 100
    if minmax_fp_features_sum and minmax_anomalous_features_sum:
        percent_different = None
        # @modified 20210425 - Task #4030: refactoring
        #                      Feature #4014: Ionosphere - inference
        # Use the common function added
        # try:
        #     fp_sum_array = [minmax_fp_features_sum]
        #     calc_sum_array = [minmax_anomalous_features_sum]
        #     percent_different = 100
        #     sums_array = np.array([minmax_fp_features_sum, minmax_anomalous_features_sum], dtype=float)
        #     calc_percent_different = np.diff(sums_array) / sums_array[:-1] * 100.
        #     percent_different = calc_percent_different[0]
        #     print('percent_different between minmax scaled features sums - %s' % str(percent_different))
        # except:
        #     print('error :: failed to calculate percent_different from minmax scaled features sums')
        try:
            percent_different = get_percent_different(
                minmax_fp_features_sum, minmax_anomalous_features_sum, True)
            print(
                'percent_different between minmax_fp_features_sum and minmax_anomalous_features_sum - %s'
                % str(percent_different))
        except Exception as e:
            print('error :: failed to calculate percent_different - %s' % e)

        if percent_different:
            almost_equal = None
            try:
                # np.testing.assert_array_almost_equal(fp_sum_array, calc_sum_array)
                np.testing.assert_array_almost_equal(
                    minmax_fp_features_sum, minmax_anomalous_features_sum)
                almost_equal = True
            except:
                almost_equal = False

            if almost_equal:
                minmax_not_anomalous = True
                print(
                    'minmax scaled common features sums are almost equal, not anomalous'
                )

            # if diff_in_sums <= 1%:
            if percent_different < 0:
                new_pdiff = percent_different * -1
                percent_different = new_pdiff

            # @modified 20190321
            # if percent_different < (settings.IONOSPHERE_FEATURES_PERCENT_SIMILAR + 1):
            if percent_different < IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR:
                minmax_not_anomalous = True
                # log
                print(
                    'not anomalous - minmax scaled features profile match - %s - %s'
                    % (base_name, str(minmax_not_anomalous)))
                print(
                    'minmax scaled calculated features sum are within %s percent of fp_id %s with %s, not anomalous'
                    % (str(
                        IONOSPHERE_ECHO_MINMAX_SCALING_FEATURES_PERCENT_SIMILAR
                    ), str(fp_id), str(percent_different)))
            if minmax_not_anomalous:
                not_anomalous = True
                minmax = 1
                # Created time series resources for graphing in
                # the matched page

    try:
        clean_file = anomalous_ts_csv
        if os.path.isfile(anomalous_ts_csv):
            os.remove(anomalous_ts_csv)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_ts_csv file to clean up')
    try:
        clean_file = anomalous_fp_fname_out
        if os.path.isfile(anomalous_fp_fname_out):
            os.remove(anomalous_fp_fname_out)
        # print('cleaned up - %s' % clean_file)
    except:
        print('no anomalous_fp_fname_out file to clean up')
    return not_anomalous
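
A hypothetical invocation sketch (the file paths and metric name are
illustrative; each file is expected to contain a timeseries serialised as a
string of (timestamp, value) pairs, and the module-level imports and settings
used by the function are assumed to be available):

use_file = '/tmp/server1.cpu.user.current.txt'
i_json_file = '/tmp/server1.cpu.user.fp.json'
not_anomalous = calculate_features_other_minmax(
    use_file, i_json_file, 'server1.cpu.user')
print('not_anomalous: %s' % str(not_anomalous))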
Example #4
def on_demand_motif_analysis(metric, timestamp, similarity, batch_size,
                             top_matches, max_distance, range_padding,
                             max_area_percent_diff):
    """
    Process a motif similarity search on demand
    """
    import numpy as np
    import mass_ts as mts

    logger = logging.getLogger(skyline_app_logger)
    dev_null = None
    function_str = 'on_demand_motif_analysis'
    logger.info(
        '%s :: with parameters :: metric: %s, timestamp: %s, similarity: %s, batch_size: %s, top_matches: %s, max_distance: %s, range_padding: %s, max_area_percent_diff: %s'
        % (function_str, str(metric), str(timestamp), str(similarity),
           str(batch_size), str(top_matches), str(max_distance),
           str(range_padding), str(max_area_percent_diff)))
    trace = 'none'
    fail_msg = 'none'

    start = time.time()
    start_timer = timer()
    metric_vars_dict = {}
    metric_id = 0
    fp_ids = []
    timeseries = []
    not_similar_enough_sample = 0
    not_similar_motifs = 0
    similar_motifs = 0
    exact_motifs = 0
    distance_motifs = 0
    motifs_found = []
    find_exact_matches_run = False
    exact_matches_found = []
    fps_timeseries = {}
    # A motif_analysis dict to add to and return
    motif_analysis = {}
    motif_analysis[metric] = {}
    motif_analysis[metric]['timestamp'] = int(timestamp)
    motif_analysis[metric]['started'] = start
    motif_analysis[metric]['motifs'] = {}
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric][
        'not_similar_enough_sample'] = not_similar_enough_sample
    # @added 20210417 - Feature #4014: Ionosphere - inference
    # Allow the user to define the batch_size per similarity search
    motif_analysis[metric]['batch_size'] = int(batch_size)
    motif_analysis[metric]['top_matches'] = int(top_matches)
    motif_analysis[metric]['max_distance'] = float(max_distance)
    # @added 20210425 - Feature #4014: Ionosphere - inference
    # Added max_area_percent_diff for computing the area under the curve
    motif_analysis[metric]['max_area_percent_diff'] = float(
        max_area_percent_diff)

    fps_checked_for_motifs = []

    metric_dir = metric.replace('.', '/')
    metric_timeseries_dir = '%s/%s/%s' % (settings.IONOSPHERE_DATA_FOLDER,
                                          str(timestamp), metric_dir)

    # @added 20210418 - Feature #4014: Ionosphere - inference
    # Allow for the similarity search on saved_training_data
    if 'saved_training_data' in request.args:
        saved_training_data_str = request.args.get('saved_training_data',
                                                   'false')
        if saved_training_data_str == 'true':
            saved_metric_timeseries_dir = '%s_saved/%s/%s' % (
                settings.IONOSPHERE_DATA_FOLDER, str(timestamp), metric_dir)
            if path.exists(saved_metric_timeseries_dir):
                metric_timeseries_dir = saved_metric_timeseries_dir
                logger.info('%s :: using saved training_data dir - %s' %
                            (function_str, saved_metric_timeseries_dir))

    metric_vars_file = '%s/%s.txt' % (metric_timeseries_dir, metric)
    timeseries_json = '%s/%s.json' % (metric_timeseries_dir, metric)
    full_duration_in_hours = int(settings.FULL_DURATION / 60 / 60)
    full_duration_timeseries_json = '%s/%s.mirage.redis.%sh.json' % (
        metric_timeseries_dir, metric, str(full_duration_in_hours))
    try:
        metric_vars_dict = mirage_load_metric_vars(skyline_app,
                                                   metric_vars_file, True)
    except Exception as e:
        logger.error(
            'error :: inference :: failed to load metric variables from check file - %s - %s'
            % (metric_vars_file, e))
    if not metric_vars_dict:
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric][
            'reason'] = 'could not load training data variables'
        return motif_analysis

    full_duration = metric_vars_dict['metric_vars']['full_duration']

    # Determine the metric details from the database
    metric_id = 0
    metric_db_object = {}
    try:
        metric_db_object = get_metrics_db_object(metric)
    except Exception as e:
        logger.error('error :: %s :: failed to get_metrics_db_object - %s' %
                     (function_str, e))
    try:
        metric_id = int(metric_db_object['id'])
    except Exception as e:
        logger.error(
            'error :: %s :: failed to determine metric_id from metric_db_object %s - %s'
            % (function_str, str(metric_db_object), e))
        metric_id = 0
    if not metric_id:
        logger.error(
            'error :: %s :: failed to get metric id for %s from the database' %
            (function_str, str(metric)))
        fail_msg = 'failed to get metric id'
        motif_analysis[metric]['status'] = 'error'
        motif_analysis[metric]['reason'] = 'could not determine metric id'
        return motif_analysis, fail_msg, trace

    # @modified 20210419 - Feature #4014: Ionosphere - inference
    # Create a unique dir for each batch_size max_distance
    # motif_images_dir = '%s/motifs' % metric_timeseries_dir
    motif_images_dir = '%s/motifs/batch_size.%s/top_matches.%s/max_distance.%s' % (
        metric_timeseries_dir, str(batch_size), str(top_matches),
        str(max_distance))

    if not path.exists(motif_images_dir):
        # provision motifs image resources
        mkdir_p(motif_images_dir)

    full_durations = [full_duration]
    if path.isfile(full_duration_timeseries_json):
        full_durations = [full_duration, settings.FULL_DURATION]
    logger.info('%s :: full_durations - %s' %
                (function_str, str(full_durations)))

    # Loop through analysis per full_duration
    for full_duration in full_durations:
        start_full_duration = timer()
        fp_ids = []
        try:
            query = 'SELECT id,last_matched from ionosphere WHERE metric_id=%s AND full_duration=%s AND enabled=1 ORDER BY last_matched DESC' % (
                str(metric_id), str(full_duration))
            results = mysql_select(skyline_app, query)
            for row in results:
                fp_ids.append(int(row[0]))
        except Exception as e:
            logger.error(
                'error :: %s :: failed to get fp ids via mysql_select from %s - %s'
                % (function_str, metric, e))

        logger.info('%s :: metric_id: %s, full_duration: %s, fp_ids: %s' %
                    (function_str,
                     (metric_id), str(full_duration), str(fp_ids)))

        if not fp_ids:
            continue

        # Now there are known fps, load the timeseries
        if full_duration == settings.FULL_DURATION:
            timeseries_json_file = full_duration_timeseries_json
        else:
            timeseries_json_file = timeseries_json
        try:
            with open((timeseries_json_file), 'r') as f:
                raw_timeseries = f.read()
            timeseries_array_str = str(raw_timeseries).replace('(',
                                                               '[').replace(
                                                                   ')', ']')
            del raw_timeseries
            timeseries = literal_eval(timeseries_array_str)
            del timeseries_array_str
        except Exception as e:
            logger.error(
                'error :: %s :: failed to load timeseries for %s from %s - %s'
                % (function_str, metric, timeseries_json_file, e))
            continue

        anomalous_timeseries_subsequence = []
        for timestamp_float, value in timeseries[-int(batch_size):]:
            anomalous_timeseries_subsequence.append(
                [int(timestamp_float), value])

        logger.info(
            '%s :: looking for motif in trained fps of full_duration: %s' %
            (function_str, (full_duration)))
        dataset = [float(item[1]) for item in anomalous_timeseries_subsequence]

        max_y = max(dataset)
        min_y = min(dataset)

        # full_y_range = max_y - min_y

        # range_padding_percent = range_padding
        # This was just a test that did not have the desired results
        # if full_y_range < 10:
        #     range_padding_percent = 35
        # if full_y_range < 5:
        #     range_padding_percent = 75
        # if full_y_range < 2:
        #    range_padding_percent = 100

        use_range_padding = ((max_y - min_y) / 100) * range_padding
        if min_y > 0 and (min_y - use_range_padding) > 0:
            min_y_padded = min_y - use_range_padding
        else:
            min_y_padded = min_y
        max_y_padded = max_y + use_range_padding
        if min_y_padded == max_y_padded:
            min_y_padded = min_y_padded - (
                (min_y_padded / 100) * range_padding)
            max_y_padded = max_y_padded + (
                (max_y_padded / 100) * range_padding)

        # anomalous_ts = np.array(dataset)
        anomalous_ts = dataset

        mass2_batch_times = []
        exact_match_times = []

        # A best_dists result that is all nan+nanj is treated as no similar
        # motif found (see the note on mass_ts further below), so create an
        # empty_dists array to compare best_dists against
        nan = np.array([np.nan])
        nanj = complex(0.0, float('nan'))
        empty_dists = np.array(nan + nanj)

        # plotted = False
        count = 0

        # fp_ids = [fp_id for index, fp_id in enumerate(fp_ids) if index == 0]

        # motifs_found = []
        # exact_matches_found = []
        # fps_timeseries = {}

        for fp_id in fp_ids:
            if (time.time() - start) >= 20:
                break
            # Attempt to surface the fp timeseries from memcache and/or db
            # @modified 20210424 - Feature #4014: Ionosphere - inference
            #                      Task #4030: refactoring
            fp_timeseries = None
            try:
                fp_timeseries = get_fp_timeseries(skyline_app, metric_id,
                                                  fp_id)
            except Exception as e:
                logger.error(
                    'inference :: did not get fp timeseries with get_fp_timeseries(%s, %s, %s) - %s'
                    % (skyline_app, str(metric_id), str(fp_id), e))
            if not fp_timeseries:
                continue

            relate_dataset = [float(item[1]) for item in fp_timeseries]

            fps_timeseries[fp_id] = fp_timeseries

            current_best_indices = []
            current_best_dists = []
            best_indices = None
            best_dists = None

            try:
                logger.info(
                    '%s :: running mts.mass2_batch fp_id: %s, full_duration: %s, batch_size: %s, top_matches: %s, max_distance: %s, motif_size: %s'
                    % (function_str, str(fp_id), str(full_duration),
                       str(batch_size), str(top_matches), str(max_distance),
                       str(len(anomalous_ts))))

                # @added 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than possible kth that can be found
                # mts.mass2_batch error: kth(=50) out of bounds (16)
                use_top_matches = int(top_matches)
                if (len(fp_timeseries) / int(batch_size)) <= int(top_matches):
                    use_top_matches = round(
                        len(fp_timeseries) / int(batch_size)) - 1
                    if use_top_matches == 2:
                        use_top_matches = 1
                    logger.info(
                        '%s :: adjusting top_matches to %s (the maximum possible top - 1) as kth(=%s) will be out of bounds mts.mass2_batch'
                        %
                        (function_str, str(use_top_matches), str(top_matches)))

                start_mass2_batch = timer()
                # @modified 20210418 - Feature #4014: Ionosphere - inference
                # Handle top_matches being greater than possible kth that can be found
                # best_indices, best_dists = mts.mass2_batch(relate_dataset, anomalous_ts, batch_size=int(batch_size), top_matches=int(top_matches))
                best_indices, best_dists = mts.mass2_batch(
                    relate_dataset,
                    anomalous_ts,
                    batch_size=int(batch_size),
                    top_matches=int(use_top_matches))
                end_mass2_batch = timer()
                mass2_batch_times.append((end_mass2_batch - start_mass2_batch))
                current_best_indices = best_indices.tolist()
                current_best_dists = best_dists.tolist()

                # @added 20210412 - Feature #4014: Ionosphere - inference
                #                   Branch #3590: inference
                # Add fp_id to fps_checked_for_motifs to enable ionosphere to update the
                # motif related columns in the ionosphere database table
                fps_checked_for_motifs.append(fp_id)
            except Exception as e:
                logger.error('error :: %s :: fp id %s mts.mass2_batch error: %s' %
                             (function_str, str(fp_id), str(e)))
                continue

            try:
                if str(list(best_dists)) == str(list(empty_dists)):
                    logger.info(
                        '%s :: mts.mass2_batch no similar motif from fp id %s - best_dists: %s'
                        % (function_str, str(fp_id), str(list(best_dists))))
                    continue
            except Exception as e:
                dev_null = e

            if not current_best_indices[0]:
                continue
            # if list(best_indices)[0] != anomalous_index:
            #     continue
            # If the best_dists is > 1 they are not very similar
            # if list(best_dists)[0].real > 1.0:
            #     continue
            # if list(best_indices)[0] and best_dists:
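            # Evaluate each candidate distance.  The cascade below discards
            # any candidate whose real distance exceeds max_distance, records
            # the rest as motifs, and then catches the nan+nanj cases which
            # indicate an identical subsequence (the string comparison branch
            # records these with a distance of 0.0).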
            for index, best_dist in enumerate(current_best_dists):
                try:
                    motif_added = False
                    """
                    Note: mass_ts finds similar motifs NOT the same motif, the same motif
                    will result in the best_dists being a nan+nanj
                    So it is DIYed
                    """
                    try:
                        # @modified 20210414 - Feature #4014: Ionosphere - inference
                        #                      Branch #3590: inference
                        # Store the not anomalous motifs
                        # motif = [fp_id, current_best_indices[index], best_dist.real]
                        motif = [
                            fp_id, current_best_indices[index], best_dist.real,
                            anomalous_timeseries_subsequence, full_duration
                        ]
                    except Exception as e:
                        dev_null = e
                        motif = []

                    # if list(best_indices)[0] and best_dists:
                    # If it is greater than 1.0 it is not similar
                    # if best_dist.real > 1.0:
                    # if best_dist.real > IONOSPHERE_INFERENCE_MASS_TS_MAX_DISTANCE:
                    if best_dist.real > float(max_distance):
                        continue
                    else:
                        if motif:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                    if not motif_added:
                        if best_dist == nanj:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                    if not motif_added:
                        if str(best_dist) == 'nan+nanj':
                            count += 1
                            motifs_found.append([
                                fp_id, current_best_indices[index], 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motif_added = True
                    if not motif_added:
                        if best_dist == empty_dists:
                            count += 1
                            motifs_found.append(motif)
                            motif_added = True
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine if the fp id %s timeseries at index %s was a match - %s'
                        % (function_str, str(fp_id),
                           str(current_best_indices[index]), e))
                    continue

            # FIND EXACT MATCHES
            # Seeing as I cannot reproduce finding nan+nanj which represents an
            # exact match with mts.mass2_batch, do it DIY style - iterate the
            # timeseries and create a batch_size subsequence for every index and
            # compare the values to the anomalous_ts for an exact match.
            # This takes ~0.024850 seconds on a timeseries with 10079 datapoints
            # @modified 20210418 - Feature #4014: Ionosphere - inference
            # However finding exact matches can add ~2.5 seconds on a 90 minute
            # batch_size and with a proportionally scaled max_distance of say 15
            # finding an exact match in a longer sequence is less important.
            # The greater the batch_size, the greater the likely variability and
            # the lower the chance of an exact match, so skipping it saves 2.5
            # seconds.
            # UNLESS
            # At a 5 (to 10) batch_size and a max_distance of 1.0 an exact match
            # can be found.  Exact matches are quite frequent and sometimes, with
            # such little variability, similar matches may not be found.
            # Therefore find_exact_matches has its place.
            # A CAVEAT here is that boring metrics that change and have a low
            # variability even at a larger batch_size could also benefit and
            # possibly achieve better accuracy from the use of find_exact_matches
            # as they can be shapelets that resemble a batch_size 5 shapelet.
            # It would perhaps be possible to use one or more of the features
            # profile tsfresh values to identify these types of shapelets, if
            # you knew which feature/s were most descriptive of this type of
            # shapelet, e.g. 'value__skewness': 3.079477685394873, etc (maybe).
            # However I predict that this method will perform worse on these
            # types of shapelets.
            # find_exact_matches = False
            # Exact matches can be found in batch sizes of 500 where similar
            # matches are not, so actually always run it.
            find_exact_matches = True
            find_exact_matches_run = True

            if int(batch_size) < 10:
                find_exact_matches = True
                find_exact_matches_run = True

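            # A brute force sliding window - for every index in the fp
            # timeseries take the batch_size values that follow it and test
            # them for list equality against the anomalous subsequence,
            # effectively relate_dataset[i:(i + batch_size)] == anomalous_ts,
            # which is O(len(relate_dataset) * batch_size) value comparisons.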
            if find_exact_matches:
                try:
                    start_exact_match = timer()
                    indexed_relate_dataset = []
                    for index, item in enumerate(relate_dataset):
                        indexed_relate_dataset.append([index, item])
                    last_index = indexed_relate_dataset[-1][0]
                    current_index = 0
                    while current_index < last_index:
                        subsequence = [
                            value for index, value in
                            indexed_relate_dataset[current_index:(
                                current_index + int(batch_size))]
                        ]
                        if subsequence == anomalous_ts:
                            exact_matches_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                            motifs_found.append([
                                fp_id, current_index, 0.0,
                                anomalous_timeseries_subsequence, full_duration
                            ])
                        current_index += 1
                    end_exact_match = timer()
                    exact_match_times.append(
                        (end_exact_match - start_exact_match))
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: %s :: could not determine if any exact matches could be found in fp id %s timeseries - %s'
                        % (function_str, str(fp_id), e))

        logger.info(
            '%s :: mts.mass2_batch runs on %s fps of full_duration %s in %.6f seconds'
            % (function_str, str(len(mass2_batch_times)), str(full_duration),
               sum(mass2_batch_times)))
        if find_exact_matches_run:
            logger.info(
                '%s :: exact_match runs on %s fps of full_duration %s in %.6f seconds'
                % (function_str, str(len(exact_match_times)),
                   str(full_duration), sum(exact_match_times)))
        end_full_duration = timer()
        logger.info(
            '%s :: analysed %s fps of full_duration %s in %.6f seconds' %
            (function_str, str(len(fp_ids)), str(full_duration),
             (end_full_duration - start_full_duration)))

        # Sort the motifs found by distance, best (smallest) first
        sorted_motifs = []
        motifs_found_in_fps = []
        if motifs_found:
            sorted_motifs = sorted(motifs_found, key=lambda x: x[2])
            for item in sorted_motifs:
                motifs_found_in_fps.append(item[0])
        logger.info('%s :: %s motifs found' %
                    (function_str, str(len(sorted_motifs))))

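        # Evaluate the sorted motifs, best (smallest) distance first, with a
        # hard time budget - stop processing after 25 seconds.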
        for motif in sorted_motifs:
            if (time.time() - start) >= 25:
                break
            try:
                add_match = False
                all_in_range = False

                fp_id = motif[0]
                best_index = motif[1]
                best_dist = motif[2]

                # @added 20210414 - Feature #4014: Ionosphere - inference
                #                   Branch #3590: inference
                # Store the not anomalous motifs
                motif_sequence = motif[3]

                motif_full_duration = motif[4]

                match_type = 'not_similar_enough'

                if motif in exact_matches_found:
                    add_match = True
                    match_type = 'exact'
                    all_in_range = True
                    exact_motifs += 1
                full_relate_timeseries = fps_timeseries[fp_id]
                # full_relate_dataset = [float(item[1]) for item in full_relate_timeseries]
                relate_timeseries = [
                    item for index, item in enumerate(full_relate_timeseries)
                    if index >= best_index and index < (best_index +
                                                        int(batch_size))
                ]
                relate_dataset = [item[1] for item in relate_timeseries]

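                # The all_in_range test has two stages: first every value of
                # the fp motif must sit within the padded y range of the
                # anomalous subsequence, then the fp motif's own max and min
                # must fall within range_padding of max_y and min_y.  Note
                # the second stage applies range_padding as an absolute
                # offset rather than the percent derived use_range_padding.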
                if not add_match:
                    all_in_range = True
                    for value in relate_dataset:
                        if value < min_y_padded:
                            all_in_range = False
                            break
                        if value > max_y_padded:
                            all_in_range = False
                            break
                    if all_in_range:
                        related_max_y = max(relate_dataset)
                        if related_max_y < (max_y - range_padding):
                            all_in_range = False
                        if related_max_y > (max_y + range_padding):
                            all_in_range = False
                        related_min_y = min(relate_dataset)
                        if related_min_y < (min_y - range_padding):
                            all_in_range = False
                        if related_min_y > (min_y + range_padding):
                            all_in_range = False
                    if all_in_range:
                        logger.info(
                            '%s :: ALL IN RANGE - all_in_range: %s, fp motif sample: %s'
                            % (function_str, str(all_in_range),
                               str(relate_dataset[0:2])))
                        add_match = True
                        match_type = 'all_in_range'
                        similar_motifs += 1

                    # @added 20210425 - Feature #4014: Ionosphere - inference
                    # Compute the area using the composite trapezoidal rule.
                    motif_area = None
                    fp_motif_area = None
                    percent_different = None
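                    # np.trapz integrates the y values at unit sample spacing
                    # (dx=1), e.g. np.trapz([1.0, 2.0, 3.0], dx=1) == 4.0.
                    # Both sequences are batch_size points at the same
                    # resolution, so their areas are directly comparable.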
                    try:
                        batch_size_dataset = [
                            float(item[1]) for item in motif_sequence
                        ]
                        y_motif = np.array(batch_size_dataset)
                        motif_area = np.trapz(y_motif, dx=1)
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get motif_area with np.trapz - %s'
                            % (function_str, e))
                    try:
                        y_fp_motif = np.array(relate_dataset)
                        fp_motif_area = np.trapz(y_fp_motif, dx=1)
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get fp_motif_area with np.trapz - %s'
                            % (function_str, e))
                    # Determine the percentage difference (as a
                    # positive value) of the areas under the
                    # curves.
                    if motif_area and fp_motif_area:
                        percent_different = get_percent_different(
                            fp_motif_area, motif_area, True)
                        if percent_different > max_area_percent_diff:
                            if add_match:
                                logger.info(
                                    '%s :: AREA TOO DIFFERENT - not adding all_in_range match'
                                    % (function_str))
                                add_match = False
                            # BUT ...
                            if best_dist < 3 and not add_match:
                                logger.info(
                                    '%s :: DISTANCE VERY SIMILAR - adding match even though area_percent_diff is greater than max_area_percent_diff because best_dist: %s'
                                    % (function_str, str(best_dist)))
                                add_match = True
                                match_type = 'distance'
                                distance_motifs += 1

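                # When similarity is set to 'all', also surface a sample of
                # up to 10 motifs that were not similar enough, recorded with
                # the not_similar_enough match type.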
                if similarity == 'all':
                    if not add_match:
                        not_similar_motifs += 1
                        if not_similar_enough_sample >= 10:
                            continue
                        not_similar_enough_sample += 1
                        add_match = True
                        match_type = 'not_similar_enough'

                if add_match:
                    generation = 0
                    fp_id_row = None
                    try:
                        fp_id_row = get_ionosphere_fp_db_row(
                            skyline_app, int(fp_id))
                    except Exception as e:
                        logger.error(
                            'error :: %s :: failed to get_ionosphere_fp_db_row for fp_id %s - %s'
                            % (function_str, str(fp_id), e))
                    if fp_id_row:
                        try:
                            generation = fp_id_row['generation']
                        except Exception as e:
                            logger.error(
                                'error :: %s :: failed to get generation from fp_id_row for fp_id %s - %s'
                                % (function_str, str(fp_id), e))
                    if generation == 0:
                        generation_str = 'trained'
                    else:
                        generation_str = 'LEARNT'
                    motif_match_types = motif_match_types_dict()
                    type_id = motif_match_types[match_type]

                    motif_id = '%s-%s' % (str(fp_id), str(best_index))
                    motif_analysis[metric]['motifs'][motif_id] = {}
                    motif_analysis[metric]['motifs'][motif_id][
                        'metric_id'] = metric_id
                    motif_analysis[metric]['motifs'][motif_id]['fp_id'] = fp_id
                    motif_analysis[metric]['motifs'][motif_id][
                        'generation'] = generation
                    motif_analysis[metric]['motifs'][motif_id][
                        'index'] = best_index
                    motif_analysis[metric]['motifs'][motif_id][
                        'distance'] = best_dist
                    motif_analysis[metric]['motifs'][motif_id]['size'] = int(
                        batch_size)
                    motif_analysis[metric]['motifs'][motif_id][
                        'max_distance'] = float(max_distance)
                    motif_analysis[metric]['motifs'][motif_id][
                        'timestamp'] = timestamp
                    motif_analysis[metric]['motifs'][motif_id][
                        'type_id'] = type_id
                    motif_analysis[metric]['motifs'][motif_id][
                        'type'] = match_type
                    motif_analysis[metric]['motifs'][motif_id][
                        'full_duration'] = motif_full_duration
                    # @added 20210414 - Feature #4014: Ionosphere - inference
                    #                   Branch #3590: inference
                    # Store the not anomalous motifs
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_timeseries'] = anomalous_timeseries_subsequence
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_sequence'] = motif_sequence
                    not_anomalous_timestamp = int(
                        anomalous_timeseries_subsequence[-1][0])
                    graph_period_seconds = not_anomalous_timestamp - int(
                        anomalous_timeseries_subsequence[0][0])
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_period_seconds'] = graph_period_seconds
                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_period_minutes'] = round(graph_period_seconds /
                                                        60)

                    motif_analysis[metric]['motifs'][motif_id]['image'] = None

                    motif_analysis[metric]['motifs'][motif_id][
                        'motif_area'] = motif_area
                    motif_analysis[metric]['motifs'][motif_id][
                        'fp_motif_area'] = fp_motif_area
                    motif_analysis[metric]['motifs'][motif_id][
                        'area_percent_diff'] = percent_different
                    motif_analysis[metric]['motifs'][motif_id][
                        'max_area_percent_diff'] = max_area_percent_diff

                    if (time.time() - start) >= 25:
                        continue

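                    # Reuse an existing motif match plot if one is already on
                    # disk, otherwise render it with plot_motif_match.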
                    graph_image_file = '%s/motif.%s.%s.%s.with_max_distance.%s.png' % (
                        motif_images_dir, motif_id, match_type,
                        str(batch_size), str(max_distance))
                    plotted_image = False
                    on_demand_motif_analysis = True
                    if not path.isfile(graph_image_file):
                        plotted_image, plotted_image_file = plot_motif_match(
                            skyline_app, metric, timestamp, fp_id,
                            full_duration, generation_str, motif_id,
                            best_index, int(batch_size), best_dist, type_id,
                            relate_dataset, anomalous_timeseries_subsequence,
                            graph_image_file, on_demand_motif_analysis)
                    else:
                        plotted_image = True
                        logger.info('%s :: plot already exists - %s' %
                                    (function_str, str(graph_image_file)))
                    if plotted_image:
                        motif_analysis[metric]['motifs'][motif_id][
                            'image'] = graph_image_file
                    else:
                        logger.error('failed to plot motif match plot')
                        graph_image_file = None
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: inference :: with fp id %s processing motif at index: %s - %s'
                    % (str(fp_id), str(motif[1]), str(e)))
                continue
    end_timer = timer()
    motif_analysis[metric]['fps_checked'] = fps_checked_for_motifs
    motif_analysis[metric]['exact_motifs'] = exact_motifs
    motif_analysis[metric]['similar_motifs'] = similar_motifs
    motif_analysis[metric]['distance_motifs'] = distance_motifs
    motif_analysis[metric]['not_similar_motifs'] = not_similar_motifs
    motif_analysis[metric][
        'not_similar_enough_sample'] = not_similar_enough_sample

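    # Persist the full motif_analysis dict to disk, named after the
    # parameters used (similarity, batch_size, top_matches and max_distance).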
    motif_analysis_file = '%s/motif.analysis.similarity_%s.batch_size_%s.top_matches_%s.max_distance_%s.dict' % (
        motif_images_dir, similarity, str(batch_size), str(top_matches),
        str(max_distance))
    try:
        write_data_to_file(skyline_app, motif_analysis_file, 'w',
                           str(motif_analysis))
    except Exception as e:
        trace = traceback.format_exc()
        logger.error('%s' % trace)
        fail_msg = '%s :: error :: failed to write motif_analysis_file - %s' % (
            function_str, motif_analysis_file)
        logger.error('%s' % fail_msg)
        dev_null = e

    motif_ids = list(motif_analysis[metric]['motifs'].keys())
    logger.info(
        '%s :: %s motif matches found, %s fps were checked and motifs plotted in %.6f seconds for %s'
        % (function_str, str(len(motif_ids)), str(len(fps_checked_for_motifs)),
           (end_timer - start_timer), metric))
    if dev_null:
        del dev_null
    return motif_analysis, fail_msg, trace