Example No. 1
def arundo_adtk(data_path):
    data = get_data(data_path)
    data['date'] = data['timestamp'].apply(
        lambda i: datetime.fromtimestamp(i))  # convert the epoch timestamp to a datetime
    s_train = data[['date', 'value']]

    # set the date column as the index
    s_train = s_train.set_index('date')
    s_train = validate_series(s_train)
    print(s_train)
    # plot(s_train)

    # STL decomposition + outlier detection
    steps = [("deseasonal", STLDecomposition(freq=20)),
             ("quantile_ad", QuantileAD(high=0.9997, low=0.005))]
    pipeline = Pipeline(steps)
    anomalies = pipeline.fit_detect(s_train)
    print(anomalies)
    # plot(s_train, anomaly_pred=anomalies, ap_color='red', ap_marker_on_curve=True)

    # plot the detection results against the known anomalies
    known_anomalies = data.loc[data['label'] == 1]
    known_anomalies = known_anomalies[['date', 'label']]
    known_anomalies = known_anomalies.set_index('date')
    known_anomalies = to_events(known_anomalies)
    print(known_anomalies)
    plot(s_train,
         anomaly_true=known_anomalies,
         anomaly_pred=anomalies,
         ap_color='red',
         ap_marker_on_curve=True,
         at_color="orange")

    plt.savefig(img_path + "arundo_adtk.png", dpi=1000)
    plt.show()
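The pipeline above simply chains a seasonality-removing transformer with a quantile-based detector; as a rough sketch of what its fit_detect call does step by step (illustrative variable names, assuming an adtk version that still ships STLDecomposition):

# remove the seasonal component, then flag extreme residuals
s_residual = STLDecomposition(freq=20).fit_transform(s_train)
anomalies = QuantileAD(high=0.9997, low=0.005).fit_detect(s_residual)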
Example No. 2
def get_missing_stats_data(filtered_data):
    ## Plot the outliers for each node and save the gap / filled-sequence stats to a CSV file
    s1 = None
    outlier_list = []
    try:
        list_dict = []
        list_cols = list(filtered_data.columns)
        list_cols.remove('date')
        for col in list_cols:
            d1 = {}
            sample = filtered_data[col]
            sample = sample.dropna()
            if not sample.empty and sample.shape[0] > 10:
                sample.index = pd.to_datetime(sample.index)
                s1 = sample.resample('H').mean()
                d1['site'] = col.split("n=")[0]
                node_id = col.split("n=")[1].split("_")[0]
                d1['depth'] = col.split("n=")[1].split("_d=")[1]

                s = validate_series(s1)
                iqr_ad = InterQuartileRangeAD(c=1.5)
                anomalies = iqr_ad.fit_detect(s)
                # plot() creates its own figure; opening a new figure after
                # plotting would leave the saved image empty, so the size is
                # passed to plot() instead
                plot(s,
                     anomaly=anomalies,
                     ts_linewidth=1,
                     ts_markersize=3,
                     anomaly_markersize=5,
                     anomaly_color='red',
                     anomaly_tag="marker",
                     figsize=(20, 10))
                fig = plt.gcf()
                plt.xlabel('date')
                plt.ylabel('soil moisture')
                plt.title("SITE = " + d1['site'] + ",  NODE_ID = " + node_id +
                          ",  DEPTH = " + d1['depth'])
                fig.savefig(hourly_visualisations + col + ".png")
                plt.close('all')
                d1['node_id'] = node_id
                d1['total_rows'] = s1.shape[0]
                d1['start'] = s1.index[0].date()
                d1['end'] = s1.index[-1].date()
                max_long_seq, longest_gap, num_gaps = find_longest_seq_count(
                    s1)
                d1['nulls'] = s1.isnull().sum()
                d1['filled_percentage'] = 1 - d1['nulls'] / float(
                    d1['total_rows'])
                d1['longest_gap'] = longest_gap
                d1['longest_seq'] = max_long_seq
                d1['num_gaps'] = num_gaps
                list_dict.append(d1)
        with open(hourly_node_wise_data_path, 'w', newline="") as f:
            writer = csv.DictWriter(f, [
                'site', 'node_id', 'depth', 'start', 'end', 'nulls',
                'total_rows', 'filled_percentage', 'longest_gap',
                'longest_seq', 'num_gaps'
            ])
            writer.writeheader()
            writer.writerows(list_dict)
    except Exception as e:
        print(e)
Example No. 3
def anomaly_plot(data, anomaly_true, anomaly_pred):
    """
    Plot time series and/or anomalies.
    """
    plot(data,
         anomaly={
             "anomaly_true": anomaly_true,
             "anomaly_pred": anomaly_pred
         },
         ts_linewidth=1,
         ts_markersize=3,
         anomaly_color={
             "anomaly_true": 'blue',
             "anomaly_pred": 'red'
         },
         anomaly_alpha=0.3,
         curve_group='all')
    plt.show()
Example No. 4
def checkOutlier(data):
    dataCopy = copy.deepcopy(data)
    # convert the index to a time-series (datetime) index
    dataCopy['时间'] = pd.to_datetime(dataCopy['时间'], format="%Y%m%d%H%M%S")
    dataCopy.set_index("时间", inplace=True)

    dataCopy = validate_series(dataCopy)
    iqr_ad = InterQuartileRangeAD(c=1.5)
    anomalies = iqr_ad.fit_detect(dataCopy)

    # visualize the anomalies and save the plots locally
    for i, index in enumerate(indexes):
        axes = plot(dataCopy[index],
                    anomaly=anomalies[index],
                    ts_linewidth=1,
                    ts_markersize=3,
                    anomaly_markersize=5,
                    anomaly_color='red',
                    anomaly_tag="marker")
        axes[0].set_title(index_names[i], fontsize='xx-large')
        axes[0].legend(['Normal', 'Anomaly'], loc='best', fontsize='x-large')
        axes[0].set_xlabel('time', fontsize='x-large')
        axes[0].set_ylabel('value', fontsize='x-large')
        axes[0].figure.savefig(index + ".png")
    logging.log(logging.DEBUG, 'The anomaly plots have been drawn.')

    # replace outlier values with NaN and compute the outlier ratio
    rows, columns = anomalies.shape
    count = 0
    for row in range(rows):
        for col in range(columns):
            if anomalies.iloc[row, col]:
                count += 1
                data.iloc[row, col + 1] = math.nan
    if count != 0:
        logging.log(logging.DEBUG, 'Outliers were found and replaced with NaN.')
    outlierRate = float(count) / (rows * columns)
    return data, outlierRate
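The nested loop above can also be written with pandas' vectorised masking; a minimal sketch, assuming the column order of data matches anomalies after the leading '时间' column:

# vectorised alternative to the nested loop: mask flagged cells, then compute the ratio
flags = anomalies.fillna(False).astype(bool).values
data.iloc[:, 1:] = data.iloc[:, 1:].mask(flags)
outlierRate = flags.sum() / flags.size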
Example No. 5
def m66(current_skyline_app, parent_pid, timeseries, algorithm_parameters):
    """
    A time series data point is anomalous if the nth-median (default 6th)
    rolling standard deviation is more than n standard deviations (default
    six-sigma) from the overall nth-median standard deviation and the trigger
    persists for x_windows, where `x_windows = int(window / 2)`.
    This algorithm finds SIGNIFICANT changepoints in a time series, similar to
    PELT and Bayesian Online Changepoint Detection; however, it is more robust
    to instantaneous outliers and more selective about which changepoints it
    reports.

    :param current_skyline_app: the Skyline app executing the algorithm.  This
        will be passed to the algorithm by Skyline.  This is **required** for
        error handling and logging.  You do not have to worry about handling the
        argument in the scope of the custom algorithm itself,  but the algorithm
        must accept it as the first argument.
    :param parent_pid: the parent pid which is executing the algorithm, this is
        **required** for error handling and logging.  You do not have to worry
        about handling this argument in the scope of algorithm, but the
        algorithm must accept it as the second argument.
    :param timeseries: the time series as a list e.g. ``[[1578916800.0, 29.0],
        [1578920400.0, 55.0], ... [1580353200.0, 55.0]]``
    :param algorithm_parameters: a dictionary of any required parameters for the
        custom_algorithm and algorithm itself for example:
        ``algorithm_parameters={
            'nth_median': 6,
            'sigma': 6,
            'window': 5,
            'return_anomalies': True,
        }``
    :type current_skyline_app: str
    :type parent_pid: int
    :type timeseries: list
    :type algorithm_parameters: dict
    :return: anomalous, anomalyScore (plus anomalies and, for the webapp,
        anomalies_dict when requested)
    :rtype: tuple

    Example CUSTOM_ALGORITHMS configuration:

    'm66': {
        'namespaces': [
            'skyline.analyzer.run_time', 'skyline.analyzer.total_metrics',
            'skyline.analyzer.exceptions'
        ],
        'algorithm_source': '/opt/skyline/github/skyline/skyline/custom_algorithms/m66.py',
        'algorithm_parameters': {
            'nth_median': 6, 'sigma': 6, 'window': 5, 'resolution': 60,
            'minimum_sparsity': 0, 'determine_duration': False,
            'return_anomalies': True, 'save_plots_to': False,
            'save_plots_to_absolute_dir': False, 'filename_prefix': False
        },
        'max_execution_time': 1.0,
        'consensus': 1,
        'algorithms_allowed_in_consensus': ['m66'],
        'run_3sigma_algorithms': False,
        'run_before_3sigma': False,
        'run_only_if_consensus': False,
        'use_with': ['crucible', 'luminosity'],
        'debug_logging': False,
    },

    """

    # You MUST define the algorithm_name
    algorithm_name = 'm66'

    # Define the default state as None for both values; anomalous does not
    # default to False because False is only correct if the algorithm
    # determines the data point is not anomalous.  The same is true for the
    # anomalyScore.
    anomalous = None
    anomalyScore = None

    return_anomalies = False
    anomalies = []
    anomalies_dict = {}
    anomalies_dict['algorithm'] = algorithm_name

    realtime_analysis = False

    current_logger = None
    dev_null = None

    # If you want to log, you can, but this should only be done during
    # testing and development
    def get_log(current_skyline_app):
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        return current_logger

    start = timer()

    # Use the algorithm_parameters to determine whether debug_logging is enabled
    debug_logging = None
    try:
        debug_logging = algorithm_parameters['debug_logging']
    except:
        debug_logging = False
    if debug_logging:
        try:
            current_logger = get_log(current_skyline_app)
            current_logger.debug(
                'debug :: %s :: debug_logging enabled with algorithm_parameters - %s'
                % (algorithm_name, str(algorithm_parameters)))
        except Exception as e:
            # This except pattern MUST be used in ALL custom algorithms to
            # facilitate the traceback from any errors.  We want the algorithm
            # to run super fast and without spamming the log with lots of
            # errors, but we do not want the function returning without
            # reporting anything to the log, so the pythonic except is used to
            # "sample" any algorithm errors to a tmp file and report once per
            # run rather than spewing tons of errors into the log e.g.
            # analyzer.log
            dev_null = e
            record_algorithm_error(current_skyline_app, parent_pid,
                                   algorithm_name, traceback.format_exc())
            # Return None and None as the algorithm could not determine True or False
            del dev_null
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)

    # Allow the m66 parameters to be passed in the algorithm_parameters
    window = 6
    try:
        window = algorithm_parameters['window']
    except KeyError:
        window = 6
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    nth_median = 6
    try:
        nth_median = algorithm_parameters['nth_median']
    except KeyError:
        nth_median = 6
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    n_sigma = 6
    try:
        n_sigma = algorithm_parameters['sigma']
    except KeyError:
        n_sigma = 6
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    resolution = 0
    try:
        resolution = algorithm_parameters['resolution']
    except KeyError:
        resolution = 0
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    determine_duration = False
    try:
        determine_duration = algorithm_parameters['determine_duration']
    except KeyError:
        determine_duration = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    minimum_sparsity = 0
    try:
        minimum_sparsity = algorithm_parameters['minimum_sparsity']
    except KeyError:
        minimum_sparsity = 0
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    shift_to_start_of_window = True
    try:
        shift_to_start_of_window = algorithm_parameters[
            'shift_to_start_of_window']
    except KeyError:
        shift_to_start_of_window = True
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to = False
    try:
        save_plots_to = algorithm_parameters['save_plots_to']
    except KeyError:
        save_plots_to = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to_absolute_dir = False
    try:
        save_plots_to_absolute_dir = algorithm_parameters[
            'save_plots_to_absolute_dir']
    except KeyError:
        save_plots_to_absolute_dir = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e
    filename_prefix = False
    try:
        filename_prefix = algorithm_parameters['filename_prefix']
    except KeyError:
        filename_prefix = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' %
                             (str(algorithm_parameters)))

    return_anomalies = False
    try:
        return_anomalies = algorithm_parameters['return_anomalies']
    except KeyError:
        return_anomalies = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    try:
        realtime_analysis = algorithm_parameters['realtime_analysis']
    except KeyError:
        realtime_analysis = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    try:
        base_name = algorithm_parameters['base_name']
    except Exception as e:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors,
        # but we do not want the function returning without reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        dev_null = e
        del dev_null
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (False, None, anomalies)
        return (False, None)
    if debug_logging:
        current_logger.debug('debug :: %s :: base_name - %s' %
                             (algorithm_name, str(base_name)))

    anomalies_dict['metric'] = base_name
    anomalies_dict['anomalies'] = {}

    use_bottleneck = True
    if save_plots_to:
        use_bottleneck = False
    if use_bottleneck:
        import bottleneck as bn

    # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except
    try:
        start_preprocessing = timer()

        # INFO: Sorting time series of 10079 data points took 0.002215 seconds
        timeseries = sorted(timeseries, key=lambda x: x[0])
        if debug_logging:
            current_logger.debug('debug :: %s :: time series of length - %s' %
                                 (algorithm_name, str(len(timeseries))))

        # Testing the data to ensure it meets minimum requirements, in the case
        # of Skyline's use of the m66 algorithm this means that:
        # - the time series must have at least 75% of its full_duration
        do_not_use_sparse_data = False
        if current_skyline_app == 'luminosity':
            do_not_use_sparse_data = True

        if minimum_sparsity == 0:
            do_not_use_sparse_data = False

        total_period = 0
        total_datapoints = 0

        calculate_variables = False
        if do_not_use_sparse_data:
            calculate_variables = True
        if determine_duration:
            calculate_variables = True

        if calculate_variables:
            try:
                start_timestamp = int(timeseries[0][0])
                end_timestamp = int(timeseries[-1][0])
                total_period = end_timestamp - start_timestamp
                total_datapoints = len(timeseries)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug(
                        'debug_logging :: %s :: SystemExit called, exiting - %s'
                        % (algorithm_name, e))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid,
                                       algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error(
                        'error :: debug_logging :: %s :: failed to determine total_period and total_datapoints'
                        % (algorithm_name))
                timeseries = []
            if not timeseries:
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)

            if current_skyline_app == 'analyzer':
                # For analyzer, default the required period to 75% of FULL_DURATION
                period_required = int(FULL_DURATION * 0.75)
            else:
                # Determine from timeseries
                if total_period < FULL_DURATION:
                    period_required = int(FULL_DURATION * 0.75)
                else:
                    period_required = int(total_period * 0.75)

            if determine_duration:
                period_required = int(total_period * 0.75)

        if do_not_use_sparse_data:
            # If the time series does not have 75% of its full_duration it does
            # not have sufficient data to sample
            try:
                if total_period < period_required:
                    if debug_logging:
                        current_logger.debug(
                            'debug :: %s :: time series does not have sufficient data'
                            % (algorithm_name))
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug(
                        'debug_logging :: %s :: SystemExit called, exiting - %s'
                        % (algorithm_name, e))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid,
                                       algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error(
                        'error :: debug_logging :: %s :: failed to determine if time series has sufficient data'
                        % (algorithm_name))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)

            # If the time series does not have 75% of its full_duration
            # datapoints it does not have sufficient data to sample

            # Determine resolution from the last 30 data points
            # INFO took 0.002060 seconds
            if not resolution:
                resolution_timestamps = []
                metric_resolution = False
                for metric_datapoint in timeseries[-30:]:
                    timestamp = int(metric_datapoint[0])
                    resolution_timestamps.append(timestamp)
                timestamp_resolutions = []
                if resolution_timestamps:
                    last_timestamp = None
                    for timestamp in resolution_timestamps:
                        if last_timestamp:
                            resolution = timestamp - last_timestamp
                            timestamp_resolutions.append(resolution)
                            last_timestamp = timestamp
                        else:
                            last_timestamp = timestamp
                    try:
                        del resolution_timestamps
                    except:
                        pass
                if timestamp_resolutions:
                    try:
                        timestamp_resolutions_count = Counter(
                            timestamp_resolutions)
                        ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common(
                        )
                        metric_resolution = int(
                            ordered_timestamp_resolutions_count[0][0])
                    except SystemExit as e:
                        if debug_logging:
                            current_logger.debug(
                                'debug_logging :: %s :: SystemExit called, exiting - %s'
                                % (algorithm_name, e))
                        if current_skyline_app == 'webapp':
                            return (anomalous, anomalyScore, anomalies,
                                    anomalies_dict)
                        if return_anomalies:
                            return (anomalous, anomalyScore, anomalies)
                        return (anomalous, anomalyScore)
                    except:
                        traceback_msg = traceback.format_exc()
                        record_algorithm_error(current_skyline_app, parent_pid,
                                               algorithm_name, traceback_msg)
                        if debug_logging:
                            current_logger.error(traceback_msg)
                            current_logger.error(
                                'error :: debug_logging :: %s :: failed to determine the metric resolution'
                                % (algorithm_name))
                    try:
                        del timestamp_resolutions
                    except:
                        pass
            else:
                metric_resolution = resolution

            minimum_datapoints = None
            if metric_resolution:
                minimum_datapoints = int(period_required / metric_resolution)
            if minimum_datapoints:
                if total_datapoints < minimum_datapoints:
                    if debug_logging:
                        current_logger.debug(
                            'debug :: %s :: time series does not have sufficient data, minimum_datapoints required is %s and time series has %s'
                            % (algorithm_name, str(minimum_datapoints),
                               str(total_datapoints)))
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)

            # Is the time series fully populated?
            # full_duration_datapoints = int(full_duration / metric_resolution)
            total_period_datapoints = int(total_period / metric_resolution)
            # minimum_percentage_sparsity = 95
            minimum_percentage_sparsity = 90
            sparsity = int(total_datapoints / (total_period_datapoints / 100))
            if sparsity < minimum_percentage_sparsity:
                if debug_logging:
                    current_logger.debug(
                        'debug :: %s :: time series does not have sufficient data, minimum_percentage_sparsity required is %s and time series has %s'
                        % (algorithm_name, str(minimum_percentage_sparsity),
                           str(sparsity)))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)
            if len(set(item[1] for item in timeseries)) == 1:
                if debug_logging:
                    current_logger.debug(
                        'debug :: %s :: time series does not have sufficient variability, all the values are the same'
                        % algorithm_name)
                anomalous = False
                anomalyScore = 0.0
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)

        end_preprocessing = timer()
        preprocessing_runtime = end_preprocessing - start_preprocessing
        if debug_logging:
            current_logger.debug(
                'debug :: %s :: preprocessing took %.6f seconds' %
                (algorithm_name, preprocessing_runtime))

        if not timeseries:
            if debug_logging:
                current_logger.debug('debug :: %s :: m66 not run as no data' %
                                     (algorithm_name))
            anomalies = []
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)
        if debug_logging:
            current_logger.debug('debug :: %s :: timeseries length: %s' %
                                 (algorithm_name, str(len(timeseries))))

        anomalies_dict['timestamp'] = int(timeseries[-1][0])
        anomalies_dict['from_timestamp'] = int(timeseries[0][0])

        start_analysis = timer()
        try:
            # bottleneck is used because it is much faster
            # pd dataframe method (1445 data point - 24hrs): took 0.077915 seconds
            # bottleneck method (1445 data point - 24hrs): took 0.005692 seconds
            # numpy and pandas rolling
            # 2021-07-30 12:37:31 :: 2827897 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 136.93 seconds
            # 2021-07-30 12:44:53 :: 2855884 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 148.82 seconds
            # 2021-07-30 12:48:41 :: 2870822 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 145.62 seconds
            # 2021-07-30 12:55:00 :: 2893634 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 139.00 seconds
            # 2021-07-30 12:59:31 :: 2910443 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 144.80 seconds
            # 2021-07-30 13:02:31 :: 2922928 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 143.35 seconds
            # 2021-07-30 14:12:56 :: 3132457 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 129.25 seconds
            # 2021-07-30 14:22:35 :: 3164370 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 125.72 seconds
            # 2021-07-30 14:28:24 :: 3179687 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 222.43 seconds
            # 2021-07-30 14:33:45 :: 3179687 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 244.00 seconds
            # 2021-07-30 14:36:27 :: 3214047 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 141.10 seconds
            # numpy and bottleneck
            # 2021-07-30 16:41:52 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 73.92 seconds
            # 2021-07-30 16:46:46 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 68.84 seconds
            # 2021-07-30 16:51:48 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 70.55 seconds
            # numpy and bottleneck (passing resolution and not calculating in m66)
            # 2021-07-30 16:57:46 :: 3643253 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 65.59 seconds

            if use_bottleneck:
                if len(timeseries) < 10:
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)

                x_np = np.asarray([x[1] for x in timeseries])
                # Fast Min-Max scaling
                data = (x_np - x_np.min()) / (x_np.max() - x_np.min())

                # m66 - calculate to nth_median
                median_count = 0
                while median_count < nth_median:
                    median_count += 1
                    rolling_median_s = bn.move_median(data, window=window)
                    median = rolling_median_s.tolist()
                    data = median
                    if median_count == nth_median:
                        break

                # m66 - calculate the moving standard deviation for the
                # nth_median array
                rolling_std_s = bn.move_std(data, window=window)
                std_nth_median_array = np.nan_to_num(rolling_std_s,
                                                     copy=False,
                                                     nan=0.0,
                                                     posinf=None,
                                                     neginf=None)
                std_nth_median = std_nth_median_array.tolist()
                if debug_logging:
                    current_logger.debug(
                        'debug :: %s :: std_nth_median calculated with bn' %
                        (algorithm_name))
            else:
                df = pd.DataFrame(timeseries, columns=['date', 'value'])
                df['date'] = pd.to_datetime(df['date'], unit='s')
                datetime_index = pd.DatetimeIndex(df['date'].values)
                df = df.set_index(datetime_index)
                df.drop('date', axis=1, inplace=True)
                original_df = df.copy()
                # MinMax scale
                df = (df - df.min()) / (df.max() - df.min())
                # window = 6
                data = df['value'].tolist()

                if len(data) < 10:
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)

                # m66 - calculate to nth_median
                median_count = 0
                while median_count < nth_median:
                    median_count += 1
                    s = pd.Series(data)
                    rolling_median_s = s.rolling(window).median()
                    median = rolling_median_s.tolist()
                    data = median
                    if median_count == nth_median:
                        break

                # m66 - calculate the moving standard deviation for the
                # nth_median array
                s = pd.Series(data)
                rolling_std_s = s.rolling(window).std()

                nth_median_column = 'std_nth_median_%s' % str(nth_median)
                df[nth_median_column] = rolling_std_s.tolist()
                std_nth_median = df[nth_median_column].fillna(0).tolist()

            # m66 - calculate the standard deviation for the entire nth_median
            # array
            metric_stddev = np.std(std_nth_median)
            std_nth_median_n_sigma = []
            anomalies_found = False

            for value in std_nth_median:
                # m66 - if the value in the nth-median array is > n-sigma of
                # the metric_stddev the datapoint is anomalous
                if value > (metric_stddev * n_sigma):
                    std_nth_median_n_sigma.append(1)
                    anomalies_found = True
                else:
                    std_nth_median_n_sigma.append(0)
            std_nth_median_n_sigma_column = 'std_median_%s_%s_sigma' % (
                str(nth_median), str(n_sigma))
            if not use_bottleneck:
                df[std_nth_median_n_sigma_column] = std_nth_median_n_sigma

            anomalies = []
            # m66 - only label anomalous if the n_sigma triggers are persisted
            # for (window / 2)
            if anomalies_found:
                current_triggers = []
                for index, item in enumerate(timeseries):
                    if std_nth_median_n_sigma[index] == 1:
                        current_triggers.append(index)
                    else:
                        if len(current_triggers) > int(window / 2):
                            for trigger_index in current_triggers:
                                # Shift the anomaly back to the beginning of the
                                # window
                                if shift_to_start_of_window:
                                    anomalies.append(
                                        timeseries[(trigger_index -
                                                    (window * int(
                                                        (nth_median / 2))))])
                                else:
                                    anomalies.append(timeseries[trigger_index])
                        current_triggers = []
                # Process any remaining current_triggers
                if len(current_triggers) > int(window / 2):
                    for trigger_index in current_triggers:
                        # Shift the anomaly back to the beginning of the
                        # window
                        if shift_to_start_of_window:
                            anomalies.append(
                                timeseries[(trigger_index - (window * int(
                                    (nth_median / 2))))])
                        else:
                            anomalies.append(timeseries[trigger_index])
            if not anomalies:
                anomalous = False

            if anomalies:
                anomalous = True
                anomalies_data = []
                anomaly_timestamps = [int(item[0]) for item in anomalies]
                for item in timeseries:
                    if int(item[0]) in anomaly_timestamps:
                        anomalies_data.append(1)
                    else:
                        anomalies_data.append(0)
                if not use_bottleneck:
                    df['anomalies'] = anomalies_data
                anomalies_list = []
                for ts, value in timeseries:
                    if int(ts) in anomaly_timestamps:
                        anomalies_list.append([int(ts), value])
                        anomalies_dict['anomalies'][int(ts)] = value

            if anomalies and save_plots_to:
                try:
                    from adtk.visualization import plot
                    metric_dir = base_name.replace('.', '/')
                    timestamp_dir = str(int(timeseries[-1][0]))
                    save_path = '%s/%s/%s/%s' % (save_plots_to, algorithm_name,
                                                 metric_dir, timestamp_dir)
                    if save_plots_to_absolute_dir:
                        save_path = '%s' % save_plots_to
                    anomalies_dict['file_path'] = save_path
                    save_to_file = '%s/%s.%s.png' % (save_path, algorithm_name,
                                                     base_name)
                    if filename_prefix:
                        save_to_file = '%s/%s.%s.%s.png' % (
                            save_path, filename_prefix, algorithm_name,
                            base_name)
                    save_to_path = os_path_dirname(save_to_file)
                    title = '%s\n%s - median %s %s-sigma persisted (window=%s)' % (
                        base_name, algorithm_name, str(nth_median),
                        str(n_sigma), str(window))

                    if not os_path_exists(save_to_path):
                        try:
                            mkdir_p(save_to_path)
                        except Exception as e:
                            current_logger.error(
                                'error :: %s :: failed to create dir - %s - %s'
                                % (algorithm_name, save_to_path, e))
                    if os_path_exists(save_to_path):
                        try:
                            plot(original_df['value'],
                                 anomaly=df['anomalies'],
                                 anomaly_color='red',
                                 title=title,
                                 save_to_file=save_to_file)
                            if debug_logging:
                                current_logger.debug(
                                    'debug :: %s :: plot saved to - %s' %
                                    (algorithm_name, save_to_file))
                            anomalies_dict['image'] = save_to_file
                        except Exception as e:
                            current_logger.error(
                                'error :: %s :: failed to plot - %s - %s' %
                                (algorithm_name, base_name, e))
                    anomalies_file = '%s/%s.%s.anomalies_list.txt' % (
                        save_path, algorithm_name, base_name)
                    with open(anomalies_file, 'w') as fh:
                        fh.write(str(anomalies_list))
                        # os.chmod(anomalies_file, mode=0o644)
                    data_file = '%s/data.txt' % (save_path)
                    with open(data_file, 'w') as fh:
                        fh.write(str(anomalies_dict))
                except SystemExit as e:
                    if debug_logging:
                        current_logger.debug(
                            'debug_logging :: %s :: SystemExit called during save plot, exiting - %s'
                            % (algorithm_name, e))
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)
                except Exception as e:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid,
                                           algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error(
                            'error :: %s :: failed to plot or save anomalies file - %s - %s'
                            % (algorithm_name, base_name, e))

            try:
                del df
            except:
                pass
        except SystemExit as e:
            if debug_logging:
                current_logger.debug(
                    'debug_logging :: %s :: SystemExit called, during analysis, exiting - %s'
                    % (algorithm_name, e))
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)
        except:
            traceback_msg = traceback.format_exc()
            record_algorithm_error(current_skyline_app, parent_pid,
                                   algorithm_name, traceback_msg)
            if debug_logging:
                current_logger.error(traceback_msg)
                current_logger.error(
                    'error :: debug_logging :: %s :: failed to run on ts' %
                    (algorithm_name))
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)

        end_analysis = timer()
        analysis_runtime = end_analysis - start_analysis

        if debug_logging:
            current_logger.debug(
                'debug :: analysis with %s took %.6f seconds' %
                (algorithm_name, analysis_runtime))

        if anomalous:
            anomalyScore = 1.0
        else:
            anomalyScore = 0.0

        if debug_logging:
            current_logger.info(
                '%s :: anomalous - %s, anomalyScore - %s' %
                (algorithm_name, str(anomalous), str(anomalyScore)))

        if debug_logging:
            end = timer()
            processing_runtime = end - start
            current_logger.info('%s :: completed in %.6f seconds' %
                                (algorithm_name, processing_runtime))
        try:
            del timeseries
        except:
            pass
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        return (anomalous, anomalyScore)
    except SystemExit as e:
        if debug_logging:
            current_logger.debug(
                'debug_logging :: %s :: SystemExit called (before StopIteration), exiting - %s'
                % (algorithm_name, e))
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        return (anomalous, anomalyScore)
    except StopIteration:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors,
        # but we do not want the function returning without reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (False, None, anomalies)
        return (False, None)
    except:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (False, None, anomalies)
        return (False, None)

    if current_skyline_app == 'webapp':
        return (anomalous, anomalyScore, anomalies, anomalies_dict)
    if return_anomalies:
        return (anomalous, anomalyScore, anomalies)
    return (anomalous, anomalyScore)
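As a usage sketch only (the synthetic series, metric name and parameters below are illustrative and not from the source), a standalone call and the three-value return shape described in the docstring look roughly like this:

import os
import time

now = int(time.time())
# a flat synthetic minutely series with a sustained level shift near the end
synthetic_ts = [[float(now - (600 - i) * 60), 10.0] for i in range(600)]
for i in range(500, 520):
    synthetic_ts[i][1] = 100.0

anomalous, anomaly_score, found_anomalies = m66(
    'analyzer', os.getpid(), synthetic_ts,
    {'nth_median': 6, 'sigma': 6, 'window': 5, 'resolution': 60,
     'minimum_sparsity': 0, 'return_anomalies': True,
     'base_name': 'illustrative.metric.name'})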
Example No. 6
        a = model.fit_detect(df.iloc[:, i])
        pd.testing.assert_series_equal(a, a_true.iloc[:, i], check_dtype=False)

    a = model.fit_detect(df)
    pd.testing.assert_frame_equal(a, a_true, check_dtype=False)


if __name__ == '__main__':
    logins_per_week_filter_new = [
        20.539924073521295, 4.709207794178407, 10.567975521943765,
        4.654008712203323, 12.540417874779518, 8.68124128274528,
        14.510354653083763, 18.506005440002554, 8.653713996141803,
        2.6067706565147417, 8.568764060326458, 6.497231646713441,
        2.657897098432535, 2.4346225766375302, 0.6283913785856533,
        0.2832893392222861, 0.5798709739436374, 0.6129681116308826,
        1.1067186413836192, 1.409769359194367, 1.638155640892966
    ]

    from adtk.visualization import plot
    from adtk.detector import PersistAD
    from adtk.data import validate_series

    # validate_series requires a pandas Series with a DatetimeIndex; the weekly
    # index below is a hypothetical placeholder, as the original list carries
    # no timestamps
    s = pd.Series(logins_per_week_filter_new,
                  index=pd.date_range('2021-01-04',
                                      periods=len(logins_per_week_filter_new),
                                      freq='W'))
    s = validate_series(s)
    persist_ad = PersistAD(c=0.5,
                           side='negative',
                           agg='median',
                           lower_threshold=15.0)
    persist_ad.window = 4
    anomalies = persist_ad.fit_detect(s)
    plot(s, anomaly=anomalies, anomaly_color='red')
    from adtk.data import validate_series
    from adtk.visualization import plot
    from adtk.detector import PersistAD
    from adtk.transformer import DoubleRollingAggregate
    from adtk.pipe import Pipeline

##############################################################################################################

##Import and Validate the Dataset
s_train = pd.read_csv("./anomalies.csv",
                      index_col="timestamp",
                      parse_dates=True,
                      squeeze=True)
s_train = validate_series(s_train)
print(s_train)
plot(s_train)  ## the plot function draws the chart, but only as a static JPEG image

##############################################################################################################

## PersistAD detects spikes (extremely abnormal values)
## High-tolerance model: a large c only flags extreme deviations
persist_ad = PersistAD(
    agg='mean', side='both',
    c=6)  ## the side parameter selects positive- and/or negative-sided anomalies
anomalies_1 = persist_ad.fit_detect(s_train, return_list=True)
plot(s_train, anomaly=anomalies_1, anomaly_color="red", anomaly_tag="marker")

for i in anomalies_1:
    print(i)
print(len(anomalies_1))
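As an illustration that is not part of the original snippet, lowering c turns the same detector into a low-tolerance one that flags much smaller deviations:

## hypothetical low-tolerance counterpart to the model above
persist_ad_sensitive = PersistAD(agg='mean', side='both', c=1.5)
anomalies_2 = persist_ad_sensitive.fit_detect(s_train)
plot(s_train, anomaly=anomalies_2, anomaly_color="red", anomaly_tag="marker")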
Example No. 8
df.drop(['date'], axis=1, inplace=True)
df.head()
s_train = df

# the same for the label
df = pd.DataFrame(dti, columns=['date'])
df[1] = (Y)
df['datetime'] = pd.to_datetime(df['date'])
df = df.set_index('datetime')
df.drop(['date'], axis=1, inplace=True)
df.head()
from adtk.data import to_events
known_anomalies = to_events(df)

from adtk.visualization import plot
plot(s_train, anomaly_true=known_anomalies)
plt.plot(Y)

from adtk.detector import SeasonalAD
seasonal_ad = SeasonalAD()
anomalies = seasonal_ad.fit_detect(s_train)
plot(s_train, anomaly_pred=anomalies, ap_color='red', ap_marker_on_curve=True)

from adtk.detector import LevelShiftAD
levelshift_ad = LevelShiftAD()
anomalies = levelshift_ad.fit_detect(s_train)
plot(s_train, anomaly_pred=anomalies, ap_color='red', ap_marker_on_curve=True)

from adtk.detector import MinClusterDetector
from sklearn.cluster import KMeans
min_cluster_detector = MinClusterDetector(KMeans(n_clusters=3))
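# min_cluster_detector is instantiated above but never applied in this snippet;
# a hedged usage sketch, assuming s_train is the single-column DataFrame built
# above and using this snippet's older plot API:
anomalies_mc = min_cluster_detector.fit_detect(s_train)
plot(s_train,
     anomaly_pred=anomalies_mc,
     ap_color='red',
     ap_marker_on_curve=True)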
#from adtk.detector import GeneralizedESDTestAD
#esd_ad = GeneralizedESDTestAD(alpha=0.3)
#anomalies = esd_ad.fit_detect(traincl)
#anomalies = esd_ad.fit(trainredox)
#q=plot(traincl,title='Generalized Extreme studentized Deviate Test on Redox', anomaly_pred=anomalies, ts_linewidth=2, ts_markersize=3, ap_markersize=5, ap_color='red', ap_marker_on_curve=True);
#m=esd_ad.fit_predict(testredox,anomalies)

from adtk.detector import AutoregressionAD
autoregression_ad = AutoregressionAD(c=4.0)
autoregression_ad.fit(trainredox)  # fit the model on the training series
m1 = autoregression_ad.predict(testredox)  # predict on the test series
anomaliesde = autoregression_ad.fit_detect(testredox)
plot(testredox,
     anomaly_pred=anomaliesde,
     ts_linewidth=1,
     ap_color='red',
     ap_marker_on_curve=True)

from adtk.detector import AutoregressionAD
autoregression_ad1 = AutoregressionAD(c=4.0)
autoregression_ad1.fit(traincl2)  # fit the model on the training series
m = autoregression_ad1.predict(testcl2)  # predict on the test series

anomaliesde = autoregression_ad.fit_detect(testcl2)
plot(testcl2,
     anomaly_pred=anomaliesde,
     ts_linewidth=1,
     ap_color='red',
     ap_marker_on_curve=True)
Example No. 10
def panorama_plot_anomalies(base_name,
                            from_timestamp=None,
                            until_timestamp=None):
    """
    Create a plot of the metric with its anomalies and return the anomalies dict
    and the path and filename

    :param base_name: the name of the metric
    :param from_timestamp: the from timestamp
    :param until_timestamp: the until timestamp
    :type base_name: str
    :type from_timestamp: int
    :type until_timestamp: int
    :return: (anomalies_dict, path and file)
    :rtype:  tuple

    """

    function_str = 'panorama_plot_anomalies'

    logger.info('%s - base_name: %s, from_timestamp: %s, until_timestamp: %s' %
                (function_str, str(base_name), str(from_timestamp),
                 str(until_timestamp)))

    if not until_timestamp:
        until_timestamp = int(time())

    save_to_file = '%s/panorama_anomalies_plot.%s.%s.%s.png' % (
        settings.SKYLINE_TMP_DIR, base_name, str(from_timestamp),
        str(until_timestamp))

    try:
        metric_id = get_metric_id_from_base_name(skyline_app, base_name)
        logger.info('%s - %s with metric id:%s' %
                    (function_str, str(base_name), str(metric_id)))
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error(
            'error :: %s :: failed to determine metric id for %s - %s' %
            (function_str, base_name, err))
        raise

    try:
        anomalies_dict = get_anomalies(skyline_app,
                                       metric_id,
                                       params={'latest': False})
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error(
            'error :: %s :: failed to determine anomalies for %s - %s' %
            (function_str, base_name, err))
        raise

    if from_timestamp and anomalies_dict:
        for anomaly_id in list(anomalies_dict.keys()):
            if anomalies_dict[anomaly_id]['anomaly_timestamp'] < from_timestamp:
                del anomalies_dict[anomaly_id]

    if until_timestamp and anomalies_dict:
        for anomaly_id in list(anomalies_dict.keys()):
            if anomalies_dict[anomaly_id][
                    'anomaly_timestamp'] > until_timestamp:
                del anomalies_dict[anomaly_id]

    if os.path.isfile(save_to_file):
        return anomalies_dict, save_to_file

    if not from_timestamp and anomalies_dict:
        first_anomaly_id = list(anomalies_dict.keys())[-1]
        first_anomaly_timestamp = anomalies_dict[first_anomaly_id][
            'anomaly_timestamp']
        from_timestamp = first_anomaly_timestamp - (86400 * 7)
        logger.info(
            '%s :: the from_timestamp was not passed, calculated from the anomalies_dict as %s'
            % (function_str, str(from_timestamp)))
    if not from_timestamp and not anomalies_dict:
        logger.info(
            '%s :: the from_timestamp was not passed and no anomalies found for %s'
            % (function_str, base_name))
        from_timestamp = until_timestamp - (86400 * 7)

    metrics_functions = {}
    metrics_functions[base_name] = {}
    metrics_functions[base_name]['functions'] = None

    try:
        metrics_timeseries = get_metrics_timeseries(skyline_app,
                                                    metrics_functions,
                                                    from_timestamp,
                                                    until_timestamp,
                                                    log=False)
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error('error :: %s :: get_metrics_timeseries failed - %s' %
                     (function_str, err))
        raise

    try:
        timeseries = metrics_timeseries[base_name]['timeseries']
        # Truncate the first and last timestamp, just in case they are not
        # filled buckets
        timeseries = timeseries[1:-1]
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error('error :: %s :: failed to get timeseries for %s - %s' %
                     (function_str, base_name, err))
        raise

    unaligned_anomaly_timestamps = []
    for anomaly_id in list(anomalies_dict.keys()):
        unaligned_anomaly_timestamps.append(
            anomalies_dict[anomaly_id]['anomaly_timestamp'])

    # Align anomalies to timeseries resolution
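    # e.g. with a 60 second resolution, an anomaly at 1578916837 aligns to 1578916800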
    resolution = determine_data_frequency(skyline_app, timeseries, False)
    anomaly_timestamps = []
    for ts in unaligned_anomaly_timestamps:
        anomaly_timestamps.append(int(int(ts) // resolution * resolution))

    try:
        df = pd.DataFrame(timeseries, columns=['date', 'value'])
        df['date'] = pd.to_datetime(df['date'], unit='s')
        datetime_index = pd.DatetimeIndex(df['date'].values)
        df = df.set_index(datetime_index)
        df.drop('date', axis=1, inplace=True)
        anomalies_data = []
        for item in timeseries:
            if int(item[0]) in anomaly_timestamps:
                anomalies_data.append(1)
            else:
                anomalies_data.append(0)
        df['anomalies'] = anomalies_data
        title = '%s\n%s anomalies' % (base_name, str(len(anomaly_timestamps)))
        plot(df['value'],
             anomaly=df['anomalies'],
             anomaly_color='red',
             title=title,
             save_to_file=save_to_file)
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error('error :: %s :: failed to plot anomalies for %s - %s' %
                     (function_str, base_name, err))
        raise

    if not os.path.isfile(save_to_file):
        return anomalies_dict, None

    return anomalies_dict, save_to_file
Exemplo n.º 11
0
    m = esd_ad.fit_predict(
        testdata2,
        anomalies)  # predicting on the test redox and Cl_2 with the ESD algorithm
    pca_ad = PcaAD(k=2)
    anomalies1 = pca_ad.fit(
        traindata1)  # training redox and pH with the PCA algorithm
    m1 = pca_ad.fit_predict(
        testdata1,
        anomalies1)  #predicting on the test redox and pH with PCA algorithm
    f = pd.merge(
        pd.DataFrame(m),
        pd.DataFrame(m1),
        left_index=True,
        right_index=True,
        how='outer'
    )  # merging the anomalies from both algorithms into the same data frame
    k = OrAggregator().aggregate(f)  # OR-ing the results of both
    allk = pd.concat([allk, k])

ConfusionMatrix = confusion_matrix(
    df2['EVENT'].astype(bool)[4 * 1440:(index + 5 * 1440)], allk.astype(bool))
print(ConfusionMatrix)
target_names = ['False', 'True']
print(
    classification_report(df2['EVENT'].astype(bool)[4 * 1440:(index +
                                                              5 * 1440)],
                          allk.astype(bool),
                          target_names=target_names))
plot(allk, title='Aggregated Anomalies plot', ts_color='red', ts_linewidth=3)
plot(df2['EVENT'], title='Events plot', ts_color='green', ts_linewidth=3)
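
For reference, a minimal standalone sketch of the OR aggregation used above; the
series names and values here are illustrative, not taken from the dataset:

import pandas as pd
from adtk.aggregator import OrAggregator

idx = pd.date_range('2020-01-01', periods=4, freq='T')
detector_a = pd.Series([True, False, False, True], index=idx)
detector_b = pd.Series([False, False, True, True], index=idx)
# OrAggregator flags a point if any of the input detection series flagged it
combined = OrAggregator().aggregate(pd.concat([detector_a, detector_b], axis=1))
print(combined)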
Exemplo n.º 12
0
def get_cloudburst_plot(cloudburst_id, base_name, shift, all_in_period=False):
    """
    Create a plot of the cloudburst and return the path and filename

    :param cloudburst_id: the cloudburst id
    :param base_name: the name of the metric
    :param shift: the number of indices to shift the plot
    :type cloudburst_id: int
    :type base_name: str
    :type shift: int
    :return: path and file
    :rtype:  str

    """

    function_str = 'get_cloudburst_plot'

    logger.info(
        'get_cloudburst_plot - cloudburst_id: %s, base_name: %s' % (
            str(cloudburst_id), str(base_name)))

    save_to_file = '%s/cloudburst_id.%s.%s.shift.%s.png' % (
        settings.SKYLINE_TMP_DIR, str(cloudburst_id), base_name, str(shift))
    if all_in_period:
        save_to_file = '%s/cloudburst_id.%s.all.%s.shift.%s.png' % (
            settings.SKYLINE_TMP_DIR, str(cloudburst_id), base_name, str(shift))

    cloudburst_dict = {}
    try:
        cloudburst_dict = get_cloudburst_row(skyline_app, cloudburst_id)
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error('error :: %s :: get_cloudburst_row failed - %s' % (
            function_str, err))
        raise

    if not cloudburst_dict:
        logger.error('error :: %s :: no cloudburst_dict' % function_str)
        return None, None

    if os.path.isfile(save_to_file):
        return cloudburst_dict, save_to_file

    try:
        from_timestamp = cloudburst_dict['from_timestamp']
        until_timestamp = from_timestamp + cloudburst_dict['full_duration']
        resolution = cloudburst_dict['resolution']
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error('error :: %s :: failed to parse values from cloudburst_dict - %s' % (
            function_str, err))
        raise

    metrics_functions = {}
    metrics_functions[base_name] = {}
    metrics_functions[base_name]['functions'] = None

    if resolution > 60:
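        # e.g. a 300 second resolution becomes resolution_minutes=5, intervalString='5min', func='median'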
        resolution_minutes = int(resolution / 60)
        summarize_intervalString = '%smin' % str(resolution_minutes)
        summarize_func = 'median'
        metrics_functions[base_name]['functions'] = {'summarize': {'intervalString': summarize_intervalString, 'func': summarize_func}}

    try:
        metrics_timeseries = get_metrics_timeseries(skyline_app, metrics_functions, from_timestamp, until_timestamp, log=False)
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error('error :: %s :: get_metrics_timeseries failed - %s' % (
            function_str, err))
        raise

    try:
        timeseries = metrics_timeseries[base_name]['timeseries']
        timeseries_length = len(timeseries)
        timeseries = timeseries[1:(timeseries_length - 2)]
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error('error :: %s :: failed to determine timeseries - %s' % (
            function_str, err))
        raise

    anomalies_in_period = []
    if all_in_period:
        try:
            engine, fail_msg, trace = get_engine(skyline_app)
        except Exception as err:
            trace = traceback.format_exc()
            logger.error(trace)
            fail_msg = 'error :: %s :: could not get a MySQL engine - %s' % (function_str, err)
            logger.error('%s' % fail_msg)
            if engine:
                engine_disposal(skyline_app, engine)
            raise
        try:
            cloudburst_table, log_msg, trace = cloudburst_table_meta(skyline_app, engine)
        except Exception as err:
            logger.error(traceback.format_exc())
            logger.error('error :: %s :: failed to get cloudburst_table meta for cloudburst id %s - %s' % (
                function_str, str(cloudburst_id), err))
            if engine:
                engine_disposal(skyline_app, engine)
            raise
        try:
            connection = engine.connect()
            stmt = select([cloudburst_table]).\
                where(cloudburst_table.c.metric_id == cloudburst_dict['metric_id']).\
                where(cloudburst_table.c.timestamp >= from_timestamp).\
                where(cloudburst_table.c.timestamp <= until_timestamp).\
                where(cloudburst_table.c.id != cloudburst_id)
            result = connection.execute(stmt)
            for row in result:
                anomalies_in_period.append([row['timestamp'], row['end']])
            connection.close()
        except Exception as err:
            logger.error(traceback.format_exc())
            logger.error('error :: %s :: could not get cloudburst row for cloudburst id %s - %s' % (
                function_str, str(cloudburst_id), err))
            if engine:
                engine_disposal(skyline_app, engine)
            raise
        if engine:
            engine_disposal(skyline_app, engine)

    anomalies = []
    if anomalies_in_period:
        logger.info(
            'get_cloudburst_plot - adding %s all_in_period anomalies to cloudburst plot' % (
                str(len(anomalies_in_period))))
        for period_anomalies in anomalies_in_period:
            new_anomalies = [item for item in timeseries if int(item[0]) >= period_anomalies[0] and int(item[0]) <= period_anomalies[1]]
            if new_anomalies:
                anomalies = anomalies + new_anomalies
    try:
        cloudburst_anomalies = [item for item in timeseries if int(item[0]) >= cloudburst_dict['timestamp'] and int(item[0]) <= cloudburst_dict['end']]
        anomalies = anomalies + cloudburst_anomalies
        df = pd.DataFrame(timeseries, columns=['date', 'value'])
        df['date'] = pd.to_datetime(df['date'], unit='s')
        datetime_index = pd.DatetimeIndex(df['date'].values)
        df = df.set_index(datetime_index)
        df.drop('date', axis=1, inplace=True)
        anomalies_data = []
        # @modified 20210831
        # Align periods
        # anomaly_timestamps = [int(item[0]) for item in anomalies]
        # anomaly_timestamps = [(int(item[0]) + (resolution * 2)) for item in anomalies]
        # anomaly_timestamps = [(int(item[0]) + (resolution * 6)) for item in anomalies]
        # anomaly_timestamps = [(int(item[0]) + (resolution * 4)) for item in anomalies]
        # anomaly_timestamps = [(int(item[0]) + (resolution * 3)) for item in anomalies]
        anomaly_timestamps = [(int(item[0]) + (resolution * shift)) for item in anomalies]
        for item in timeseries:
            if int(item[0]) in anomaly_timestamps:
                anomalies_data.append(1)
            else:
                anomalies_data.append(0)
        df['anomalies'] = anomalies_data
        title = '%s\ncloudburst id: %s' % (base_name, str(cloudburst_id))
        if all_in_period:
            title = '%s (all in period)' % title
        plot(df['value'], anomaly=df['anomalies'], anomaly_color='red', title=title, save_to_file=save_to_file)
    except Exception as err:
        logger.error(traceback.format_exc())
        logger.error('error :: %s :: failed to plot cloudburst - %s' % (
            function_str, err))
        raise

    if not os.path.isfile(save_to_file):
        return cloudburst_dict, None

    return cloudburst_dict, save_to_file
Exemplo n.º 13
0
df7 = df1[['Scaled Cl_2']]
#df8=df1[['Scaled Leit']]
c = df1['EVENT']

#from adtk.detector import PcaAD
#pca_ad = PcaAD(k=1)
#anomalies= pca_ad.fit_detect(df2)
#p=plot(df2, anomaly_pred=anomalies, ts_linewidth=2, ts_markersize=3, ap_color='red', ap_alpha=0.3, curve_group='all');

from adtk.detector import GeneralizedESDTestAD
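# alpha is the significance level of the generalized ESD test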
esd_ad = GeneralizedESDTestAD(alpha=0.3)
anomalies = esd_ad.fit_detect(df2)
q = plot(df2,
         anomaly_pred=anomalies,
         ts_linewidth=2,
         ts_markersize=3,
         ap_markersize=5,
         ap_color='red',
         ap_marker_on_curve=True)

from adtk.detector import GeneralizedESDTestAD
esd_ad = GeneralizedESDTestAD(alpha=0.3)
anomalies1 = esd_ad.fit_detect(df7)
q = plot(df7,
         anomaly_pred=anomalies1,
         ts_linewidth=2,
         ts_markersize=3,
         ap_markersize=5,
         ap_color='red',
         ap_marker_on_curve=True)
Exemplo n.º 14
0
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


data_range = pd.date_range(pd.to_datetime("2020/7/11 10:30:00"), end= pd.to_datetime("2020/7/11 18:00:00"), freq="30T")
x = np.linspace(0, len(data_range), len(data_range))
data = pd.DataFrame({"time": data_range, "value": x})
data.set_index(["time"], inplace=True)
data.iloc[3:4] = 7
data.iloc[10:11] = 3

from adtk.transformer import DoubleRollingAggregate, RollingAggregate
from adtk.detector import ThresholdAD
from adtk.pipe import Pipenet

# Build the anomaly detection pipeline. In DoubleRollingAggregate the two sliding windows move side by side without overlapping, with a step of 1; initially the right edge of the left window is outside the array and the left edge of the right window is at the first element.
step = {"abs_skipe_change": {"model": DoubleRollingAggregate(agg="mean", window=(1, 1), center=False, diff="l1"),
                            "input": "original"},
        "positive_change": {"model": ThresholdAD(low=0, high=4),
                            "input": "abs_skipe_change"}
}
mypipenet = Pipenet(steps=step)

anomalies = mypipenet.fit_detect(data, return_list=True, return_intermediate=True)
print(anomalies)
from adtk.visualization import plot
plot(data, anomaly=anomalies, anomaly_color='red', ts_markersize=10, anomaly_markersize=15, ts_linewidth=3, anomaly_alpha=1)
plt.show()
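
For intuition, the DoubleRollingAggregate step above (two single-point windows
with an 'l1' difference of their means) is, up to window alignment, just the
absolute change between adjacent points; a rough equivalent sketch using the
same data frame:

abs_change = data['value'].diff().abs()
manual_anomalies = abs_change > 4  # mirrors the ThresholdAD(low=0, high=4) step
print(manual_anomalies[manual_anomalies])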
Exemplo n.º 15
0
    TimeBins = validate_series(TimeBins)

    #persist_ad = PersistAD(window=7, c=3, side='both')
    #anomalies1 = persist_ad.fit_detect(TimeBins)
    #plot(TimeBins, anomaly=anomalies1, ts_linewidth=1, ts_markersize=3, anomaly_color='red', figsize=(20,10), anomaly_tag="marker", anomaly_markersize=5)

    #customized_detector = CustomizedDetectorHD(detect_func=Detector_prive)
    #anomalies = customized_detector.detect(TimeBins)

    #threshold_ad = ThresholdAD(high=150, low=0)
    #anomalies = threshold_ad.detect(TimeBins)
    #plot(TimeBins, anomaly=anomalies, ts_linewidth=1, ts_markersize=5, anomaly_color='red', anomaly_alpha=0.3, curve_group='all');

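    # OutlierDetector adapts a scikit-learn outlier model (here LocalOutlierFactor)
    # into an ADTK detector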
    outlier_detector = OutlierDetector(
        LocalOutlierFactor(n_neighbors=1, p=1, contamination=0.05))
    anomalies = outlier_detector.fit_detect(TimeBins)

    plot(TimeBins,
         anomaly=anomalies,
         ts_linewidth=1,
         ts_markersize=5,
         anomaly_color='red',
         anomaly_alpha=0.3,
         curve_group='all')
    plt.ylim(top=460)
    plt.savefig('%d_%d.pdf' % (Input2, elem), bbox_inches='tight')
    plt.close()
    del TimeBins
    del rslt_df
    del boolean_condition
Exemplo n.º 16
0
def plot_anomalies(current_skyline_app, metric, timeseries, anomalies, title,
                   output_file):
    """
    Create a plot of a timeseries with anomalies and return the path and filename

    :param current_skyline_app: skyline_app
    :param metric: the name of the metric
    :param timeseries: the timeseries to plot
    :param anomalies: the anomaly timestamps
    :param title: the plot title
    :param output_file: the full path and filename (including .png extension) to
        save to plot as
    :type current_skyline_app: str
    :type metric: str
    :type timeseries: list
    :type anomalies: list
    :type title: str
    :type output_file: str
    :return: output_file
    :rtype:  str

    """

    function_str = 'plot_anomalies'
    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    if os.path.isfile(output_file):
        current_logger.info('%s :: %s :: plot of %s with %s anomalies exists' %
                            (str(current_skyline_app), function_str, metric,
                             str(len(anomalies))))
        return output_file

    current_logger.info(
        '%s :: %s :: plotting %s with %s anomalies' %
        (str(current_skyline_app), function_str, metric, str(len(anomalies))))

    anomalies_data = []
    last_timestamp = None
    for item in timeseries:
        anomaly_in_period = 0
        if not last_timestamp:
            last_timestamp = int(item[0])
            anomalies_data.append(anomaly_in_period)
            continue
        for anomaly_ts in anomalies:
            if anomaly_ts < last_timestamp:
                continue
            if anomaly_ts > item[0]:
                continue
            if anomaly_ts in list(range(last_timestamp, int(item[0]))):
                anomaly_in_period = 1
                break
        anomalies_data.append(anomaly_in_period)
        last_timestamp = int(item[0])

    try:
        df = pd.DataFrame(timeseries, columns=['date', 'value'])
        df['date'] = pd.to_datetime(df['date'], unit='s')
        datetime_index = pd.DatetimeIndex(df['date'].values)
        df = df.set_index(datetime_index)
        df.drop('date', axis=1, inplace=True)
        df['anomalies'] = anomalies_data
        plot(df['value'],
             anomaly=df['anomalies'],
             anomaly_color='red',
             title=title,
             save_to_file=output_file)
    except Exception as err:
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: failed to plot anomalies - %s' %
                             (function_str, err))

    if not os.path.isfile(output_file):
        current_logger.error(
            'error :: %s :: %s :: plotting %s with %s anomalies failed - no output_file exists'
            % (str(current_skyline_app), function_str, metric,
               str(len(anomalies))))
        return None

    current_logger.info('%s :: %s :: plotted %s with %s anomalies to %s' %
                        (str(current_skyline_app), function_str, metric,
                         str(len(anomalies)), output_file))

    return output_file
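
A hedged usage sketch of the function above; the app name, metric, timestamps
and output path below are hypothetical values:

timeseries = [[1578916800, 29.0], [1578916860, 30.0], [1578916920, 55.0]]
anomalies = [1578916850]
output_file = '/tmp/skyline/example.metric.anomalies.png'
plotted_file = plot_anomalies('webapp', 'example.metric', timeseries, anomalies,
                              'example.metric anomalies', output_file)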
Exemplo n.º 17
0
                    if anomalies:
                        anomalies_data = []
                        anomalies_timestamps = [int(item[0]) for item in anomalies]
                        for item in timeseries:
                            if int(item[0]) in anomalies_timestamps:
                                anomalies_data.append(1)
                            else:
                                anomalies_data.append(0)
                        rolling_df['anomalies'] = anomalies_data
                        m66_candidate_metrics[base_name] = {}
                        m66_candidate_metrics[base_name][custom_algorithm] = {}
                        m66_candidate_metrics[base_name][custom_algorithm]['anomalies'] = anomalies
                        # rolling_df['value'].plot(figsize=(18, 6), title=base_name)
                        title = '%s - median 6 6-sigma persisted' % base_name
                        # rolling_df['std_median_6_6sigma'].plot(figsize=(18, 6), title=title)
                        plot(original_rolling_df['value'], anomaly=rolling_df['anomalies'], anomaly_color='red', title=title)
            except Exception as e:
                print('error: %s' % e)
        timer_end = timer()
        print('median_6_6sigma analysis of %s metrics took %.6f seconds, significant changes now detected on %s metrics' % (
            str(len(current_base_names)), (timer_end - timer_start), str(len(m66_candidate_metrics))))
    timer_end_all = timer()
    print('%s metrics analysed with m66, took %.6f seconds - %s metrics found with significant changes' % (
        str(len(metrics)), (timer_end_all - timer_start_all), str(len(m66_candidate_metrics))))

    # Try m66 on the 3 months
    from_timestamp = 1618660800 - (86400 * 7)
    until_timestamp = from_timestamp + (((86400 * 7) * 4) * 3)
    metrics_to_do = list(metrics)
    more_analysis_metrics_timeseries = {}
    timer_start_all = timer()
Exemplo n.º 18
0
def identify_cloudbursts(current_skyline_app, plot_graphs=False, log=False):
    """
    Find significant changes (cloudbursts) in metrics.
    """

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    child_process_pid = os.getpid()
    function_str = '%s :: functions.luminosity.identify_cloudbursts' % current_skyline_app
    if log:
        current_logger.info('%s :: running for process_pid - %s' % (
            function_str, str(child_process_pid)))

    start = timer()

    full_uniques = '%sunique_metrics' % settings.FULL_NAMESPACE
    # assumed: obtain a decoded Redis connection as is done elsewhere in Skyline
    redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    unique_metrics = list(redis_conn_decoded.smembers(full_uniques))

    timer_start_all = timer()
    custom_algorithm = 'm66'
    m66_algorithm_source = '%s/custom_algorithms/m66.py' % root_path
    custom_algorithms = {}
    custom_algorithms[custom_algorithm] = {
        'algorithm_source': m66_algorithm_source,
        'algorithm_parameters': {
            'nth_median': 6, 'sigma': 6, 'window': 5, 'return_anomalies': True,
            'save_plots_to': False, 'save_plots_to_absolute_dir': False,
            'filename_prefix': False
        },
        'max_execution_time': 1.0,
        'consensus': 1,
        'algorithms_allowed_in_consensus': ['m66'],
        'run_3sigma_algorithms': False,
        'run_before_3sigma': False,
        'run_only_if_consensus': False,
        'use_with': ['crucible', 'luminosity'],
        'debug_logging': False,

    }

    m66_candidate_metrics = {}

    align = True
    truncate_last_datapoint = True
    window = 4
    summarize_intervalString = '15min'
    summarize_func = 'median'
    nth_median = 6
    n_sigma = 6
    custom_algorithm = 'median_6_6sigma'
    m66_candidate_metrics = {}
    found = 0
    now_timestamp = int(time())
    check_last = 3600

    candidate_metrics = {}

    for metric in unique_metrics:
        metric_name = metric
        if metric_name.startswith(settings.FULL_NAMESPACE):
            base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
        else:
            base_name = metric_name
        timeseries = []
        timeseries = get_metric_timeseries(skyline_app, metric, False)
        if not timeseries:
            continue
        if truncate_last_datapoint:
            timeseries_length = len(timeseries)
            timeseries = timeseries[1:(timeseries_length - 2)]

                for custom_algorithm in list(custom_algorithms.keys()):
                    custom_algorithms_dict = custom_algorithms[custom_algorithm]
                    custom_algorithm_dict = {}
                    custom_algorithm_dict['debug_logging'] = False
                    debug_algorithm_logging = False
                    if debug_algorithms:
                        custom_algorithm_dict['debug_logging'] = True
                        debug_algorithm_logging = True
                    algorithm_source = '/opt/skyline/github/skyline/skyline/custom_algorithms/%s.py' % algorithm
                    custom_algorithm_dict['algorithm_source'] = algorithm_source
                    if LUMINOSITY_CLASSIFY_ANOMALIES_SAVE_PLOTS:
                        custom_algorithm_dict['algorithm_parameters'] = {
                            'window': window, 'c': 6.0, 'return_anomalies': True,
                            'realtime_analysis': False,
                            'save_plots_to': metric_training_data_dir,
                            'save_plots_to_absolute_dir': True,
                            'filename_prefix': 'luminosity.classify_anomaly',
                            'debug_logging': debug_algorithm_logging,
                        }
                        custom_algorithm_dict['max_execution_time'] = 10.0
                    else:
                        custom_algorithm_dict['algorithm_parameters'] = {
                            'window': window, 'c': 6.0, 'return_anomalies': True,
                            'realtime_analysis': False,
                            'debug_logging': debug_algorithm_logging,
                        }
                        custom_algorithm_dict['max_execution_time'] = 5.0

                    if algorithm == base_algorithm:
                        if current_skyline_app == 'webapp':
                            anomalous, anomalyScore, anomalies, anomalies_dict = run_custom_algorithm_on_timeseries(current_skyline_app, current_pid, base_name, timeseries, custom_algorithm, custom_algorithm_dict, debug_algorithms)
                        else:
                            result, anomalyScore, anomalies = run_custom_algorithm_on_timeseries(skyline_app, current_pid, base_name, timeseries, custom_algorithm, custom_algorithm_dict, debug_algorithms)

                rolling_df = pd.DataFrame(timeseries, columns=['date', 'value'])
                rolling_df['date'] = pd.to_datetime(rolling_df['date'], unit='s')
                datetime_index = pd.DatetimeIndex(rolling_df['date'].values)
                rolling_df = rolling_df.set_index(datetime_index)
                rolling_df.drop('date', axis=1, inplace=True)
                original_rolling_df = rolling_df.copy()
                # MinMax scale
                rolling_df = (rolling_df - rolling_df.min()) / (rolling_df.max() - rolling_df.min())
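                # m66: apply a rolling median 6 times, then take the rolling std of the
                # result and flag points where it exceeds 6 times the overall std (6-sigma)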
                window = 6
                s = pd.Series(rolling_df['value'].tolist())
                for _ in range(6):
                    s = pd.Series(s.rolling(window).median().tolist())
                rolling_std_s = s.rolling(window).std()
                rolling_df['std_median_6'] = rolling_std_s.tolist()
                std_median_6 = rolling_df['std_median_6'].fillna(0).tolist()
                metric_stddev = np.std(std_median_6)
                std_median_6_6sigma = []
                anomalies = False
                for value in std_median_6:
                    if value > (metric_stddev * 6):
                        std_median_6_6sigma.append(1)
                        anomalies = True
                    else:
                        std_median_6_6sigma.append(0)
                rolling_df['std_median_6_6sigma'] = std_median_6_6sigma

                if anomalies:
                    last_trigger = None
                    current_triggers = []
                    anomalies = []
                    # Only tag anomalous if the 6sigma triggers for window
                    for index, item in enumerate(timeseries):
                        if std_median_6_6sigma[index] == 1:
                            current_triggers.append(index)
                        else:
                            if len(current_triggers) > (window / 2):
                                for trigger_index in current_triggers:
                                    anomalies.append(timeseries[(trigger_index - (window * 3))])
                            current_triggers = []
                    if anomalies:
                        anomalies_data = []
                        anomalies_timestamps = [int(item[0]) for item in anomalies]
                        for item in timeseries:
                            if int(item[0]) in anomalies_timestamps:
                                anomalies_data.append(1)
                            else:
                                anomalies_data.append(0)
                        rolling_df['anomalies'] = anomalies_data
                        m66_candidate_metrics[base_name] = {}
                        m66_candidate_metrics[base_name][custom_algorithm] = {}
                        m66_candidate_metrics[base_name][custom_algorithm]['anomalies'] = anomalies
                        # rolling_df['value'].plot(figsize=(18, 6), title=base_name)
                        title = '%s - median 6 6-sigma persisted' % base_name
                        # rolling_df['std_median_6_6sigma'].plot(figsize=(18, 6), title=title)
                        plot(original_rolling_df['value'], anomaly=rolling_df['anomalies'], anomaly_color='red', title=title)
Exemplo n.º 19
0
df7 = df1[['Scaled Cl_2']]
#df8=df1[['Scaled Leit']]
c = df1['EVENT']

#from adtk.detector import PcaAD
#pca_ad = PcaAD(k=1)
#anomalies= pca_ad.fit_detect(df2)
#p=plot(df2, anomaly_pred=anomalies, ts_linewidth=2, ts_markersize=3, ap_color='red', ap_alpha=0.3, curve_group='all');

from adtk.detector import GeneralizedESDTestAD
esd_ad = GeneralizedESDTestAD(alpha=0.3)
anomalies = esd_ad.fit_detect(df2)
q = plot(df2,
         title='Generalized Extreme Studentized Deviate Test on Redox',
         anomaly_pred=anomalies,
         ts_linewidth=2,
         ts_markersize=3,
         ap_markersize=5,
         ap_color='red',
         ap_marker_on_curve=True)

from adtk.detector import GeneralizedESDTestAD
esd_ad = GeneralizedESDTestAD(alpha=0.3)
anomalies1 = esd_ad.fit_detect(df7)
q = plot(df7,
         title='Generalized Extreme Studentized Deviate Test on Cl_2',
         anomaly_pred=anomalies1,
         ts_linewidth=3,
         ts_markersize=3,
         ap_markersize=5,
         ap_color='red',
         ap_marker_on_curve=True)
Exemplo n.º 20
0
def adtk_level_shift(current_skyline_app, parent_pid, timeseries, algorithm_parameters):
    """
    A timeseries is anomalous if a level shift occurs within a 5 window period,
    bounded by a factor of 9 of the normal range based on the historical
    interquartile range.

    :param current_skyline_app: the Skyline app executing the algorithm.  This
        will be passed to the algorithm by Skyline.  This is **required** for
        error handling and logging.  You do not have to worry about handling the
        argument in the scope of the custom algorithm itself,  but the algorithm
        must accept it as the first argument.
    :param parent_pid: the parent pid which is executing the algorithm, this is
        **required** for error handling and logging.  You do not have to worry
        about handling this argument in the scope of algorithm, but the
        algorithm must accept it as the second argument.
    :param timeseries: the time series as a list e.g. ``[[1578916800.0, 29.0],
        [1578920400.0, 55.0], ... [1580353200.0, 55.0]]``
    :param algorithm_parameters: a dictionary of any required parameters for the
        custom_algorithm and algorithm itself.  For this custom algorithm the
        following parameters are required, example:
        ``algorithm_parameters={
            'c': 9.0,
            'run_every': 5,
            'side': 'both',
            'window': 5
        }``
    :type current_skyline_app: str
    :type parent_pid: int
    :type timeseries: list
    :type algorithm_parameters: dict
    :return: True, False or None
    :rtype: boolean

    Performance is of paramount importance in Skyline, especially in terms of
    computational complexity, along with execution time and CPU usage. The
    adtk LevelShiftAD algorithm is not O(n) and it is not fast either, not when
    compared to the normal three-sigma triggered algorithms.  However it is
    useful if you care about detecting all level shifts.  The normal three-sigma
    triggered algorithms do not always detect a level shift, especially if the
    level shift does not breach the three-sigma limits.  Therefore you may find
    over time that you encounter alerts that contain level shifts that you
    thought should have been detected.  On these types of metrics and events,
    the adtk LevelShiftAD algorithm can be implemented to detect and alert on
    these.  It is not recommended to run it on all your metrics as it would
    immediately triple the analyzer runtime, even if only run every 5 windows/
    minutes.

    Due to the computational complexity and long run time of the adtk
    LevelShiftAD algorithm on the size of timeseries data used by Skyline, if
    you consider the following timings of all three-sigma triggered algorithms
    and compare them to the adtk_level_shift results in the last 2 rows of the
    below log, it is clear that running adtk_level_shift on all metrics is
    probably not desirable; even though it is possible to do, it is very
    noisy.

    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - histogram_bins run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - histogram_bins has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - histogram_bins - total: 1.051136 - median: 0.001430
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - first_hour_average run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - first_hour_average has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - first_hour_average - total: 1.322432 - median: 0.001835
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - stddev_from_average run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - stddev_from_average has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - stddev_from_average - total: 1.097290 - median: 0.001641
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - grubbs run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - grubbs has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - grubbs - total: 1.742929 - median: 0.002438
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - ks_test run 147 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - ks_test has 147 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - ks_test - total: 0.127648 - median: 0.000529
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - mean_subtraction_cumulation run 40 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - mean_subtraction_cumulation has 40 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - mean_subtraction_cumulation - total: 0.152515 - median: 0.003152
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - median_absolute_deviation run 35 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - median_absolute_deviation has 35 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - median_absolute_deviation - total: 0.143770 - median: 0.003248
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - stddev_from_moving_average run 30 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - stddev_from_moving_average has 30 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - stddev_from_moving_average - total: 0.125173 - median: 0.003092
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - least_squares run 16 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - least_squares has 16 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - least_squares - total: 0.089108 - median: 0.005538
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - abs_stddev_from_median run 1 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - abs_stddev_from_median has 1 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - abs_stddev_from_median - total: 0.036797 - median: 0.036797
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - adtk_level_shift run 271 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - adtk_level_shift has 271 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - adtk_level_shift - total: 13.729565 - median: 0.035791
    ...
    ...
    2021-03-06 10:46:39 :: 1582754 :: seconds to run     :: 27.93  # THE TOTAL ANALYZER RUNTIME

    Therefore the analysis methodology implemented for the adtk_level_shift
    custom_algorithm is as follows:

    - When new metrics are added either to the configuration or by actual new
    metrics coming online that match the ``algorithm_parameters['namespace']``,
    Skyline implements sharding on new metrics into time slots to prevent a
    thundering herd situation from developing.  A newly added metric will
    eventually be assigned into a time shard and the last analysed timestamp
    will be added to the ``analyzer.last.adtk_level_shift`` Redis hash key to
    determine the next scheduled run with ``algorithm_parameters['namespace']``.

    - A ``run_every`` parameter is implemented so that the algorithm can be
    configured to run on a metric once every ``run_every`` minutes.  The default
    is to run it every 5 minutes using window 5 (rolling) and trigger as
    anomalous if the algorithm labels any of the last 5 datapoints as anomalous.
    This means that there could be up to a 5 minute delay on an alert on the
    60 second, 168 SECOND_ORDER_RESOLUTION_HOURS metrics in the example, but a
    ``c=9.0`` level shift would be detected and would be alerted on (if both
    analyzer and mirage triggered on it).  This periodic running of the
    algorithm is a tradeoff so that the adtk_level_shift load and runtime can be
    spread over ``run_every`` minutes.

    - The algorithm is not run against metrics that are sparsely populated.
    When the algorithm is run on sparsely populated metrics it results in lots
    of false positives and noise.

    The Skyline CUSTOM_ALGORITHMS implementation of the adtk LevelShiftAD
    algorithm is configured as the example shown below.  However please note
    that the algorithm_parameters shown in this example configuration are
    suitable for metrics that have a 60 second resolution and have a
    :mod:`settings.ALERTS` Mirage SECOND_ORDER_RESOLUTION_HOURS of 168 (7 days).
    Metrics with a different resolution/frequency may require different
    values appropriate to the metric resolution.

    Example CUSTOM_ALGORITHMS configuration:

    'adtk_level_shift': {
        'namespaces': [
            'skyline.analyzer.run_time', 'skyline.analyzer.total_metrics',
            'skyline.analyzer.exceptions'
        ],
        'algorithm_source': '/opt/skyline/github/skyline/skyline/custom_algorithms/adtk_level_shift.py',
        'algorithm_parameters': {'c': 9.0, 'run_every': 5, 'side': 'both', 'window': 5},
        'max_execution_time': 0.5,
        'consensus': 1,
        'algorithms_allowed_in_consensus': ['adtk_level_shift'],
        'run_3sigma_algorithms': True,
        'run_before_3sigma': True,
        'run_only_if_consensus': False,
        'use_with': ["analyzer", "mirage"],
        'debug_logging': False,
    },
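
    Stripped of Skyline's preprocessing, sharding and sparsity checks, the core
    detection this algorithm performs is, as a minimal sketch::

        import pandas as pd
        from adtk.data import validate_series
        from adtk.detector import LevelShiftAD

        # timeseries is a list of [timestamp, value] items as described above
        df = pd.DataFrame(timeseries, columns=['date', 'value'])
        df['date'] = pd.to_datetime(df['date'], unit='s')
        df = df.set_index(pd.DatetimeIndex(df['date'].values))
        df.drop('date', axis=1, inplace=True)
        s = validate_series(df)
        anomaly_df = LevelShiftAD(c=9.0, side='both', window=5).fit_detect(s)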

    """

    # You MUST define the algorithm_name
    algorithm_name = 'adtk_level_shift'

    # Define the default state of None and None, anomalous does not default to
    # False as that is not correct, False is only correct if the algorithm
    # determines the data point is not anomalous.  The same is true for the
    # anomalyScore.
    anomalous = None
    anomalyScore = None

    # @added 20210308 - Feature #3978: luminosity - classify_metrics
    #                  Feature #3642: Anomaly type classification
    return_anomalies = False
    anomalies = []
    realtime_analysis = True

    current_logger = None

    # If you wanted to log, you can but this should only be done during
    # testing and development
    def get_log(current_skyline_app):
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        return current_logger

    start = timer()

    # Use the algorithm_parameters to determine the sample_period
    debug_logging = None
    try:
        debug_logging = algorithm_parameters['debug_logging']
    except:
        debug_logging = False
    if debug_logging:
        try:
            current_logger = get_log(current_skyline_app)
            current_logger.debug('debug :: %s :: debug_logging enabled with algorithm_parameters - %s' % (
                algorithm_name, str(algorithm_parameters)))
        except:
            # This except pattern MUST be used in ALL custom algorithms to
            # facilitate the traceback from any errors.  We want the algorithm to
            # run super fast and without spamming the log with lots of errors.
            # But we do not want the function returning and not reporting
            # anything to the log, so the pythonic except is used to "sample" any
            # algorithm errors to a tmp file and report once per run rather than
            # spewing tons of errors into the log e.g. analyzer.log
            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
            # Return None and None as the algorithm could not determine True or False
            return (False, None)

    # Allow the LevelShiftAD window parameter to be passed in the
    # algorithm_parameters
    window = 5
    try:
        window = algorithm_parameters['window']
    except:
        pass

    # Allow the LevelShiftAD c parameter to be passed in the
    # algorithm_parameters
    c = 9.0
    try:
        c = algorithm_parameters['c']
    except:
        pass

    run_every = window
    try:
        run_every = algorithm_parameters['run_every']
    except:
        pass

    side = 'both'
    try:
        side = algorithm_parameters['side']
    except:
        pass

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' % (
            str(algorithm_parameters)))

    # @added 20210308 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    try:
        return_anomalies = algorithm_parameters['return_anomalies']
    except:
        return_anomalies = False
    try:
        realtime_analysis = algorithm_parameters['realtime_analysis']
    except:
        realtime_analysis = True

    # @added 20210316 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    save_plots_to = False
    try:
        save_plots_to = algorithm_parameters['save_plots_to']
    except:
        pass

    # @added 20210323 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    save_plots_to_absolute_dir = False
    try:
        save_plots_to_absolute_dir = algorithm_parameters['save_plots_to_absolute_dir']
    except:
        pass
    filename_prefix = False
    try:
        filename_prefix = algorithm_parameters['filename_prefix']
    except:
        pass

    # @added 20210318 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    run_PersistAD = False
    try:
        run_PersistAD = algorithm_parameters['run_PersistAD']
    except:
        pass

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' % (
            str(algorithm_parameters)))

    try:
        base_name = algorithm_parameters['base_name']
    except:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors.
        # But we do not want the function returning and not reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)
    if debug_logging:
        current_logger.debug('debug :: %s :: base_name - %s' % (
            algorithm_name, str(base_name)))

    # Due to the load and runtime of LevelShiftAD it is only run in analyzer
    # periodically
    if current_skyline_app == 'analyzer':
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
        last_hash_key = 'analyzer.last.%s' % algorithm_name
        last_check = None
        try:
            raw_last_check = redis_conn_decoded.hget(last_hash_key, base_name)
            last_check = int(raw_last_check)
        except:
            last_check = None
        last_window_timestamps = [int(item[0]) for item in timeseries[-run_every:]]
        if last_check in last_window_timestamps:
            if debug_logging:
                current_logger.debug('debug :: %s :: run_every period is not over yet, skipping base_name - %s' % (
                    algorithm_name, str(base_name)))
            if return_anomalies:
                return (False, None, anomalies)
            else:
                return (False, None)

        # If there is no last timestamp, shard the metric, it will eventually
        # be added.
        if not last_check:
            now = datetime.datetime.now()
            now_seconds = int(now.second)
            if now_seconds == 0:
                now_seconds = 1
            period_seconds = int(60 / run_every)
            shard = int(period_seconds)
            last_shard = 60
            shard = int(period_seconds)
            shards = [shard]
            while shard < last_shard:
                shard = shard + period_seconds
                shards.append((shard))
            shard_value = round(now_seconds / shards[0]) * shards[0]
            if shard_value <= shards[0]:
                shard_value = shards[0]
            metric_as_bytes = str(base_name).encode()
            value = zlib.adler32(metric_as_bytes)
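            # The adler32 checksum of the metric name modulo the shard index
            # determines whether this metric falls into the current time shard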
            shard_index = [(index + 1) for index, s_value in enumerate(shards) if s_value == shard_value][0]
            modulo_result = value % shard_index
            if modulo_result == 0:
                if debug_logging:
                    current_logger.debug('debug :: %s :: skipping as not sharded into this run - %s' % (
                        algorithm_name, str(base_name)))
            if return_anomalies:
                return (False, None, anomalies)
            else:
                return (False, None)
        if debug_logging:
            current_logger.debug('debug :: %s :: analysing %s' % (
                algorithm_name, str(base_name)))

        try:
            int_metric_timestamp = int(timeseries[-1][0])
        except:
            int_metric_timestamp = 0
        if int_metric_timestamp:
            try:
                redis_conn_decoded.hset(
                    last_hash_key, base_name,
                    int_metric_timestamp)
            except:
                pass

    # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except
    try:
        start_preprocessing = timer()

        # INFO: Sorting time series of 10079 data points took 0.002215 seconds
        timeseries = sorted(timeseries, key=lambda x: x[0])
        if debug_logging:
            current_logger.debug('debug :: %s :: time series of length - %s' % (
                algorithm_name, str(len(timeseries))))

        # Testing the data to ensure it meets minimum requirements, in the case
        # of Skyline's use of the LevelShiftAD algorithm this means that:
        # - the time series must have at least 75% of its full_duration
        # - the time series must have at least 99% of the data points in the
        #   sample being analysed.
        do_not_use_sparse_data = False
        if current_skyline_app == 'analyzer':
            do_not_use_sparse_data = True

        # @added 20210305 - Feature #3970: custom_algorithm - adtk_level_shift
        #                   Task #3664:: POC with adtk
        # With mirage also do not run LevelShiftAD on sparsely populated data
        if current_skyline_app == 'mirage':
            do_not_use_sparse_data = True

        # @added 20210309 - Feature #3978: luminosity - classify_metrics
        #                  Feature #3642: Anomaly type classification
        if current_skyline_app == 'luminosity':
            do_not_use_sparse_data = True

        if do_not_use_sparse_data:

            total_period = 0
            total_datapoints = 0
            try:
                start_timestamp = int(timeseries[0][0])
                end_timestamp = int(timeseries[-1][0])
                total_period = end_timestamp - start_timestamp
                total_datapoints = len(timeseries)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                        algorithm_name, e))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error('error :: debug_logging :: %s :: failed to determine total_period and total_datapoints' % (
                        algorithm_name))
                timeseries = []
            if not timeseries:
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

            if current_skyline_app == 'analyzer':
                # For analyzer, default the required period to 18 hours
                period_required = int(FULL_DURATION * 0.75)
            else:
                # Determine from timeseries
                if total_period < FULL_DURATION:
                    period_required = int(FULL_DURATION * 0.75)
                else:
                    period_required = int(total_period * 0.75)

            # If the time series does not have 75% of its full_duration it does not
            # have sufficient data to sample
            try:
                if total_period < period_required:
                    if debug_logging:
                        current_logger.debug('debug :: %s :: time series does not have sufficient data' % (
                            algorithm_name))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                        algorithm_name, e))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error('error :: debug_logging :: %s :: failed to determine if time series has sufficient data' % (
                        algorithm_name))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

            # If the time series does not have 75% of its full_duration data points
            # it does not have sufficient data to sample

            # Determine resolution from the last 30 data points
            # INFO took 0.002060 seconds
            resolution_timestamps = []
            metric_resolution = False
            for metric_datapoint in timeseries[-30:]:
                timestamp = int(metric_datapoint[0])
                resolution_timestamps.append(timestamp)
            timestamp_resolutions = []
            if resolution_timestamps:
                last_timestamp = None
                for timestamp in resolution_timestamps:
                    if last_timestamp:
                        resolution = timestamp - last_timestamp
                        timestamp_resolutions.append(resolution)
                        last_timestamp = timestamp
                    else:
                        last_timestamp = timestamp
                try:
                    del resolution_timestamps
                except:
                    pass
            if timestamp_resolutions:
                try:
                    timestamp_resolutions_count = Counter(timestamp_resolutions)
                    ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common()
                    metric_resolution = int(ordered_timestamp_resolutions_count[0][0])
                except SystemExit as e:
                    if debug_logging:
                        current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                            algorithm_name, e))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)
                except:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error('error :: debug_logging :: %s :: failed to determine the metric resolution' % (
                            algorithm_name))
                try:
                    del timestamp_resolutions
                except:
                    pass
            minimum_datapoints = None
            if metric_resolution:
                minimum_datapoints = int(period_required / metric_resolution)
            if minimum_datapoints:
                if total_datapoints < minimum_datapoints:
                    if debug_logging:
                        current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_datapoints required is %s and time series has %s' % (
                            algorithm_name, str(minimum_datapoints),
                            str(total_datapoints)))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)

            # Is the time series fully populated?
            # full_duration_datapoints = int(full_duration / metric_resolution)
            total_period_datapoints = int(total_period / metric_resolution)
            # minimum_percentage_sparsity = 95
            minimum_percentage_sparsity = 90
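            # sparsity is the percentage of the expected datapoints (at the
            # determined resolution) that are actually present in the period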
            sparsity = int(total_datapoints / (total_period_datapoints / 100))
            if sparsity < minimum_percentage_sparsity:
                if debug_logging:
                    current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_percentage_sparsity required is %s and time series has %s' % (
                        algorithm_name, str(minimum_percentage_sparsity),
                        str(sparsity)))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            if len(set(item[1] for item in timeseries)) == 1:
                if debug_logging:
                    current_logger.debug('debug :: %s :: time series does not have sufficient variability, all the values are the same' % algorithm_name)
                anomalous = False
                anomalyScore = 0.0
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

        end_preprocessing = timer()
        preprocessing_runtime = end_preprocessing - start_preprocessing
        if debug_logging:
            current_logger.debug('debug :: %s :: preprocessing took %.6f seconds' % (
                algorithm_name, preprocessing_runtime))

        if not timeseries:
            if debug_logging:
                current_logger.debug('debug :: %s :: LevelShiftAD not run as no data' % (
                    algorithm_name))
            anomalies = []
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)
        else:
            if debug_logging:
                current_logger.debug('debug :: %s :: timeseries length: %s' % (
                    algorithm_name, str(len(timeseries))))

        if len(timeseries) < 100:
            if debug_logging:
                current_logger.debug('debug :: %s :: time series does not have sufficient data' % (
                    algorithm_name))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)

        start_analysis = timer()
        try:
            df = pd.DataFrame(timeseries, columns=['date', 'value'])
            df['date'] = pd.to_datetime(df['date'], unit='s')
            datetime_index = pd.DatetimeIndex(df['date'].values)
            df = df.set_index(datetime_index)
            df.drop('date', axis=1, inplace=True)
            s = validate_series(df)
            level_shift_ad = LevelShiftAD(c=c, side=side, window=window)
            anomaly_df = level_shift_ad.fit_detect(s)
            anomalies = anomaly_df.loc[anomaly_df['value'] > 0]
            anomalous = False
            if len(anomalies) > 0:
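                # Convert the anomaly DatetimeIndex back to unix timestamps
                # (seconds) so they can be matched against the raw timeseries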
                anomaly_timestamps = list(anomalies.index.astype(np.int64) // 10**9)
                if realtime_analysis:
                    last_window_timestamps = [int(item[0]) for item in timeseries[-window:]]
                    # if timeseries[-1][0] in anomaly_timestamps:
                    for timestamp in last_window_timestamps:
                        if timestamp in anomaly_timestamps:
                            anomalous = True
                            break
                else:
                    anomalous = True
                    # Convert anomalies dataframe to anomalies_list
                    anomalies_list = []

                    # @added 20210316 - Feature #3978: luminosity - classify_metrics
                    #                   Feature #3642: Anomaly type classification
                    # Convert anomalies dataframe to anomalies_dict
                    anomalies_dict = {}
                    anomalies_dict['metric'] = base_name
                    anomalies_dict['timestamp'] = int(timeseries[-1][0])
                    anomalies_dict['from_timestamp'] = int(timeseries[0][0])
                    anomalies_dict['algorithm'] = algorithm_name
                    anomalies_dict['anomalies'] = {}

                    for ts, value in timeseries:
                        if int(ts) in anomaly_timestamps:
                            anomalies_list.append([int(ts), value])
                            anomalies_dict['anomalies'][int(ts)] = value
                    anomalies = list(anomalies_list)

                    # @added 20210316 - Feature #3978: luminosity - classify_metrics
                    #                   Feature #3642: Anomaly type classification
                    if save_plots_to:
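                        # Save the plot, anomalies list and metadata to disk so
                        # the detection can be reviewed and classified later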
                        try:
                            from adtk.visualization import plot
                            metric_dir = base_name.replace('.', '/')
                            timestamp_dir = str(int(timeseries[-1][0]))
                            save_path = '%s/%s/%s/%s' % (
                                save_plots_to, algorithm_name, metric_dir,
                                timestamp_dir)
                            if save_plots_to_absolute_dir:
                                save_path = '%s' % save_plots_to
                            anomalies_dict['file_path'] = save_path
                            save_to_file = '%s/%s.%s.png' % (
                                save_path, algorithm_name, base_name)
                            if filename_prefix:
                                save_to_file = '%s/%s.%s.%s.png' % (
                                    save_path, filename_prefix, algorithm_name,
                                    base_name)
                            save_to_path = os_path_dirname(save_to_file)
                            title = '%s\n%s' % (algorithm_name, base_name)
                            if not os_path_exists(save_to_path):
                                try:
                                    mkdir_p(save_to_path)
                                except Exception as e:
                                    current_logger.error('error :: %s :: failed to create dir - %s - %s' % (
                                        algorithm_name, save_to_path, e))
                            if os_path_exists(save_to_path):
                                try:
                                    plot(s, anomaly=anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file)
                                    if debug_logging:
                                        current_logger.debug('debug :: %s :: plot saved to - %s' % (
                                            algorithm_name, save_to_file))
                                except Exception as e:
                                    current_logger.error('error :: %s :: failed to plot - %s - %s' % (
                                        algorithm_name, base_name, e))
                            anomalies_file = '%s/%s.%s.anomalies_list.txt' % (
                                save_path, algorithm_name, base_name)
                            with open(anomalies_file, 'w') as fh:
                                fh.write(str(anomalies_list))
                                # os.chmod(anomalies_file, mode=0o644)
                            data_file = '%s/data.txt' % (save_path)
                            with open(data_file, 'w') as fh:
                                fh.write(str(anomalies_dict))
                        except SystemExit as e:
                            if debug_logging:
                                current_logger.debug('debug_logging :: %s :: SystemExit called during save plot, exiting - %s' % (
                                    algorithm_name, e))
                            if return_anomalies:
                                return (anomalous, anomalyScore, anomalies)
                            else:
                                return (anomalous, anomalyScore)
                        except Exception as e:
                            traceback_msg = traceback.format_exc()
                            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                            if debug_logging:
                                current_logger.error(traceback_msg)
                                current_logger.error('error :: %s :: failed to plot or save anomalies file - %s - %s' % (
                                    algorithm_name, base_name, e))
            else:
                anomalies = []

            # @added 20210318 - Feature #3978: luminosity - classify_metrics
            #                   Feature #3642: Anomaly type classification
            if anomalies and run_PersistAD and not realtime_analysis:
                persist_ad_algorithm_parameters = {}
                try:
                    persist_ad_algorithm_parameters = algorithm_parameters['persist_ad_algorithm_parameters']
                except:
                    pass
                persist_ad_window = 20
                try:
                    persist_ad_window = persist_ad_algorithm_parameters['window']
                except:
                    pass
                persist_ad_c = 9.9
                try:
                    persist_ad_c = persist_ad_algorithm_parameters['c']
                except:
                    pass
                try:
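                    # Verify the level shifts with PersistAD, which flags values
                    # that deviate from the values in their preceding time window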
                    from adtk.detector import PersistAD
                    persist_ad = PersistAD(c=persist_ad_c, side='both', window=persist_ad_window)
                    persist_ad_anomaly_df = persist_ad.fit_detect(s)
                    persist_ad_anomalies = persist_ad_anomaly_df.loc[persist_ad_anomaly_df['value'] > 0]
                    if len(persist_ad_anomalies) > 0:
                        current_logger.info('%s :: %s anomalies found with PersistAD on %s' % (
                            algorithm_name, str(len(persist_ad_anomalies)),
                            base_name))
                        persist_ad_anomaly_timestamps = list(persist_ad_anomalies.index.astype(np.int64) // 10**9)
                        # Convert persist_ad_anomalies dataframe to persist_ad_anomalies_list
                        persist_ad_anomalies_list = []
                        persist_ad_anomalies_dict = {}
                        persist_ad_anomalies_dict['metric'] = base_name
                        persist_ad_anomalies_dict['timestamp'] = int(timeseries[-1][0])
                        persist_ad_anomalies_dict['from_timestamp'] = int(timeseries[0][0])
                        persist_ad_anomalies_dict['algorithm'] = 'adtk_PersistAD'
                        persist_ad_anomalies_dict['anomalies'] = {}

                        for ts, value in timeseries:
                            if int(ts) in persist_ad_anomaly_timestamps:
                                persist_ad_anomalies_list.append([int(ts), value])
                                persist_ad_anomalies_dict['anomalies'][int(ts)] = value
                        persist_ad_anomalies = list(persist_ad_anomalies_list)
                        if save_plots_to:
                            try:
                                from adtk.visualization import plot
                                metric_dir = base_name.replace('.', '/')
                                timestamp_dir = str(int(timeseries[-1][0]))
                                save_path = '%s/%s/%s/%s' % (
                                    save_plots_to, algorithm_name, metric_dir,
                                    timestamp_dir)
                                if save_plots_to_absolute_dir:
                                    save_path = '%s' % save_plots_to
                                persist_ad_anomalies_dict['file_path'] = save_path
                                save_to_file = '%s/%s.PersistAD.%s.png' % (
                                    save_path, algorithm_name, base_name)
                                if filename_prefix:
                                    save_to_file = '%s/%s.%s.PersistAD.%s.png' % (
                                        save_path, filename_prefix, algorithm_name,
                                        base_name)
                                save_to_path = os_path_dirname(save_to_file)
                                title = '%s - PersistAD verification\n%s' % (algorithm_name, base_name)
                                if not os_path_exists(save_to_path):
                                    try:
                                        mkdir_p(save_to_path)
                                    except Exception as e:
                                        current_logger.error('error :: %s :: failed to create dir - %s - %s' % (
                                            algorithm_name, save_to_path, e))
                                if os_path_exists(save_to_path):
                                    try:
                                        plot(s, anomaly=persist_ad_anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file)
                                        if debug_logging:
                                            current_logger.debug('debug :: %s :: plot saved to - %s' % (
                                                algorithm_name, save_to_file))
                                    except Exception as e:
                                        current_logger.error('error :: %s :: failed to plot - %s - %s' % (
                                            algorithm_name, base_name, e))
                                anomalies_file = '%s/%s.%s.PersistAD.anomalies_list.txt' % (
                                    save_path, algorithm_name, base_name)
                                with open(anomalies_file, 'w') as fh:
                                    fh.write(str(persist_ad_anomalies))
                                    # os.chmod(anomalies_file, mode=0o644)
                                data_file = '%s/PersistAD.data.txt' % (save_path)
                                with open(data_file, 'w') as fh:
                                    fh.write(str(persist_ad_anomalies_dict))
                            except Exception as e:
                                traceback_msg = traceback.format_exc()
                                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                                if debug_logging:
                                    current_logger.error(traceback_msg)
                                    current_logger.error('error :: %s :: failed to plot or save PersistAD anomalies file - %s - %s' % (
                                        algorithm_name, base_name, e))
                except Exception as e:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error('error :: %s :: failed to run analysis with PersistAD - %s - %s' % (
                            algorithm_name, base_name, e))
            try:
                del df
            except:
                pass
        except SystemExit as e:
            if debug_logging:
                current_logger.debug('debug_logging :: %s :: SystemExit called, during analysis, exiting - %s' % (
                    algorithm_name, e))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)
        except:
            traceback_msg = traceback.format_exc()
            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
            if debug_logging:
                current_logger.error(traceback_msg)
                current_logger.error('error :: debug_logging :: %s :: failed to run LevelShiftAD on the timeseries' % (
                    algorithm_name))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)

        end_analysis = timer()
        analysis_runtime = end_analysis - start_analysis

        if debug_logging:
            current_logger.debug('debug :: %s :: LevelShiftAD took %.6f seconds' % (
                algorithm_name, analysis_runtime))

        if anomalous:
            anomalyScore = 1.0
        else:
            anomalyScore = 0.0

        if debug_logging:
            current_logger.info('%s :: anomalous - %s, anomalyScore - %s' % (
                algorithm_name, str(anomalous), str(anomalyScore)))

        if debug_logging:
            end = timer()
            processing_runtime = end - start
            current_logger.info('%s :: completed analysis in %.6f seconds' % (
                algorithm_name, processing_runtime))
        try:
            del timeseries
        except:
            pass
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        else:
            return (anomalous, anomalyScore)

    except SystemExit as e:
        if debug_logging:
            current_logger.debug('debug_logging :: %s :: SystemExit called (before StopIteration), exiting - %s' % (
                algorithm_name, e))
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        else:
            return (anomalous, anomalyScore)
    except StopIteration:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors.
        # But we do not want the function returning and not reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)
    except:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
        # Return False and None as the algorithm could not determine True or False
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)

    if return_anomalies:
        return (anomalous, anomalyScore, anomalies)
    else:
        return (anomalous, anomalyScore)
Exemplo n.º 21
0
import json
import pandas as pd
from adtk.data import validate_series
from adtk.visualization import plot
from adtk.detector import SeasonalAD

with open('data.txt') as json_file:
    data = json.load(json_file)
    print(data)

s_train = pd.read_csv("./training.csv", index_col="Datetime", parse_dates=True, squeeze=True)
s_train = validate_series(s_train)
# print(s_train)
plot(s_train)
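# SeasonalAD learns the seasonal pattern of the series and flags points that deviate from it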
seasonal_ad = SeasonalAD()
anomalies = seasonal_ad.fit_detect(s_train)
print(anomalies)
plot(s_train, anomaly_pred=anomalies, ap_color='red', ap_marker_on_curve=True)

# from firebase import Firebase
#
# config = {
#     "apiKey" : "AIzaSyDHWPY4NelJCF-UkuLjcH2WX4njgU5TDVI",
#     "authDomain" : "fireguard-88888.firebaseapp.com",
#     "databaseURL" : "https://fireguard-88888.firebaseio.com",
#     "projectId" : "fireguard-88888",
#     "storageBucket": "fireguard-88888.appspot.com",
#     "messagingSenderId": "434458514176",
#     "appId": "1:434458514176:web:60d16d55a6f382e7e899e5"
# }
#
Exemplo n.º 22
0
# NOT NEEDED AS ADTK HANDLES DATETIME INDEXING
# # data vis
# chart_data = data[['date', 'sessions']]
# chart_data.head

# # Convert df date column to pd.to_datetime and swap out date for datetime index in df2
# datetime_series = pd.to_datetime(chart_data['date'])
# datetime_index = pd.DatetimeIndex(datetime_series.values)
# df2=data.set_index(datetime_index)
# df2.drop('date',axis=1,inplace=True)

# # validate and data vis
# chart_data = df2[['sessions']]
# print(chart_data)

data = pd.read_csv(csv_data, index_col=DATE_COL, parse_dates=True)
s = data['sessions']
s = validate_series(s)

# Threshold analysis
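# ThresholdAD is a rule based detector that compares each value to the fixed
# high/low bounds, so it needs no training and detect() is used rather than fit_detect()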
threshold_ad = ThresholdAD(high=100000, low=60000)
anomalies = threshold_ad.detect(s)

# Visualise threshold AD
plot(s,
     anomaly=anomalies,
     ts_linewidth=1,
     ts_markersize=3,
     anomaly_markersize=5,
     anomaly_color='red',
     anomaly_tag="marker")
#!/usr/bin/env python3

import pandas as pd
from adtk.data import validate_series
from adtk.visualization import plot
from adtk.detector import LevelShiftAD

s_train = pd.read_csv("./Kohl_BB_Data.csv", index_col="time", parse_dates=True, squeeze=True)
s_train = validate_series(s_train)
#print(s_train)

plot(s_train)

level_shift_ad = LevelShiftAD(c=6.0, side='both', window=1)  # This almost matches TSOutlier
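# c is the factor applied to the historical inter-quartile range to set the
# threshold, and window is the length of the two adjacent sliding windows
# whose medians are compared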
anomalies_1 = level_shift_ad.fit_detect(s_train)


s_test_output = pd.concat([s_train, anomalies_1], axis=1)
print(s_test_output)
plot(s_train, anomaly=anomalies_1, anomaly_color='red')