    def run(self, data, max_evals=50):
        """
        This function runs hyperparameter optimization for LAD batch outlier detection models

        :param list[list] data: Input time series.
        :param int max_evals: Number of iterations for hyperparameter optimization.
        :return: Optimal hyperparameters.
        :rtype: dict

        >>> data
        [[Timestamp('2020-01-01 00:00:00'), 1326.0],
        [Timestamp('2020-01-02 00:00:00'), 1552.0],
        [Timestamp('2020-01-03 00:00:00'), 1432.0],
        ...,
        [Timestamp('2020-06-06 00:00:00'), 1747.0],
        [Timestamp('2020-06-07 00:00:00'), 1782.0]]
        >>> hopt_obj = HyperparameterOptimization(freq='D', detection_type='OutlierDetection')
        >>> hyper_params = hopt_obj.run(data=data, max_evals=5)

        >>> hyper_params
        {'LuminaireModel': 'LADStructuralModel', 'data_shift_truncate': 0, 'fill_rate': 0.8409249603686499,
        'include_holidays_exog': 1, 'is_log_transformed': 1, 'max_ft_freq': 3, 'p': 4, 'q': 3}
        """

        # Calling data exploration to perform imputation only
        de_obj = DataExploration(freq=self.freq,
                                 detection_type=self.detection_type)
        data, summary = de_obj.profile(df=data, impute_only=True)

        if summary['success']:
            return self._optimize(data=data,
                                  objective_part=self._objective_part,
                                  max_evals=max_evals)
        else:
            return None
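
# A minimal sketch of the kind of search ``_optimize`` wraps. The STATUS_OK
# constant used in the objective functions below points to hyperopt; the
# search space here is purely illustrative, not Luminaire's actual space.
from hyperopt import fmin, hp, tpe

def optimize_sketch(objective, max_evals=50):
    # Hypothetical two-parameter space over ARIMA-style orders.
    space = {'p': hp.choice('p', list(range(1, 7))),
             'q': hp.choice('q', list(range(1, 7)))}
    # TPE sequentially proposes candidates that minimize the objective's loss.
    return fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals)
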
    def _training(self, data, ts_start, ts_end, min_ts_length=None, min_ts_mean=None, min_ts_mean_window=None,
                  max_ft_freq=None, include_holidays=None, optimize=None, **kwargs):
        """
        This function performs the training for the input time series

        :param pandas.DataFrame data: Input time series data
        :param str ts_start: Start of the time series data
        :param str ts_end: End of the time series data
        :param int min_ts_length: Minimum required length for the time series data
        :param float min_ts_mean: Minimum mean value for the time series data for training over a fixed length window
        (min_ts_mean_window)
        :param int min_ts_mean_window: Length of the window to check min_ts_mean
        :param int max_ft_freq: Maximum number of frequencies for the Fourier transformation
        :param bool include_holidays: Flag to include holidays as exogenous variables in the model
        :param bool optimize: Flag to identify whether called from hyperparameter optimization
        :return: Trained model and optimal LAD structural model order (p, d, q)
        :rtype: tuple[dict, tuple[int, int, int]]
        """
        from numpy.linalg import LinAlgError

        freq = self._params['freq']

        try:

            if data is None:
                raise ValueError('Not enough data to train due to recent change point')

            endog = data[self._imputed_metric]

            index = pd.date_range(start=ts_start, end=ts_end, freq=freq)  # Holidays are always daily.

            de_obj = DataExploration()
            exog_data = de_obj._get_exog_data(ts_start, ts_end, index) if self._params[
                'include_holidays_exog'] else None

            # always run the model first without holiday exogenous variables
            result, order = self._fit(endog=endog, endog_end=ts_end, min_ts_mean=min_ts_mean,
                                      min_ts_mean_window=min_ts_mean_window, include_holidays=include_holidays,
                                      min_ts_length=min_ts_length, max_ft_freq=max_ft_freq, exog_data=exog_data,
                                      optimize=optimize)

            result['training_tail'] = data.loc[:ts_end].values.tolist()[-3:]

        except(LinAlgError, ValueError, LADStructuralError) as e:
            result = {'ErrorMessage': str(e)}
            return result, None

        return result, order
    def _fit(self,
             endog,
             endog_end,
             min_ts_mean,
             min_ts_mean_window,
             include_holidays=False,
             min_ts_length=None,
             max_ft_freq=None,
             exog_data=None,
             optimize=None):
        """
        This function implements the fourier transformation to model the periodicities and implements ARIMA model with
        different order of differencing, MA and AR terms to generate the optimal prediction and anomaly detection.
        :param list endog: A list containing the time series input
        :param str endog_end: pandas datetime containing the last timestamp of the input time series
        :param float raw_actual: Containing the actual value of the execution date
        :param float raw_actual_previous: Containing the actual value of the day before execution date
        :param float interpolated_actual: Containing the interpolated value of the execution date
        :param pandas.Dttaframe pred_instance_date: pandas datetime containing the prediction timestamp
        :param float min_ts_mean: The minimum mean value of the time series required for the model to run. For data that
        originated as integers (such as counts), the ARIMA model can behave erratically when the numbers are small. When
        this parameter is set, any time series whose mean value is less than this will automatically result in a model
        failure, rather than a mostly bogus anomaly.
        :param int min_ts_mean_window: The number of observations (anchored to the end of the time series) to use when
        applying the min_ts_mean rule. Default is None, which performs the calculation on the entire time series.
        :param bool include_holidays: Whether to include holidays as exogenous variables in the regression. Holidays
        are defined in :class:`~luminaire.model.model_utils.LADsHolidays`
        :param int min_ts_length: Specifying the minimum required length of the time series for training
        :param int max_ft_freq: The maximum number of frequencies under consideration for the Fourier transformation.
        :param bool optimize: Flag to identify whether called from hyperparameter optimization
        :return: A dictionary containing the anomaly flag, details of the prediction data (timestamp, raw, interpolated)
        lower and upper bound of the confidence interval, flag whether holidays are included in the model as exogenous
        """
        import numpy as np
        from pykalman import KalmanFilter
        import warnings
        warnings.filterwarnings('ignore')

        p, q = self._params['p'], self._params['q']
        freq = self._params['freq']
        pred_len = self.max_scoring_length
        x_matrix_train = None
        x_matrix_score = None

        # set exogenous (holiday) variables for input data
        if include_holidays and len(endog) + pred_len > 385:
            exog = exog_data.loc[endog.index.min():endog_end]
        else:
            include_holidays = False
            exog = None

        if min_ts_length is not None and len(endog) < min_ts_length:
            raise ValueError(
                'TimeSeries length less than minimum length specified')

        if min_ts_mean is not None:
            if (min_ts_mean_window is not None and endog[-min_ts_mean_window:].fillna(0).mean() < min_ts_mean) or \
                    (min_ts_mean_window is None and endog.fillna(0).mean() < min_ts_mean):
                raise ValueError('Metric values too small to model.')

        # Smoothing the given time series as a pre-processing for modeling seasonalities through Fourier
        # transformation
        kf = KalmanFilter()
        endog_smoothed, filtered_state_covariances = kf.em(endog).smooth(endog)
        endog_smoothed = endog_smoothed[:, 0]

        endog, diff_order, actual_previous_per_diff = DataExploration._stationarizer(
            endog=pd.Series(endog), diff_min=0, diff_max=1, obs_incl=False)
        if diff_order:
            endog_smoothed = np.diff(endog_smoothed)

        if freq == 'D':
            complete_cycle = int(len(endog) / 7)
            endog = endog[-(complete_cycle * 7):]
            endog_smoothed = endog_smoothed[-(complete_cycle * 7):]
        elif freq == 'H':
            complete_cycle = int(len(endog) / 24)
            endog = endog[-(complete_cycle * 24):]
            endog_smoothed = endog_smoothed[-(complete_cycle * 24):]

        exog = exog.iloc[-len(endog):] if exog is not None else None

        if include_holidays:
            exog = exog.loc[:, (exog != 0).any(axis=0)]
            ext_training_features = list(exog.columns)
        else:
            ext_training_features = None

        stepwise_fit = []

        # Updating the user specified maximum number of frequencies to consider for the Fourier transformation
        # based on the length of the smoothed endogenous variable
        max_ft_freq = int(min(max_ft_freq, len(endog_smoothed) / 4))

        # Running the Fourier transformation extrapolating one point ahead in future that is going to be used
        # for predicting

        if max_ft_freq > 0:
            x_matrix = self._fourier_extp(series=endog_smoothed,
                                          max_trun=(2 * max_ft_freq),
                                          forecast_period=pred_len)
            if not optimize and np.all(x_matrix[0] == x_matrix[0][0]):
                x_matrix_train = None
                x_matrix_score = None
                max_ft_freq = 0
            else:
                x_matrix_train = x_matrix[:, :(x_matrix.shape[1] - pred_len)]
                x_matrix_score = x_matrix[:, (x_matrix.shape[1] - pred_len):]

        self._seasonal_arima(endog=endog,
                             exog=exog,
                             p=p,
                             d=0,
                             q=q,
                             imodels=max_ft_freq,
                             include_holidays=include_holidays,
                             ift_matrix=x_matrix_train,
                             stepwise_fit=stepwise_fit,
                             optimize=optimize)
        model = stepwise_fit[0]

        seasonal_feature_scoring = x_matrix_score[0, :].tolist() if x_matrix_score is not None else None

        result = {
            'model': model,
            'diff_order': diff_order,
            'seasonal_feature_scoring': seasonal_feature_scoring,
            'ext_training_features': ext_training_features,
        }

        p_selected = model.k_ar if hasattr(model, 'k_ar') else 0
        d_selected = diff_order
        q_selected = model.k_ma if hasattr(model, 'k_ma') else 0
        order = (p_selected, d_selected, q_selected)

        return result, order
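
# ``_fourier_extp`` is not shown in this excerpt; the sketch below illustrates
# the general technique the docstring describes: keep the dominant Fourier
# frequencies of a smoothed series and extrapolate them forecast_period steps
# ahead. The function and argument names are assumptions for illustration.
import numpy as np

def fourier_extrapolate_sketch(series, max_freq=3, forecast_period=1):
    n = len(series)
    coeffs = np.fft.fft(series)                       # complex spectrum
    freqs = np.fft.fftfreq(n)                         # cycles per sample
    # Indices of the strongest max_freq non-DC frequencies (plus the DC term).
    order = np.argsort(-np.abs(coeffs[1:n // 2])) + 1
    keep = set(order[:max_freq]) | {0}
    t = np.arange(n + forecast_period)
    restored = np.zeros(len(t))
    for k in range(n):
        if k in keep or (n - k) in keep:              # include conjugate pairs
            amp, phase = np.abs(coeffs[k]) / n, np.angle(coeffs[k])
            restored += amp * np.cos(2 * np.pi * freqs[k] * t + phase)
    return restored                                   # history plus extrapolation
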
    def _predict(cls, model, is_log_transformed,
                 raw_actual, interpolated_actual,
                 training_end=None, seasonal_feature_scoring=None, pred_date=None, order_of_diff=None,
                 training_tail=None, ext_training_features=None, pred_len=None, freq=None,
                 include_holidays_exog=None):
        """
        This function performs the prediction and anomaly detection for a given prediction date and a time point

        :param model: Python object of the trained LAD structural model
        :param bool is_log_transformed: Flag for log transformation
        :param float raw_actual: Observed value of the time point
        :param float interpolated_actual: Interpolated value of the time point
        :param str training_end: Last timestamp of the training time series
        :param list seasonal_feature_scoring: Fourier features for scoring
        :param str pred_date: Prediction date
        :param int order_of_diff: Order of differencing applied to handle nonstationarity of the given time series
        :param list training_tail: Latest observed values of the training time series, used as padding for prediction
        :param pandas.DataFrame ext_training_features: External exogenous variables
        :param int pred_len: Length of time for which the prediction needs to be generated
        :param str freq: Frequency of the observed time series
        :param bool include_holidays_exog: Flag to include holidays as exogenous variables in the model
        :return: Model result
        :rtype: dict
        """

        import numpy as np
        import pandas as pd
        import scipy.stats as st
        from numpy.linalg import LinAlgError
        import math

        alpha = cls._sig_level
        alpha_extreme = cls._sig_level_extreme

        include_holidays_exog = include_holidays_exog if ext_training_features else 0

        index = pd.date_range(start=training_end, end=pred_date, freq=freq)[1:]  # Holidays are always daily.

        de_obj = DataExploration()
        pred_exog = de_obj._get_exog_data(pred_date, pred_date, index) if include_holidays_exog else None

        if pred_exog is not None and set(pred_exog.columns.values) != set(ext_training_features):
            missing_col_list = list(set(ext_training_features) - set(pred_exog.columns.values))
            common_cols = list(set(ext_training_features).intersection(set(pred_exog.columns.values)))
            temp_df = pred_exog[common_cols]
            missing_feat_df = pd.DataFrame(np.zeros([len(pred_exog), len(missing_col_list)]),
                                           columns=missing_col_list, index=pred_exog.index.values)
            pred_exog = pd.concat([temp_df, missing_feat_df], axis=1)
            pred_exog = pred_exog[ext_training_features]

        freq = "1" + freq if not any(char.isdigit() for char in freq) else freq

        forecast_ndays = int((pred_date - pd.Timestamp(training_end)) / pd.Timedelta(freq))
        model_freshness = forecast_ndays / float(pred_len)

        try:
            if forecast_ndays > pred_len:
                raise ValueError('Current trained model object expired')

            float_min = 1e-10

            # set exogenous (holiday) variables for input data
            if include_holidays_exog:
                pred_exog = pred_exog.loc[pd.Timestamp(training_end) + pd.Timedelta(freq): pred_date]
            else:
                pred_exog = None

            if seasonal_feature_scoring:
                if not include_holidays_exog:
                    pred_exog = seasonal_feature_scoring[:forecast_ndays]
                else:
                    pred_exog['fourier_feature'] = seasonal_feature_scoring[:forecast_ndays]

            forecast = list(model.forecast(steps=forecast_ndays, alpha=alpha, exog=pred_exog))
            interpolated_training_data = list(zip(*training_tail))[1]

            for order in list(reversed(range(order_of_diff))):
                training_data_diff = np.diff(interpolated_training_data,
                                             order) if order > 0 else interpolated_training_data

                forecast_diff_mean = [training_data_diff[-1]]
                forecast_diff_ci = []

                for i in range(forecast_ndays):
                    forecast_diff_mean.append(forecast_diff_mean[-1] + forecast[0][i])
                    forecast_diff_ci.append([forecast_diff_mean[-1] -
                                             (st.norm.ppf(1 - (alpha / 2.0)) * forecast[1][i]),
                                             forecast_diff_mean[-1] +
                                             (st.norm.ppf(1 - (alpha / 2.0)) * forecast[1][i])])
                forecast[0] = forecast_diff_mean[1:]
                forecast[2] = forecast_diff_ci

            if is_log_transformed:
                transformed_back_forecast = np.exp(forecast[0][-1] + ((forecast[1][-1] ** 2) / 2.0)) - 1
                transformed_back_std_err = np.sqrt((np.exp(forecast[1][-1] ** 2) - 1) *
                                                   np.exp((2 * forecast[0][-1]) + (forecast[1][-1] ** 2)))
                transformed_back_CILower = transformed_back_forecast - \
                                           st.norm.ppf(1 - (alpha / 2.0), 0, transformed_back_std_err) \
                    if transformed_back_std_err != 0 else transformed_back_forecast
                transformed_back_CIUpper = transformed_back_forecast + \
                                           st.norm.ppf(1 - (alpha / 2.0), 0, transformed_back_std_err) \
                    if transformed_back_std_err != 0 else transformed_back_forecast
                transformed_back_interpolated_actual = float(np.exp(interpolated_actual) - 1)
            if np.sum(np.isnan(forecast[0][-1])) or np.isnan(forecast[1][-1]):
                raise ValueError('Predicted null value')

            if is_log_transformed:
                zscore = (transformed_back_interpolated_actual -
                          transformed_back_forecast) / max(float(transformed_back_std_err), float_min)

                anomaly_probability = (2 * st.norm(0, 1).cdf(abs(zscore))) - 1
                if math.isnan(anomaly_probability) or math.isnan(transformed_back_CILower) \
                        or math.isnan(transformed_back_CIUpper):
                    raise ValueError('Either Anomaly probability or CILower or CIUpper is NaN under log transform')
                down_anomaly_probability = 1 - st.norm(0, 1).cdf(zscore)
                up_anomaly_probability = st.norm(0, 1).cdf(zscore)

                result = {'Success': True,
                          'IsLogTransformed': is_log_transformed,
                          'LogTransformedAdjustedActual': interpolated_actual,
                          'LogTransformedPrediction': float(forecast[0][-1]),
                          'LogTransformedStdErr': float(forecast[1][-1]),
                          'LogTransformedCILower': float(forecast[2][-1][0]),
                          'LogTransformedCIUpper': float(forecast[2][-1][1]),
                          'AdjustedActual': transformed_back_interpolated_actual,
                          'Prediction': float(transformed_back_forecast) if not float(
                              transformed_back_forecast) == float('inf') else 0.0,
                          'StdErr': float(transformed_back_std_err) if not float(
                              transformed_back_std_err) == float('inf') else 0.0,
                          'CILower': float(transformed_back_CILower) if not float(
                              transformed_back_CILower) == float('-inf') else 0.0,
                          'CIUpper': float(transformed_back_CIUpper) if not float(
                              transformed_back_CIUpper) == float('inf') else 0.0,
                          'ConfLevel': float(1.0 - alpha) * 100,
                          'ExogenousHolidays': include_holidays_exog,
                          'IsAnomaly': bool(anomaly_probability > 1 - alpha),
                          'IsAnomalyExtreme': bool(anomaly_probability > 1 - alpha_extreme),
                          'AnomalyProbability': 1 if raw_actual is None else float(anomaly_probability),
                          'DownAnomalyProbability': 1 if raw_actual is None else float(down_anomaly_probability),
                          'UpAnomalyProbability': 1 if raw_actual is None else float(up_anomaly_probability),
                          'ModelFreshness': model_freshness}

            else:
                zscore = (interpolated_actual - forecast[0][-1]) / max(float(forecast[1][-1]), float_min)

                anomaly_probability = (2 * st.norm(0, 1).cdf(abs(zscore))) - 1
                if math.isnan(anomaly_probability) or math.isnan(forecast[2][-1][0]) or math.isnan(forecast[2][-1][1]):
                    raise ValueError('Either Anomaly probability or CILower or CIUpper is NaN')

                down_anomaly_probability = 1 - st.norm(0, 1).cdf(zscore)
                up_anomaly_probability = st.norm(0, 1).cdf(zscore)

                result = {'Success': True,
                          'IsLogTransformed': is_log_transformed,
                          'AdjustedActual': interpolated_actual,
                          'Prediction': float(forecast[0][-1]) if not float(
                              forecast[0][-1]) == float('inf') else 0.0,
                          'StdErr': float(forecast[1][-1]) if not float(
                              forecast[1][-1]) == float('inf') else 0.0,
                          'CILower': float(forecast[2][-1][0]) if not float(
                              forecast[2][-1][0]) == float('-inf') else 0.0,
                          'CIUpper': float(forecast[2][-1][1]) if not float(
                              forecast[2][-1][1]) == float('inf') else 0.0,
                          'ConfLevel': float(1.0 - alpha) * 100,
                          'ExogenousHolidays': include_holidays_exog,
                          'IsAnomaly': bool(anomaly_probability > 1 - alpha),
                          'IsAnomalyExtreme': bool(anomaly_probability > 1 - alpha_extreme),
                          'AnomalyProbability': 1 if raw_actual is None else float(anomaly_probability),
                          'DownAnomalyProbability': 1 if raw_actual is None else float(down_anomaly_probability),
                          'UpAnomalyProbability': 1 if raw_actual is None else float(up_anomaly_probability),
                          'ModelFreshness': model_freshness}

        except (LinAlgError, ValueError, LADStructuralError) as e:
            result = {'Success': False,
                      'AdjustedActual': interpolated_actual,
                      'ErrorMessage': str(e)}

        return result
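
# The log-transformed branch above back-transforms the forecast with the
# standard lognormal moments: if log(Y + 1) ~ N(mu, sigma^2), then
# E[Y] = exp(mu + sigma^2 / 2) - 1 and
# SD[Y] = sqrt((exp(sigma^2) - 1) * exp(2 * mu + sigma^2)).
# A quick Monte Carlo check of those identities (synthetic data):
import numpy as np

rng = np.random.default_rng(0)
mu, sigma = 2.0, 0.3
samples = np.exp(rng.normal(mu, sigma, size=1_000_000)) - 1
print(samples.mean(), np.exp(mu + sigma ** 2 / 2) - 1)        # ~ equal
print(samples.std(), np.sqrt((np.exp(sigma ** 2) - 1) * np.exp(2 * mu + sigma ** 2)))
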
    def _get_result(self,
                    input_df=None,
                    detrend_order=None,
                    ma_forecast_adj=None,
                    value_column=None,
                    ma_window_length=None,
                    detrend_method=None,
                    baseline_type=None,
                    detection_method=None,
                    baseline=None,
                    anomaly_scores_mean=None,
                    anomaly_scores_sd=None,
                    training_tail=None):
        """
        The function scores the scoring window for anomalies based on the training metrics and the baseline
        :param pandas.DataFrame input_df: Input data containing the training and the scoring data.
        :param list scoring_window: A list containing the start and the end of the scoring window.
        :param int detrend_order: The order of detrending based on MA or differencing method.
        :param ma_forecast_adj: Adjustment for the forecasting window in case MA based detrending applied.
        :param str value_column: Column containing the values.
        :param int ma_window_length: Length of the moving average window to be used for detrending.
        :param str detrend_method: Selects between "ma" or "diff" detrend method.
        :param str baseline_type: Selects between "aggregated" or "last_window" baseline.
        :param str detection_method: Selects between "kldiv" or "sign_test" distance method.
        :param list baseline: A list storing a baseline window used to score the scoring window.
        :param float anomaly_scores_mean: Mean of the anomaly scores between training sub-windows.
        :param float anomaly_scores_sd: Standard deviation of the anomaly scores between training sub-windows.
        :param float significance_level: Significance level for anomaly detection
        :param list training_tail: Last training sub-window.
        :return: Returns the probability of anomaly with the corresponding anomaly probability.
        :rtype: tuple(bool, float)
        """

        import numpy as np
        import pandas as pd
        import scipy.stats as st

        is_anomaly = False
        execution_data = input_df[value_column]

        if detrend_method == 'diff':
            # Obtain the execution data and perform the necessary differencing
            execution_data = list(execution_data)
            execution_data = np.diff(execution_data, detrend_order).tolist() if detrend_order > 0 \
                else execution_data
        elif detrend_method == 'ma':
            if detrend_order > 0:
                execution_data = execution_data.to_list()
                mock_ma_window_left = np.array(training_tail[-int(ma_window_length / 2.0):]) * ma_forecast_adj
                mock_ma_window_right = np.array(training_tail[:int(ma_window_length / 2.0)]) * ma_forecast_adj
                ma_execution_data = list(mock_ma_window_left) + execution_data + list(mock_ma_window_right)

                de_obj = DataExploration()
                execution_data = de_obj._ma_detrender(
                    series=execution_data,
                    padded_series=ma_execution_data,
                    ma_window_length=ma_window_length)
            else:
                execution_data = list(execution_data)

        # Kl divergence based anomaly detection
        if detection_method == "kldiv":
            current_anomaly_score = self._distance_function(
                data=execution_data, called_for="scoring", baseline=baseline)
            if current_anomaly_score > st.norm.ppf(1 - self.sig_level,
                                                   anomaly_scores_mean,
                                                   anomaly_scores_sd):
                is_anomaly = True

            prob_of_anomaly = st.norm.cdf(current_anomaly_score,
                                          anomaly_scores_mean,
                                          anomaly_scores_sd)
        # Sign test based anomaly detection
        elif detection_method == "sign_test":
            # If last window is the baseline, we perform the Wilcoxon sign rank test for means and levene
            # test for variance to detect anomalies
            if baseline_type == "last_window":
                test_stat_wilcoxon, pvalue_wilcoxon = st.wilcoxon(
                    execution_data, baseline)
                test_stat_levene, pvalue_levene = st.levene(
                    execution_data, baseline)
                if pvalue_wilcoxon < self.sig_level or pvalue_levene < self.sig_level:
                    is_anomaly = True
                prob_of_anomaly = 1 - min(pvalue_wilcoxon, pvalue_levene)
            # If aggregated is the baseline, we perform the Wilcoxon sign rank test for means and gamma distribution
            # based test for the past standard deviations to detect anomalies
            elif baseline_type == "aggregated":
                baseline_mean = np.array(baseline).mean(0).tolist()
                baseline_sds = np.array(baseline).std(1).tolist()
                test_stat_wilcoxon, pvalue_wilcoxon = st.wilcoxon(
                    execution_data, baseline_mean)
                gamma_alpha, gamma_loc, gamma_beta = st.gamma.fit(baseline_sds)
                pvalue_gamma = 1 - st.gamma.cdf(
                    np.std(execution_data), gamma_alpha, gamma_loc, gamma_beta)
                if pvalue_wilcoxon < self.sig_level or pvalue_gamma < self.sig_level:
                    is_anomaly = True
                prob_of_anomaly = 1 - min(pvalue_wilcoxon, pvalue_gamma)

        return is_anomaly, prob_of_anomaly
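
# Illustrative call pattern for the "sign_test" branch above: the Wilcoxon
# signed-rank test catches a level shift and the Levene test a variance shift
# between the scoring window and the baseline. The data here is synthetic.
import numpy as np
import scipy.stats as st

rng = np.random.default_rng(1)
baseline_win = rng.normal(100, 5, 48)
scoring_win = rng.normal(110, 5, 48)                  # injected level shift
_, p_wilcoxon = st.wilcoxon(scoring_win, baseline_win)
_, p_levene = st.levene(scoring_win, baseline_win)
prob_of_anomaly = 1 - min(p_wilcoxon, p_levene)
print(prob_of_anomaly)                                # close to 1 for a real shift
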
    def train(self, data, **kwargs):
        """
        Input time series for training.

        :param pandas.DataFrame data: Input time series.
        :return: Training summary with a success flag.
        :rtype: tuple(bool, python model object)

        >>> data
                                raw interpolated
        index
        2017-10-02 00:00:00  118870       118870
        2017-10-02 01:00:00  121914       121914
        2017-10-02 02:00:00  116097       116097
        2017-10-02 03:00:00   94511        94511
        2017-10-02 04:00:00   68330        68330
        ...                     ...          ...
        2018-10-10 19:00:00  219908       219908
        2018-10-10 20:00:00  219149       219149
        2018-10-10 21:00:00  207232       207232
        2018-10-10 22:00:00  198741       198741
        2018-10-10 23:00:00  213751       213751
        >>> hyper_params = WindowDensityHyperParams(freq='H').params
        >>> wdm_obj = WindowDensityModel(hyper_params=hyper_params)
        >>> success, model = wdm_obj.train(data)

        >>> success, model
        (True, <luminaire.model.window_density.WindowDensityModel object at 0x7fd7c5a34e80>)
        """
        import numpy as np
        import pandas as pd

        freq = self._params['freq']
        min_num_train_windows = self.min_num_train_windows
        max_num_train_windows = self.max_num_train_windows
        ignore_window = self._params['ignore_window']
        min_window_length = self._params['min_window_length']
        max_window_length = self._params['max_window_length']
        window_length = self._params['window_length']
        if freq not in ['S', 'T', '15T', 'H', 'D']:
            if not min_window_length or not max_window_length or not window_length:
                raise ValueError(
                    'Training window length with min and max should be specified in case frequency not in the '
                    'specified list')
        is_log_transformed = self._params['is_log_transformed']
        max_missing_train_prop = self._params['max_missing_train_prop']
        detrend_method = self._params['detrend_method']
        target_metric = 'raw'
        imputed_metric = 'interpolated'
        if freq not in ['S', 'T', '15T', 'H', 'D']:
            detection_method = self._params['detection_method']
            if not detection_method:
                raise ValueError(
                    'Detection method should be specified in case frequency not in the specified list'
                )
            if detrend_method == 'ma':
                ma_window_length = self._params['ma_window_length']
                if not ma_window_length:
                    raise ValueError(
                        'Moving average window length should be specified for detrending for frequency not in the '
                        'specified list')

        if len(data) == 0:
            model = {'ErrorMessage': 'DataFrame length is 0'}
            success = False
            return success, WindowDensityModel(**model)

        if np.sum(np.isnan(data[target_metric])) > max_missing_train_prop:
            raise ValueError(
                'Too few observed data in the training time series')
        else:
            de_obj = DataExploration()
            df = de_obj._kalman_smoothing_imputation(
                df=data,
                target_metric=target_metric,
                imputed_metric=imputed_metric)

        # Shift the interpolated value by +1 and get the log. This handles values with 0.
        if is_log_transformed:
            neg_flag = not data[data[target_metric] < 0].empty
            df[imputed_metric] = df[imputed_metric] if neg_flag else np.log(df[imputed_metric] + 1)

        past_anomaly_scores, anomaly_scores_mean, anomaly_scores_sd, detrend_order, baseline, ma_forecast_adj, \
        training_tail, training_start, training_end = self._call_training(df=df, window_length=window_length,
                                                                          min_window_length=min_window_length,
                                                                          max_window_length=max_window_length,
                                                                          min_num_train_windows=min_num_train_windows,
                                                                          max_num_train_windows=max_num_train_windows,
                                                                          ignore_window=ignore_window,
                                                                          imputed_metric=imputed_metric,
                                                                          detrend_method=detrend_method, **kwargs)

        success = True
        self.hyper_params['is_log_transformed'] = is_log_transformed
        model = {
            'ModelInstanceTimestamp': pd.Timestamp(training_end).time().strftime('%H:%M:%S'),
            'TrainingStartDate': training_start,
            'TrainingEndDate': training_end,
            'PastAnomalyScores': past_anomaly_scores,
            'AnomalyScoresMean': float(anomaly_scores_mean) if anomaly_scores_mean else None,
            'AnomalyScoresSD': float(anomaly_scores_sd) if anomaly_scores_sd else None,
            'NonStationarityOrder': detrend_order,
            'Baseline': baseline,
            'MovAvgForecastAdj': ma_forecast_adj,
            'TrainingTail': training_tail
        }

        return success, WindowDensityModel(hyper_params=self.hyper_params,
                                           **model)
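
# The +1 shift before taking the log in ``train`` (np.log(x + 1)) keeps zero
# values finite and is equivalent to np.log1p(x), which is also numerically
# safer for values near zero:
import numpy as np

x = np.array([0.0, 1e-12, 10.0])
print(np.log(x + 1), np.log1p(x))                     # log1p avoids the 1 + x rounding error
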
    def _get_model(self,
                   input_df=None,
                   training_window=None,
                   window_length=None,
                   ignore_window=None,
                   value_column=None,
                   ma_window_length=None,
                   detrend_method=None,
                   baseline_type=None,
                   detection_method=None):
        """
        This function runs the training process given the input parameters.
        :param pandas.DataFrame input_df: Input data containing the training and the scoring data.
        :param list training_window: A list containing the start and the end of the training window.
        :param int window_length: The length of a training sub-window / scoring window.
        :param int ignore_window: ignore a time window to be considered for training.
        :param str value_column: Column containing the values.
        :param int ma_window_length: Length of the moving average window to be used for detrending.
        :param str detrend_method: Selects between "ma" or "diff" detrend method.
        :param str baseline_type: Selects between "aggregated" or "last_window" baseline.
        :param str detection_method: Selects between "kldiv" or "sign_test" distance method.
        :return: Returns past anomaly scores based on training data, baseline and other related metrics.
        :rtype: tuple(list, float, float, int, list, float)
        """
        import numpy as np
        import pandas as pd
        from itertools import chain

        # Obtaining and slicing the training data based on the window size

        training_df = input_df[pd.to_datetime(training_window[0]):pd.to_datetime(training_window[1])]

        training_data = list(training_df[value_column])

        de_obj = DataExploration()
        sliced_training_data = de_obj._partition(training_data, window_length)
        sliced_training_data_normal = []

        # Cleaning the training data given an externally specified bad training sub-window
        for i in range(len(sliced_training_data)):
            if ignore_window is None or (i + 1) not in ignore_window:
                sliced_training_data_normal.append(sliced_training_data[i])

        de_obj = DataExploration()

        # performing the stationarity test
        sliced_training_data_cleaned, detrend_order, ma_forecast_adj = de_obj._detrender(
            training_data_sliced=sliced_training_data_normal,
            ma_window_length=ma_window_length,
            significance_level=0.05,
            detrend_method=detrend_method,
            train_subwindow_len=window_length)

        # Obtain the past anomaly scores and the anomaly means and standard deviation if the detection method
        # is KL divergence
        if detection_method == "kldiv":
            past_anomaly_scores = np.array(self._distance_function(data=sliced_training_data_cleaned,
                                                                   called_for="training"))
            past_anomaly_scores = past_anomaly_scores[past_anomaly_scores < np.percentile(
                past_anomaly_scores, 90, interpolation='midpoint')].tolist()
            anomaly_scores_mean = np.mean(past_anomaly_scores)
            anomaly_scores_sd = np.std(past_anomaly_scores, ddof=1)
        else:
            past_anomaly_scores, anomaly_scores_mean, anomaly_scores_sd = None, None, None

        # If aggregated baseline type is specified, we take the whole training window as a baseline, else we
        # take the last training sub window from the sliced training data
        if baseline_type == "aggregated":
            sliced_training_data_cleaned = self._training_data_truncation(
                sliced_training_data=sliced_training_data_cleaned)
            if detection_method == "kldiv":
                baseline = list(
                    chain.from_iterable(sliced_training_data_cleaned))
            elif detection_method == "sign_test":
                baseline = sliced_training_data_cleaned
        elif baseline_type == "last_window":
            baseline = sliced_training_data_cleaned[-1]

        return past_anomaly_scores, anomaly_scores_mean, anomaly_scores_sd, detrend_order, baseline, ma_forecast_adj
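
# The "kldiv" path above reduces to a Gaussian tail test on past anomaly
# scores: trim the top decile, fit a mean and standard deviation, and flag any
# new score beyond the (1 - sig_level) quantile. Synthetic illustration:
import numpy as np
import scipy.stats as st

rng = np.random.default_rng(2)
past_scores = rng.normal(0.2, 0.05, 200)
trimmed = past_scores[past_scores < np.percentile(past_scores, 90, interpolation='midpoint')]
score_mean, score_sd = np.mean(trimmed), np.std(trimmed, ddof=1)
new_score = 0.45
is_anomaly = new_score > st.norm.ppf(1 - 0.05, score_mean, score_sd)
print(is_anomaly, st.norm.cdf(new_score, score_mean, score_sd))   # flag and its probability
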
    def _objective_part(self, data, smoothed_series, args):
        """
        This is the objective function that outputs the loss for a given set of hyperparameters for optimization
        through hyperopt

        :param pandas.DataFrame data: Input time series data
        :param list smoothed_series: Input time series after smoothing
        :param tuple args: Remaining hyperparameters under evaluation (is_log_transformed, data_shift_truncate,
        fill_rate, model specification)
        :return: Loss based on observed (synthetic) and predicted anomalies, with a status flag
        :rtype: dict

        >>> data
                          raw
        2016-01-02  1753421.0
        2016-01-03  1879108.0
        2016-01-04  1462725.0
        2016-01-05  1525162.0
        2016-01-06  1424264.0
        ...               ...
        2018-10-24  1726884.0
        2018-10-25  1685651.0
        2018-10-26  1632952.0
        2018-10-27  1850912.0
        2018-10-28  2021929.0

        >>> self._objective_part(data, smoothed_series, args)
        {'loss': 0.3917824074074072, 'status': 'ok'}
        """

        import numpy as np
        import pandas as pd
        from sklearn.metrics import log_loss
        import copy

        is_log_transformed = args[0]
        data_shift_truncate = args[1]
        fill_rate = args[2]

        # Getting hyperparameters for lad structural model
        if args[3]['model'] == 'LADStructuralModel':
            max_ft_freq = args[3]['param']['max_ft_freq']
            include_holidays_exog = args[3]['param']['include_holidays_exog']
            p = args[3]['param']['p']
            q = args[3]['param']['q']

        ts_start = data.index.min()
        ts_end = data.index.max()
        max_ts_length = self.max_ts_length
        min_ts_length = self.min_ts_length
        freq = self.freq
        scoring_length = self.scoring_length

        train_end = (
            pd.Timestamp(ts_end) -
            pd.Timedelta("{}".format(scoring_length) + freq)).to_pydatetime()
        score_start = (pd.Timestamp(train_end) +
                       pd.Timedelta("1" + freq)).to_pydatetime()

        training_data = data.loc[ts_start:train_end]
        scoring_data = data.loc[score_start:ts_end]

        try:
            # Required data preprocessing before training and scoring
            de_obj = DataExploration(freq=self.freq,
                                     min_ts_length=self.min_ts_length,
                                     min_ts_mean=self.min_ts_mean,
                                     max_ts_length=self.max_ts_length,
                                     is_log_transformed=is_log_transformed,
                                     data_shift_truncate=data_shift_truncate,
                                     detection_type=self.detection_type,
                                     fill_rate=fill_rate)
            training_data, preprocess_summary = de_obj.profile(
                df=training_data)

            is_log_transformed = preprocess_summary['is_log_transformed']

            # Getting De-noised smoothed series for generating synthetic anomalies
            smoothed_scoring_series = smoothed_series[-len(scoring_data):]

            labels = []
            probs = []

            if args[3]['model'] == 'LADStructuralModel':
                # LAD structural training and scoring
                hyper_params = LADStructuralHyperParams(
                    is_log_transformed=is_log_transformed,
                    max_ft_freq=max_ft_freq,
                    include_holidays_exog=include_holidays_exog,
                    p=p,
                    q=q)
                lad_struct = LADStructuralModel(hyper_params.params,
                                                max_ts_length=max_ts_length,
                                                min_ts_length=min_ts_length,
                                                freq=freq)
                success, model_date, model = lad_struct.train(
                    data=training_data, optimize=True, **preprocess_summary)

                scr_idx = 0

                obs = []
                preds = []
                # Scoring and anomaly classification for synthetic anomalies
                for i, row in scoring_data.iterrows():
                    observed_value = row.raw
                    obs.append(observed_value)
                    result = model.score(observed_value, i)
                    prediction = result['Prediction']
                    preds.append(prediction)
                    std_error = result['StdErr']
                    observation = smoothed_scoring_series[scr_idx]
                    scr_idx = scr_idx + 1
                    anomaly_flags, anomaly_probabilities = self._synthetic_anomaly_check(
                        prediction=prediction,
                        std_error=std_error,
                        observation=observation)
                    labels = labels + anomaly_flags
                    probs = probs + anomaly_probabilities

                mdape = self._mdape(obs, preds)
            elif args[3]['model'] == 'LADFilteringModel':
                # LAD filtering training and scoring
                hyper_params = LADFilteringHyperParams(
                    is_log_transformed=is_log_transformed)
                lad_filtering = LADFilteringModel(hyper_params.params,
                                                  max_ts_length=max_ts_length,
                                                  min_ts_length=min_ts_length,
                                                  freq=freq)

                success, model_date, stable_model = lad_filtering.train(
                    training_data, **preprocess_summary)
                # Scoring and anomaly classification for synthetic anomalies
                for prop in self.anomaly_intensity_list:
                    anomaly_flags_list = []
                    anomaly_probabilities_list = []
                    local_model = copy.deepcopy(stable_model)
                    for i, row in scoring_data.iterrows():
                        trial_prob = np.random.uniform(0, 1, 1)
                        observed_value = row.raw
                        synthetic_actual = observed_value
                        if trial_prob < 0.4:
                            synthetic_actual = observed_value + (
                                prop * observed_value)
                            anomaly_flags_list.append(1)
                        else:
                            anomaly_flags_list.append(0)

                        result, local_model = local_model.score(
                            observed_value=observed_value,
                            pred_date=i,
                            synthetic_actual=synthetic_actual)
                        anomaly_probabilities_list.append(
                            result['AnomalyProbability'])

                    labels = labels + anomaly_flags_list
                    probs = probs + anomaly_probabilities_list

            weights = ((1 - np.array(labels)) + 1) / float(len(labels))
            if args[3]['model'] == 'LADStructuralModel' and mdape:
                cost = (0.5 * mdape) + (
                    0.5 * log_loss(labels, probs, sample_weight=weights))
            else:
                cost = log_loss(labels, probs, sample_weight=weights)

        except Exception:
            return {'loss': 1e100, 'status': STATUS_OK}

        return {'loss': cost, 'status': STATUS_OK}
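
# The cost above uses a class-weighted log loss: synthetic anomalies (label 1)
# receive weight 1/n and normal points weight 2/n, down-weighting the injected
# positives. A minimal reproduction of that weighting on toy values:
import numpy as np
from sklearn.metrics import log_loss

labels = np.array([0, 0, 1, 0, 1])
probs = np.array([0.1, 0.2, 0.8, 0.1, 0.6])
weights = ((1 - labels) + 1) / float(len(labels))     # 2/n for normals, 1/n for anomalies
print(log_loss(labels, probs, sample_weight=weights))
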
    def _training(self, data, **kwargs):
        """
        This function implements Kalman filter based estimation algorithm over a Markovian State Space model and
        analyzes the residual process of the model with respect to a Gaussian process to perform anomaly detection
        :param pandas.DataFrame data: Input time seires to analyze for anomaly
        :param float sig_level: Significance level to be considered for anomaly detection based on the Gaussian process
        :return: A tuple containing a flag whether the datapoint on the given date is an anomnaly, the prediction and
        the standard error of prediction
        """

        import numpy as np
        from pykalman import KalmanFilter
        from numpy.linalg import LinAlgError

        if data is None:
            raise ValueError(
                'Not enough data to train due to recent change point')

        data = data[self._imputed_metric]

        last_data_points = data[-2:].values.tolist()

        try:
            data_dim = 1
            transition_matrix = [[1]]

            de_obj = DataExploration()
            endog, diff_order, actual_previous_per_diff = de_obj._stationarizer(
                data)

            kf = KalmanFilter(transition_matrices=transition_matrix,
                              initial_state_mean=np.zeros(data_dim),
                              n_dim_obs=data_dim)

            # Obtaining the hidden states and their covariance based on the Kalman Filter algorithm
            filtered_state_means, filtered_state_covariance = kf.em(
                endog).filter(endog)

            # Obtaining the observation matrix, transition covariance and the observation covariance
            observation_matrix = kf.observation_matrices
            transition_covariance = kf.transition_covariance
            observation_covariance = kf.observation_covariance

            prior_pred, pred_covariance, kalman_gain \
                = self._prediction_summary(state_mean=filtered_state_means[:, 0][-1],
                                           state_covariance=filtered_state_covariance[-1, :, :],
                                           observation_covariance=observation_covariance,
                                           transition_covariance=transition_covariance,
                                           observation_matrix=observation_matrix,
                                           transition_matrix=transition_matrix)

            result = {
                'model': kf,
                'state_mean': float(filtered_state_means[:, 0][-1]),
                'state_covariance':
                filtered_state_covariance[-1, :, :].tolist(),
                'transition_matrix': transition_matrix,
                'prior_pred': float(prior_pred),
                'pred_covariance': pred_covariance.tolist(),
                'kalman_gain': kalman_gain.tolist(),
                'diff_order': diff_order,
                'last_data_points': last_data_points
            }

        except (LinAlgError, ValueError, LADFilteringModelError) as e:
            result = {'ErrorMessage': str(e)}

        return result
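
# ``_prediction_summary`` is not shown in this excerpt; the sketch below gives
# the standard one-step-ahead Kalman equations it plausibly computes (an
# assumption for illustration, not Luminaire's exact code).
import numpy as np

def prediction_summary_sketch(x, P, A, Q, H, R):
    x_prior = A @ x                                   # predicted state mean
    P_prior = A @ P @ A.T + Q                         # predicted state covariance
    S = H @ P_prior @ H.T + R                         # innovation covariance (prediction uncertainty)
    K = P_prior @ H.T @ np.linalg.inv(S)              # Kalman gain
    return H @ x_prior, S, K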