def windowed_gaussian(ts_obj, gaussian_window_size, step_size, plot_anomaly_score=False):
    if ts_obj.miss:
        raise ValueError("Missing time steps. Cannot use Windowed Gaussian.")
    start = time.time()

    anomaly_scores = ah.determine_anomaly_scores_error(
        ts_obj.dataframe["value"].values, [0] * ts_obj.get_length(),
        ts_obj.get_length(), gaussian_window_size, step_size=step_size)

    # TODO: confirm whether this NaN sanitization is still needed:
    # anomaly_scores = np.nan_to_num(anomaly_scores)

    end = time.time()

    if plot_anomaly_score:
        plt.subplot(211)
        plt.title("Anomaly Scores")
        plt.plot(anomaly_scores)
        plt.ylim([.99,1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)   
        plt.axvline(ts_obj.get_probationary_index(), color="black", label="probationary line")
        plt.tight_layout()
        plt.show()

    return {"Anomaly Scores": anomaly_scores,
            "Time": end - start}
def matrixprofile(ts_obj,
                  subseq_len,
                  gaussian_window_size,
                  step_size,
                  plot_matrixprofile=False,
                  plot_anomaly_score=False):
    start = time.time()

    # see line 53 of:
    # https://github.com/target/matrixprofile-ts/blob/master/matrixprofile/matrixProfile.py
    if ts_obj.miss:
        ref_date_range = ch.get_ref_date_range(ts_obj.dataframe,
                                               ts_obj.dateformat,
                                               ts_obj.timestep)
        gaps = ref_date_range[~ref_date_range.isin(ts_obj.
                                                   dataframe["timestamp"])]
        filled_df = ch.fill_df(ts_obj.dataframe, ts_obj.timestep,
                               ref_date_range, "fill_nan")
        # print("NaNs exist?: ",filled_df['value'].isnull().values.any())
        matrix_profile = matrixProfile.stamp(filled_df["value"].values,
                                             subseq_len)
    else:
        matrix_profile = matrixProfile.stamp(ts_obj.dataframe["value"].values,
                                             subseq_len)

    # append NaNs to the matrix profile so it can be plotted against the raw
    # data (stamp's profile is subseq_len - 1 shorter than the series)
    matrix_profile = np.append(matrix_profile[0],
                               np.full(subseq_len - 1, np.nan))

    anomaly_scores = ah.determine_anomaly_scores_error(
        matrix_profile, np.zeros_like(matrix_profile), ts_obj.get_length(),
        gaussian_window_size, step_size)

    end = time.time()

    if plot_matrixprofile:
        plt.subplot(211)
        plt.title("Matrix Profile")
        plt.plot(matrix_profile)
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    if plot_anomaly_score:
        plt.subplot(211)
        plt.title("Anomaly Scores")
        plt.plot(anomaly_scores)
        plt.ylim([.99, 1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    return {
        'Anomaly Scores': anomaly_scores,
        'Time': end - start,
        "Matrix Profile": matrix_profile
    }
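
# A self-contained example of the matrixProfile.stamp call used above, on
# synthetic data (assumes the matrixprofile-ts package; stamp returns a
# (profile, index) pair, and the NaN padding mirrors the function above):
import numpy as np
from matrixprofile import matrixProfile

rng = np.random.default_rng(0)
ts = np.sin(np.linspace(0, 20 * np.pi, 1000)) + rng.normal(0, 0.1, 1000)
ts[500:520] += 3  # inject an anomaly

subseq_len = 50
profile, profile_index = matrixProfile.stamp(ts, subseq_len)
profile = np.append(profile, np.full(subseq_len - 1, np.nan))
print(int(np.nanargmax(profile)))  # discord sits near the injected anomaly
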
Example #3
                    # fragment: body of the STL rescoring loop; the enclosing
                    # loops over f, score_number, gaussian_window_size and the
                    # STL parameters (swindow, sdegree, twindow, tdegree,
                    # inner, outer) are omitted from this snippet
                    name = ts.name
                    result_128 = joblib.load("stl_scores_" +
                                             str(score_number) +
                                             "_gaussian_window_128_" + name +
                                             "_swindow_" + str(swindow) +
                                             "_sdegree_" + str(sdegree) +
                                             "_twindow_" + str(twindow) +
                                             "_tdegree_" + str(tdegree) +
                                             "_inner_" + str(inner) +
                                             "_outer_" + str(outer))

                    stl_remainder = list(result_128["STL Remainder"])
                    actual = list(ts.dataframe["value"].values)
                    step_size = int(gaussian_window_size / 2)
                    anomaly_scores = ah.determine_anomaly_scores_error(
                        stl_remainder, [0] * ts.get_length(), ts.get_length(),
                        gaussian_window_size, step_size)

                    result_dict = {
                        "Anomaly Scores": anomaly_scores,
                        "Time": result_128["Time"],
                        "STL Remainder": stl_remainder
                    }

                    joblib.dump(
                        result_dict, "stl_scores_" + str(score_number) +
                        "_gaussian_window_" + str(gaussian_window_size) + "_" +
                        name + "_swindow_" + str(swindow) + "_sdegree_" +
                        str(sdegree) + "_twindow_" + str(twindow) +
                        "_tdegree_" + str(tdegree) + "_inner_" + str(inner) +
                        "_outer_" + str(outer))
                # fragment: body of the VAE rescoring loop; the enclosing
                # loops over f, score_number, window_size and
                # gaussian_window_size are omitted from this snippet
                mcmc_iteration = 10
                latent_dim = 5

                name = ts.name
                result_128 = joblib.load("vae_scores_" + str(score_number) +
                                         "_gaussian_window_128_" + name +
                                         "_window_size_" + str(window_size) +
                                         "_mcmc_iteration_" +
                                         str(mcmc_iteration) + "_latent_dim_" +
                                         str(latent_dim))

                reconstruction_probabilities = list(
                    result_128["Reconstruction Probabilities"])
                step_size = int(gaussian_window_size / 2)
                anomaly_scores = ah.determine_anomaly_scores_error(
                    reconstruction_probabilities,
                    np.zeros_like(reconstruction_probabilities),
                    ts.get_length(), gaussian_window_size, step_size)

                result_dict = {
                    "Anomaly Scores": anomaly_scores,
                    "Time": result_128["Time"],
                    "Reconstruction Probabilities":
                    reconstruction_probabilities
                }

                joblib.dump(
                    result_dict,
                    "vae_scores_" + str(score_number) + "_gaussian_window_" +
                    str(gaussian_window_size) + "_" + name + "_window_size_" +
                    str(window_size) + "_mcmc_iteration_" +
                    str(mcmc_iteration) + "_latent_dim_" + str(latent_dim))
for f in listdir(mypath):
    for score_number in range(0, num_scores):
        for gaussian_window_size in gaussian_window_sizes:
            if "ts_object" in f:
                ts = joblib.load(mypath + f)

                if ts.seasonality:
                    subseq_len = ts.period
                else:
                    subseq_len = 100
                if subseq_len < 5:
                    subseq_len = 100
                name = ts.name
                result_128 = joblib.load("matrix_profile_scores_" +
                                         str(score_number) +
                                         "_gaussian_window_128_" + name +
                                         "_subseq_len_" + str(subseq_len))

                matrix_profile = list(result_128["Matrix Profile"])
                step_size = int(gaussian_window_size / 2)

                anomaly_scores = ah.determine_anomaly_scores_error(
                    matrix_profile, np.zeros_like(matrix_profile),
                    ts.get_length(), gaussian_window_size, step_size)

                result_dict = {"Anomaly Scores": anomaly_scores,
                               "Time": result_128["Time"],
                               "Matrix Profile": matrix_profile}

                joblib.dump(result_dict, "matrix_profile_scores_" +
                            str(score_number) + "_gaussian_window_" +
                            str(gaussian_window_size) + "_" + name +
                            "_subseq_len_" + str(subseq_len))
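
# All of these rescoring scripts follow the same pattern: load the cached
# result computed with gaussian_window 128, reuse its expensive intermediate
# (matrix profile, STL remainder, forecast, ...), recompute only the cheap
# anomaly-score step for the new window size, and dump under a filename keyed
# by the new parameters. A hypothetical generic version (the names rescore,
# prefix and intermediate_key are illustrative; joblib, np and ah as imported
# for the snippets above):
def rescore(prefix, score_number, name, suffix, intermediate_key,
            gaussian_window_size, ts_length):
    cached = joblib.load(prefix + str(score_number) +
                         "_gaussian_window_128_" + name + suffix)
    intermediate = list(cached[intermediate_key])
    step_size = int(gaussian_window_size / 2)
    anomaly_scores = ah.determine_anomaly_scores_error(
        intermediate, np.zeros_like(intermediate), ts_length,
        gaussian_window_size, step_size)
    joblib.dump({"Anomaly Scores": anomaly_scores,
                 "Time": cached["Time"],
                 intermediate_key: intermediate},
                prefix + str(score_number) + "_gaussian_window_" +
                str(gaussian_window_size) + "_" + name + suffix)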
                # fragment: body of the SARIMAX rescoring loop; the enclosing
                # loops over f, score_number, gaussian_window_size and the
                # all_datasets / dataset_index bookkeeping are omitted
                ts = joblib.load(mypath + f)
                if ts.name == all_datasets[dataset_index]:
                    name = ts.name

                    result_128 = joblib.load("sarimax_scores_" +
                                             str(score_number) +
                                             "_gaussian_window_128_" + name +
                                             "_step_size_" + str(64))

                    forecast = list(result_128["Forecast"])
                    actual = list(ts.dataframe["value"].values)

                    step_size = int(gaussian_window_size / 2)

                    anomaly_scores = ah.determine_anomaly_scores_error(
                        actual, forecast, len(forecast), gaussian_window_size,
                        step_size)
                    # plt.plot(anomaly_scores)
                    # plt.show()

                    result_dict = {
                        "Anomaly Scores": anomaly_scores,
                        "Time": result_128["Time"],
                        "Predictions": forecast
                    }

                    joblib.dump(
                        result_dict, "sarimax_scores_" + str(score_number) +
                        "_gaussian_window_" + str(gaussian_window_size) + "_" +
                        name + "_step_size_" + str(step_size))
mypath = "../jair_work_step_one_determine_characteristics/"
for f in listdir(mypath):
    for score_number in range(0, num_scores):
        for gaussian_window_size in gaussian_window_sizes:
            if "ts_object" in f:
                ts = joblib.load(mypath + f)

                # row of the grid-search results with the lowest RMSE for this series
                best_df = glim_grid_search_df.loc[[glim_grid_search_df.loc[
                    glim_grid_search_df["TS Name"] == ts.name,
                    'RMSE'].idxmin()]]
                lambda_ = best_df["Lambda"].values[0]
                eta = best_df["Eta"].values[0]
                family = best_df["Family"].values[0]
                name = ts.name

                result_128 = joblib.load("glim_scores_" + str(score_number) +
                                         "_gaussian_window_128_" + name +
                                         "_lambda_" + str(lambda_) + "_eta_" +
                                         str(eta) + "_family_" + family)

                predictions = list(result_128["Predictions"])
                actual = list(ts.dataframe["value"].values)
                step_size = int(gaussian_window_size / 2)
                anomaly_scores = ah.determine_anomaly_scores_error(
                    actual, predictions, ts.get_length(),
                    gaussian_window_size, step_size)

                result_dict = {"Anomaly Scores": anomaly_scores,
                               "Time": result_128["Time"],
                               "Predictions": predictions}

                joblib.dump(result_dict, "glim_scores_" + str(score_number) +
                            "_gaussian_window_" + str(gaussian_window_size) +
                            "_" + name + "_lambda_" + str(lambda_) + "_eta_" +
                            str(eta) + "_family_" + family)
Example #8
def stl(ts_obj,
        gaussian_window_size,
        step_size,
        swindow,
        sdegree,
        twindow,
        tdegree,
        inner,
        outer,
        grid_search_mode=False,
        plot_components=False,
        plot_anomaly_score=False):
    # this method can handle missing time steps
    if ts_obj.get_period() < 4:
        raise ValueError("STL requires a period (n_periods) of at least 4.")
    start = time.time()

    if ts_obj.miss:
        ref_date_range = ch.get_ref_date_range(ts_obj.dataframe,
                                               ts_obj.get_dateformat(),
                                               ts_obj.get_timestep())
        nan_data = ch.fill_df(ts_obj.dataframe,
                              ts_obj.get_timestep(),
                              ref_date_range,
                              method="fill_nan")

        ts_values = nan_data["value"].values
        ts_timestamps = list(nan_data["timestamp"].values)
        ts_timestamps = np.array([str(item) for item in ts_timestamps])

        result = stlplus(ts_values, ts_timestamps, ts_obj.get_period(),
                         swindow, sdegree, twindow, tdegree, inner, outer)
        stl_remainder = list(list(result)[0]["remainder"])

        nan_data["stl remainder"] = stl_remainder
        nan_data = nan_data.dropna()
        stl_remainder = nan_data["stl remainder"].values

    else:

        ts_values = ts_obj.dataframe["value"].values
        ts_timestamps = list(ts_obj.dataframe["timestamp"].values)
        ts_timestamps = np.array([str(item) for item in ts_timestamps])
        result = stlplus(ts_values, ts_timestamps, ts_obj.get_period(),
                         swindow, sdegree, twindow, tdegree, inner, outer)
        stl_remainder = list(result)[0]["remainder"]

    if list(stl_remainder).count(0) >= int(.9 * ts_obj.get_length()):
        raise ValueError("Remainders are mostly zero")

    if grid_search_mode:
        if plot_components:
            # print(list(result)[0]["remainder"].values)
            plt.subplot(311)
            plt.title("Seasonality")
            plt.plot(list(result)[0]["seasonal"].values)
            plt.subplot(312)
            plt.title("Trend")
            plt.plot(list(result)[0]["trend"].values)
            plt.subplot(313)
            plt.title("remainder")
            plt.plot(list(result)[0]["remainder"].values)
            plt.show()
        the_sum = sum(abs(remainder) for remainder in stl_remainder)
        print("Sum of STL Remainders: ", the_sum)
        return the_sum

    anomaly_scores = ah.determine_anomaly_scores_error(
        stl_remainder, [0] * ts_obj.get_length(), ts_obj.get_length(),
        gaussian_window_size, step_size)

    end = time.time()

    if plot_components:
        # print(list(result)[0]["remainder"].values)
        plt.subplot(311)
        plt.title("Seasonality")
        plt.plot(list(result)[0]["seasonal"].values)
        plt.subplot(312)
        plt.title("Trend")
        plt.plot(list(result)[0]["trend"].values)
        plt.subplot(313)
        plt.title("remainder")
        plt.plot(list(result)[0]["remainder"].values)
        plt.tight_layout()
        plt.show()

    if plot_anomaly_score:
        plt.subplot(211)
        plt.title("Anomaly Scores")
        plt.plot(anomaly_scores)
        plt.ylim([.99, 1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    return {
        "Anomaly Scores": anomaly_scores,
        "Time": end - start,
        "STL Remainder": stl_remainder
    }
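
# stl() above relies on the stlplus package. As a self-contained illustration
# of the same remainder-based idea, statsmodels' STL (a different
# implementation, swapped in here only for the sketch) produces a remainder
# that can be scored the same way:
import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import STL

rng = np.random.default_rng(1)
idx = pd.date_range("2020-01-01", periods=400, freq="D")
series = pd.Series(10 * np.sin(2 * np.pi * np.arange(400) / 7) +
                   rng.normal(0, 1, 400), index=idx)
series.iloc[200] += 15  # inject an anomaly

res = STL(series, period=7).fit()
remainder = res.resid.values  # analogous to stl_remainder above
print(int(np.argmax(np.abs(remainder))))  # ~200
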
def fbprophet(ts_obj,
              gaussian_window_size,
              step_size,
              changepoint_prior_scale=.05,
              growth='linear',
              yearly_seasonality='auto',
              weekly_seasonality='auto',
              daily_seasonality='auto',
              holidays=None,
              seasonality_mode='additive',
              seasonality_prior_scale=10,
              holidays_prior_scale=10,
              plot_anomaly_score=False,
              plot_forecast=False,
              grid_search_mode=False):

    start = time.time()

    fb_prophet_model = Prophet(changepoint_prior_scale=changepoint_prior_scale,
                               growth=growth,
                               yearly_seasonality=yearly_seasonality,
                               weekly_seasonality=weekly_seasonality,
                               daily_seasonality=daily_seasonality,
                               holidays=holidays,
                               seasonality_mode=seasonality_mode,
                               seasonality_prior_scale=seasonality_prior_scale,
                               holidays_prior_scale=holidays_prior_scale)
    if ts_obj.miss:
        # https://facebook.github.io/prophet/docs/outliers.html
        # Prophet has no problem with missing data:
        # set the missing values to NaN in the training data,
        # but LEAVE the dates in the prediction frame
        ref_date_range = ch.get_ref_date_range(ts_obj.dataframe,
                                               ts_obj.dateformat,
                                               ts_obj.timestep)
        data_copy = copy.deepcopy(ts_obj.dataframe)
        data_copy["timestamp"] = pd.to_datetime(data_copy["timestamp"],
                                                format=ts_obj.dateformat)
        data_copy.set_index('timestamp', inplace=True)
        data_copy = data_copy.reindex(ref_date_range, fill_value=np.nan)
        # use entire time series for training
        counts = [i for i in range(len(data_copy))]
        fb_df_train = pd.DataFrame({
            "count": counts,
            "ds": ref_date_range,
            "y": data_copy["value"]
        })
    else:
        # use entire time series for training
        fb_df_train = pd.DataFrame({
            "ds": ts_obj.dataframe["timestamp"],
            "y": ts_obj.dataframe["value"]
        })

    fb_prophet_model.fit(fb_df_train, verbose=False)

    # periods = how far to extend the forecast beyond the training data
    # (this is unrelated to seasonal periodicity)
    future = fb_prophet_model.make_future_dataframe(periods=0,
                                                    freq=ts_obj.timestep)
    # make a forecast over the entire time series
    fcst = fb_prophet_model.predict(future)

    predictions = fcst["yhat"].values

    # get RMSE
    if grid_search_mode:
        if ts_obj.miss:
            # remove the predictions from missing time steps
            inds = fb_df_train.loc[
                pd.isna(fb_df_train["y"]), :]["count"].values
            # print(inds)
            nonmissing_predictions = [
                predictions[i] for i in range(len(predictions))
                if i not in inds
            ]
            rmse = mean_squared_error(ts_obj.dataframe["value"].values,
                                      nonmissing_predictions,
                                      squared=False)
            print("RMSE: ", rmse)

        else:
            rmse = mean_squared_error(ts_obj.dataframe["value"].values,
                                      predictions,
                                      squared=False)
            print("RMSE: ", rmse)
        return rmse

    # get anomaly scores
    else:

        if ts_obj.miss:
            # you HAVE to interpolate to get a gaussian window
            new_ts_obj = copy.deepcopy(ts_obj)
            new_ts_obj.set_miss(fill=True)
            actual = list(new_ts_obj.dataframe["value"])
        else:
            actual = ts_obj.dataframe["value"]

        anomaly_scores = ah.determine_anomaly_scores_error(
            actual, predictions, ts_obj.get_length(), gaussian_window_size,
            step_size)

        end = time.time()

        if plot_forecast:
            plt.plot([i for i in range(len(fcst))], fcst["yhat"])
            plt.fill_between([i for i in range(len(fcst))],
                             fcst["yhat_lower"],
                             fcst["yhat_upper"],
                             facecolor='blue',
                             alpha=.3)
            if ts_obj.miss:
                plt.plot([i for i in range(len(predictions))],
                         data_copy["value"],
                         alpha=.5)
            else:
                plt.plot([i for i in range(len(predictions))],
                         ts_obj.dataframe["value"],
                         alpha=.5)
            plt.xticks(rotation=90)
            plt.show()

        if plot_anomaly_score:
            plt.subplot(211)
            plt.title("Anomaly Scores")
            plt.plot(anomaly_scores)
            plt.ylim([.99, 1])
            plt.subplot(212)
            plt.title("Time Series")
            plt.plot(ts_obj.dataframe["value"].values)
            plt.axvline(ts_obj.get_probationary_index(),
                        color="black",
                        label="probationary line")
            plt.tight_layout()
            plt.show()

        return {
            "Anomaly Scores": anomaly_scores,
            "Time": end - start,
            "Predictions": predictions
        }
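
# A minimal, self-contained Prophet run matching the pattern above: train on
# the whole series, then forecast with periods=0 so yhat is an in-sample
# prediction (the import is fbprophet in older releases, prophet in newer):
import numpy as np
import pandas as pd
from fbprophet import Prophet

df = pd.DataFrame({
    "ds": pd.date_range("2020-01-01", periods=200, freq="D"),
    "y": np.sin(np.arange(200) / 7) + np.random.normal(0, 0.1, 200),
})
m = Prophet()
m.fit(df)
future = m.make_future_dataframe(periods=0, freq="D")
fcst = m.predict(future)
errors = df["y"].values - fcst["yhat"].values  # input for the Gaussian scorer
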
Example #10
def vae_donut(ts_obj,
              window_size,
              mcmc_iteration,
              latent_dim,
              gaussian_window_size,
              step_size,
              plot_reconstruction=False,
              plot_anomaly_score=False):
    # authors use window_size = 120
    # mcmc_iteration = 10

    # https://github.com/kratzert/finetune_alexnet_with_tensorflow/issues/8
    tf.reset_default_graph()

    start = time.time()

    # if there are missing time steps, we DO NOT fill them with NaNs because donut will replace them with 0s
    # using complete_timestamp
    # see line 6 in https://github.com/NetManAIOps/donut/blob/master/donut/preprocessing.py
    timestamp = ts_obj.dataframe["timestamp"].values
    values = ts_obj.dataframe["value"].values
    labels = np.zeros_like(values, dtype=np.int32)

    # print(len(timestamp))
    # print(len(values))
    # print(len(labels))

    # Complete the timestamp, and obtain the missing point indicators
    # replaces  missing with 0s.

    # complete_timestamp cannot parse the "%Y-%m" date format,
    # so substitute a synthetic minute-frequency index
    if ts_obj.dateformat == "%Y-%m":
        rng = pd.date_range('2000-01-01', periods=len(values), freq='T')
        timestamp, missing, (values, labels) = complete_timestamp(
            rng, (values, labels))
    else:
        timestamp, missing, (values, labels) = complete_timestamp(
            timestamp, (values, labels))

    # print(len(timestamp))
    # print(len(values))
    # print(len(labels))
    # print(sum(missing))

    # Standardize the training and testing data.
    values, mean, std = standardize_kpi(values,
                                        excludes=np.logical_or(
                                            labels, missing))

    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=window_size,
            z_dims=latent_dim,
        )

        trainer = DonutTrainer(model=model, model_vs=model_vs)
        predictor = DonutPredictor(model)

        with tf.Session().as_default():
            trainer.fit(values, labels, missing, mean, std)
            score = predictor.get_score(values, missing)

            # if the time series is [1,2,3,4,...] and window_size is 3,
            # this gives us [[1,2,3],[2,3,4],...]
            ts_strided = ah.as_sliding_window(values, window_size)
            ts_strided = my_func_float(np.array(ts_strided, dtype=np.float32))
            missing_strided = ah.as_sliding_window(missing, window_size)
            missing_strided = my_func_int(
                np.array(missing_strided, dtype=np.int32))

            # print(ts_strided)
            # print(missing_strided)

            x = model.vae.reconstruct(
                iterative_masked_reconstruct(reconstruct=model.vae.reconstruct,
                                             x=ts_strided,
                                             mask=missing_strided,
                                             iter_count=mcmc_iteration,
                                             back_prop=False))

            # `x` is a :class:`tfsnippet.stochastic.StochasticTensor`, from which
            # you may derive many useful outputs, for example:
            # print(x.tensor.eval())  # the `x` samples
            # print(x.log_prob(group_ndims=0).eval())  # element-wise log p(x|z) of sampled x
            # print(x.distribution.log_prob(ts_strided).eval())  # the reconstruction probability
            # print(x.distribution.mean.eval(), x.distribution.std.eval())  # mean and std of p(x|z)

            tensor_reconstruction_probabilities = x.distribution.log_prob(
                ts_strided).eval()

            # because of the way striding works, we keep all window_size
            # scores of the first window and, for each remaining window,
            # only the score of its last point
            reconstruction_probabilities = list(
                tensor_reconstruction_probabilities[0])
            for i in range(len(tensor_reconstruction_probabilities)):
                if i != 0:
                    slide = tensor_reconstruction_probabilities[i]
                    reconstruction_probabilities.append(slide[-1])

    # print(len(reconstruction_probabilities))
    # print(len(ts_obj.dataframe))

    if ts_obj.miss:
        ref_date_range = ch.get_ref_date_range(ts_obj.dataframe,
                                               ts_obj.dateformat,
                                               ts_obj.timestep)
        gaps = ref_date_range[~ref_date_range.isin(ts_obj.
                                                   dataframe["timestamp"])]
        filled_df = ch.fill_df(ts_obj.dataframe, ts_obj.timestep,
                               ref_date_range, "fill_nan")
        # print("NaNs exist?: ",filled_df['value'].isnull().values.any())
        filled_df[
            "reconstruction_probabilities"] = reconstruction_probabilities
        # remove nans
        filled_df = filled_df.dropna()
        reconstruction_probabilities = list(
            filled_df["reconstruction_probabilities"].values)

    # print(len(reconstruction_probabilities))
    # print(len(ts_obj.dataframe))

    reconstruction_probabilities = [
        abs(item) for item in reconstruction_probabilities
    ]

    anomaly_scores = ah.determine_anomaly_scores_error(
        reconstruction_probabilities,
        np.zeros_like(reconstruction_probabilities), ts_obj.get_length(),
        gaussian_window_size, step_size)

    end = time.time()

    if plot_reconstruction:
        plt.subplot(211)
        # see lines 98 to 100 of https://github.com/NetManAIOps/donut/blob/master/donut/prediction.py
        plt.title("Negative of Reconstruction Probabilities")
        plt.plot(reconstruction_probabilities)
        # plt.ylim([.99,1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    if plot_anomaly_score:
        plt.subplot(211)
        plt.title("Anomaly Scores")
        plt.plot(anomaly_scores)
        plt.ylim([.998, 1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    return {
        "Anomaly Scores": anomaly_scores,
        "Time": end - start,
        "Reconstruction Probabilities": reconstruction_probabilities
    }
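
# The window-stitching step above (keep every score of the first window, then
# only the last score of each later window) is easy to isolate; a
# self-contained sketch:
import numpy as np

def stitch_window_scores(strided_scores):
    strided_scores = np.asarray(strided_scores)
    # all scores of window 0, then the final score of each remaining window
    return np.concatenate([strided_scores[0], strided_scores[1:, -1]])

windows = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])  # window_size = 3
print(stitch_window_scores(windows))  # [1 2 3 4 5]
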
def sarimax_mini(ts_obj,
                 gaussian_window_size,
                 step_size,
                 plot_anomaly_score=False,
                 plot_forecast=False):

    start = time.time()

    # SARIMAX SHOULD BE ABLE TO HANDLE MISSING VALUES, both theoretically and code-wise:
    # --theoretically: https://stats.stackexchange.com/questions/346225/fitting-arima-to-time-series-with-missing-values
    # "ARIMA models are state space models and the Kalman filter,
    # which is used to fit state space models, deals with missing values exactly
    # by simply skipping the update phase."
    # --code-wise: https://www.statsmodels.org/devel/examples/notebooks/generated/statespace_sarimax_internet.html
    # "The novel feature is the ability of the model to work on datasets with missing values."

    # Unfortunately, pmdarima's (formerly pyramid) auto_arima cannot handle missing values.
    # I tested this using
    '''
        import pmdarima as pm
        from pmdarima import model_selection
        import numpy as np
        data = pm.datasets.load_wineind()
        data[50] = np.nan
        data[160] = np.nan
        train, test = model_selection.train_test_split(data, train_size=150)
        arima = pm.auto_arima(train, error_action='ignore', trace=True,
            suppress_warnings=True, maxiter=10,seasonal=True, m=12)
    '''
    # which resulted in: ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

    # R's auto.arima can handle missing values, BUT
    # pmdarima lets you force seasonality with a specified periodicity,
    # whereas R's auto.arima cannot be forced to keep a seasonal component.
    # I have tried upping parameters as in:
    # https://stackoverflow.com/questions/24390859/why-does-auto-arima-drop-my-seasonality-component-when-stepwise-false-and-approx
    # but it does not work.

    # I will instead fill in the missing values by interpolation and run pmdarima with a FIXED s;
    # this gives me p,d,q,P,D,Q.
    # Then I will refill the missing values with NaNs and fit with statsmodels' SARIMAX.

    if ts_obj.miss:
        ref_date_range = ch.get_ref_date_range(ts_obj.dataframe,
                                               ts_obj.dateformat,
                                               ts_obj.timestep)
        gaps = ref_date_range[~ref_date_range.isin(ts_obj.
                                                   dataframe["timestamp"])]

        filled_df_value = ch.fill_df(ts_obj.dataframe, ts_obj.timestep,
                                     ref_date_range, "fill_value")
        filled_df_value = pd.DataFrame({
            "timestamp": filled_df_value.index,
            "value": filled_df_value["value"]
        })
        endogenous_values_filled_interpolate = filled_df_value.set_index(
            'timestamp')['value']

        filled_df_nan = ch.fill_df(ts_obj.dataframe, ts_obj.timestep,
                                   ref_date_range, "fill_nan")
        endogenous_values_filled_nan = filled_df_nan.set_index(
            'timestamp')['value']

        exogenous_values_filled_interpolate = ah.get_exogenous(
            endogenous_values_filled_interpolate,
            ts_obj.get_dateformat()).drop("Intercept", axis=1, errors='ignore')
        exogenous_values_filled_nan = ah.get_exogenous(
            endogenous_values_filled_nan,
            ts_obj.get_dateformat()).drop("Intercept", axis=1, errors='ignore')

        # fill NaNs with values using interpolation to use Pyramid
        try:
            arima = pm.auto_arima(
                y=endogenous_values_filled_interpolate,
                exogenous=exogenous_values_filled_interpolate,
                max_p=3,
                max_q=3,
                error_action='ignore',
                trace=True,
                suppress_warnings=True,
                maxiter=1,
                maxorder=5,
                seasonal=ts_obj.seasonality,
                m=ts_obj.period)
        # http://alkaline-ml.com/pmdarima/seasonal-differencing-issues.html
        except ValueError:
            arima = pm.auto_arima(
                y=endogenous_values_filled_interpolate,
                exogenous=exogenous_values_filled_interpolate,
                D=0,
                max_p=3,
                max_q=3,
                error_action='ignore',
                trace=True,
                suppress_warnings=True,
                maxiter=1,
                maxorder=5,
                seasonal=ts_obj.seasonality,
                m=ts_obj.period)

        order = arima.order
        seasonal_order = arima.seasonal_order

        # order = (2,1,2)
        # seasonal_order = (0,0,1,3)

        # print(order)
        # print(seasonal_order)

        # use NaNs with sarimax fitting with statsmodels
        try:
            fit_result = sm.tsa.SARIMAX(endogenous_values_filled_nan,
                                        exogenous_values_filled_nan,
                                        order=order,
                                        seasonal_order=seasonal_order,
                                        time_varying=True,
                                        mle_regression=False).fit()
        # https://github.com/statsmodels/statsmodels/issues/5459
        # https://github.com/statsmodels/statsmodels/issues/5374
        except np.linalg.LinAlgError as err:
            print("\n\n!!!!")
            print("enforce_stationarity = False")
            fit_result = sm.tsa.SARIMAX(endogenous_values_filled_nan,
                                        exogenous_values_filled_nan,
                                        order=order,
                                        seasonal_order=seasonal_order,
                                        time_varying=True,
                                        mle_regression=False,
                                        enforce_stationarity=False).fit()

        model = sm.tsa.SARIMAX(endogenous_values_filled_nan,
                               exogenous_values_filled_nan,
                               order=order,
                               seasonal_order=seasonal_order,
                               time_varying=True,
                               mle_regression=False)
        model.initialize_known(fit_result.filtered_state[..., -1],
                               fit_result.filtered_state_cov[..., -1])

        model.update(model.start_params)

        filter_result = model.ssm.filter()
        response = filter_result.forecasts.squeeze(0)

        # print(len(ts_obj.dataframe["value"]))
        # print(len(response))

        filled_df_nan["response"] = response
        filled_df_nan = filled_df_nan.dropna()
        response = filled_df_nan["response"].values

        # print(len(ts_obj.dataframe["value"]))
        # print(len(response))

        anomaly_scores = ah.determine_anomaly_scores_error(
            ts_obj.dataframe["value"], response, len(response),
            gaussian_window_size, step_size)
    else:
        endogenous_values = ts_obj.dataframe.set_index('timestamp')['value']
        exogenous_values = ah.get_exogenous(endogenous_values,
                                            ts_obj.get_dateformat()).drop(
                                                "Intercept",
                                                axis=1,
                                                errors='ignore')

        # no missing time steps: select orders and fit on the raw series
        try:
            arima = pm.auto_arima(y=endogenous_values,
                                  exogenous=exogenous_values,
                                  max_p=3,
                                  max_q=3,
                                  error_action='ignore',
                                  trace=True,
                                  suppress_warnings=True,
                                  maxiter=1,
                                  maxorder=5,
                                  seasonal=ts_obj.seasonality,
                                  m=ts_obj.period)
        # http://alkaline-ml.com/pmdarima/seasonal-differencing-issues.html
        except ValueError:
            arima = pm.auto_arima(y=endogenous_values,
                                  exogenous=exogenous_values,
                                  D=0,
                                  max_p=3,
                                  max_q=3,
                                  error_action='ignore',
                                  trace=True,
                                  suppress_warnings=True,
                                  maxiter=1,
                                  maxorder=5,
                                  seasonal=ts_obj.seasonality,
                                  m=ts_obj.period)

        order = arima.order
        seasonal_order = arima.seasonal_order

        # print("!!!!!!!!!!!!!!!")
        # print(ts_obj.name)
        # print(order)
        # print(seasonal_order)
        # print("!!!!!!!!!!!!!!!")

        try:
            fit_result = sm.tsa.SARIMAX(endogenous_values,
                                        exogenous_values,
                                        order=order,
                                        seasonal_order=seasonal_order,
                                        time_varying=True,
                                        mle_regression=False).fit()
            model = sm.tsa.SARIMAX(endogenous_values,
                                   exogenous_values,
                                   order=order,
                                   seasonal_order=seasonal_order,
                                   time_varying=True,
                                   mle_regression=False)
        except np.linalg.LinAlgError as err:
            print("\n\n!!!!")
            print("enforce_stationarity = False")
            fit_result = sm.tsa.SARIMAX(endogenous_values,
                                        exogenous_values,
                                        order=order,
                                        seasonal_order=seasonal_order,
                                        time_varying=True,
                                        mle_regression=False,
                                        enforce_stationarity=False,
                                        simple_differencing=True).fit()
            model = sm.tsa.SARIMAX(endogenous_values,
                                   exogenous_values,
                                   order=order,
                                   seasonal_order=seasonal_order,
                                   time_varying=True,
                                   mle_regression=False,
                                   enforce_stationarity=False,
                                   simple_differencing=True)

        model.initialize_known(fit_result.filtered_state[..., -1],
                               fit_result.filtered_state_cov[..., -1])
        model.update(model.start_params)
        filter_result = model.ssm.filter()
        response = filter_result.forecasts.squeeze(0)

        anomaly_scores = ah.determine_anomaly_scores_error(endogenous_values,
                                                           response,
                                                           len(response),
                                                           gaussian_window_size,
                                                           step_size)

    end = time.time()

    if plot_forecast:
        plt.plot(response, alpha=.7, label="Predictions")
        plt.plot(ts_obj.dataframe["value"].values, alpha=.5, label="Data")
        plt.legend()
        plt.show()

    if plot_anomaly_score:
        plt.subplot(211)
        plt.title("Anomaly Scores")
        plt.plot(anomaly_scores)
        plt.ylim([.99, 1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    return {
        "Anomaly Scores": np.asarray(anomaly_scores),
        "Time": end - start,
        "Forecast": response
    }
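
# A self-contained check of the claim above that statsmodels' SARIMAX
# tolerates NaNs in the endogenous series (the Kalman filter simply skips the
# update step at missing observations):
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(2)
y = np.sin(np.linspace(0, 20, 300)) + rng.normal(0, 0.1, 300)
y[40:50] = np.nan  # missing time steps

fit_result = sm.tsa.SARIMAX(y, order=(1, 0, 0)).fit(disp=False)
in_sample = fit_result.get_prediction().predicted_mean
print(np.isnan(in_sample[40:50]).any())  # False: predictions cover the gap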