Example #1
import numpy as np
from donut import complete_timestamp, standardize_kpi


def get_data():
    # `raw_data` is assumed to be a dict-like object (e.g. a DataFrame)
    # holding the raw KPI values and their anomaly labels.
    values = np.array(raw_data['value1'])
    labels = np.array(raw_data['label1'])
    timestamp = np.arange(len(values))

    # Complete the timestamp, and obtain the missing point indicators.
    timestamp, missing, (values, labels) = \
        complete_timestamp(timestamp, (values, labels))

    # Split the training and testing data.
    test_portion = 0.2
    test_n = int(len(values) * test_portion)
    train_values, test_values = values[:-test_n], values[-test_n:]
    train_labels, test_labels = labels[:-test_n], labels[-test_n:]
    train_missing, test_missing = missing[:-test_n], missing[-test_n:]

    # Standardize the training and testing data.
    train_values, mean, std = standardize_kpi(train_values,
                                              excludes=np.logical_or(
                                                  train_labels, train_missing))
    test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)
    return (train_values, train_labels, train_missing, mean, std,
            test_values, test_labels, test_missing)
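The two helpers above come from the `donut` package (NetManAIOps/donut). Below is a minimal sketch of what they do on toy data; the values in the comments are what the documented behavior implies, not verified output.

import numpy as np
from donut import complete_timestamp, standardize_kpi

toy_ts = np.array([100, 110, 120, 140, 150])      # the step at 130 is absent
toy_values = np.array([0.5, 0.4, 0.6, 0.7, 0.5])
toy_labels = np.zeros(5, dtype=np.int32)

# `complete_timestamp` infers the interval (10), inserts the missing step at
# 130, and returns an indicator array marking the inserted points.
ts, missing, (vals, labs) = complete_timestamp(toy_ts, (toy_values, toy_labels))
# ts      -> [100 110 120 130 140 150]
# missing -> [0 0 0 1 0 0]

# `standardize_kpi` z-normalizes the series; points flagged in `excludes`
# (anomalous or missing) are ignored when computing mean/std.
vals, mean, std = standardize_kpi(vals, excludes=np.logical_or(labs, missing))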
Example #2
    def predict(self, X: pd.DataFrame):
        """Predict the anomaly scores for each feature independently, then
        aggregate them into a single one-dimensional anomaly score array."""
        with self.device:
            test_scores = np.zeros_like(X)
            for col_idx, col in enumerate(X.columns):
                mean, std, tf_session, model = \
                    self.means[col_idx], self.stds[col_idx], \
                    self.tf_sessions[col_idx], self.models[col_idx]
                test_values, _, _ = standardize_kpi(X.loc[:, col], mean=mean, std=std)
                test_missing = np.zeros_like(test_values)
                predictor = DonutPredictor(model)
                with tf_session.as_default():
                    test_score = predictor.get_score(test_values, test_missing)
                # Convert to negative reconstruction probability so the score
                # is in accordance with the other detectors.
                test_score = -np.power(np.e, test_score)
                # The first `x_dims - 1` points have no complete window, so
                # they receive no score from the predictor.
                test_scores[self.x_dims - 1:, col_idx] = test_score
            # Aggregate over features by taking the per-step maximum.
            aggregated_test_scores = np.amax(test_scores, axis=1)
            # Back-fill the unscored head so it never ranks as most anomalous.
            aggregated_test_scores[:self.x_dims - 1] = \
                np.nanmin(aggregated_test_scores) - sys.float_info.epsilon
            return aggregated_test_scores
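A minimal sketch (with made-up sizes) of the alignment logic used above: Donut scores only the last point of each length-`x_dims` window, so the predictor returns `len(series) - x_dims + 1` scores, the first of which belongs to index `x_dims - 1`; the unscored head is then back-filled so it never ranks as most anomalous.

import sys
import numpy as np

x_dims, series_len = 4, 10
window_scores = np.random.randn(series_len - x_dims + 1)  # one score per window

scores = np.zeros(series_len)
scores[x_dims - 1:] = window_scores
# Back-fill the head with something slightly below the minimum score.
scores[:x_dims - 1] = np.nanmin(scores[x_dims - 1:]) - sys.float_info.epsilon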
Example #3
    def fit(self, X: pd.DataFrame):
        with self.device:
            # Reset all results from the last run to avoid reusing variables.
            self.means, self.stds, self.tf_sessions, self.models = [], [], [], []
            for col_idx in trange(len(X.columns)):
                col = X.columns[col_idx]
                tf_session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
                timestamps = X.index
                # Fill NaNs for training; the original NaN positions are kept
                # as the `missing` indicator below.
                features = X.loc[:, col].interpolate().bfill().values
                labels = pd.Series(0, index=X.index)
                timestamps, _, (features, labels) = complete_timestamp(timestamps, (features, labels))
                missing = np.isnan(X.loc[:, col].values)
                _, mean, std = standardize_kpi(features, excludes=np.logical_or(labels, missing))

                with tf.variable_scope('model') as model_vs:
                    model = DonutModel(
                        h_for_p_x=Sequential([
                            K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                                           activation=tf.nn.relu),
                            K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                                           activation=tf.nn.relu),
                        ]),
                        h_for_q_z=Sequential([
                            K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                                           activation=tf.nn.relu),
                            K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                                           activation=tf.nn.relu),
                        ]),
                        x_dims=self.x_dims,
                        z_dims=5,
                    )

                trainer = QuietDonutTrainer(model=model, model_vs=model_vs, max_epoch=self.max_epoch,
                                            batch_size=self.batch_size, valid_batch_size=self.batch_size,
                                            missing_data_injection_rate=0.0, lr_anneal_factor=1.0)
                with tf_session.as_default():
                    trainer.fit(features, labels, missing, mean, std, valid_portion=0.25)
                self.means.append(mean)
                self.stds.append(std)
                self.tf_sessions.append(tf_session)
                self.models.append(model)
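The `fit`/`predict` methods in Examples #2 and #3 belong to a wrapper class that is not shown in the snippets. A hypothetical usage sketch follows; the class name `DonutDetector` and its constructor are placeholders, not the actual API:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(1000, 3), columns=["m0", "m1", "m2"])
detector = DonutDetector()     # placeholder; the real class sets x_dims, etc.
detector.fit(df)               # trains one Donut model per column
scores = detector.predict(df)  # one aggregated anomaly score per time step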
Example #4
    timestamp, missing, (values,
                         labels) = complete_timestamp(timestamp,
                                                      (values, labels))
    # Split the training and testing data.
    test_portion = 0.3
    test_n = int(len(values) * test_portion)
    train_values, test_values = values[:-test_n], values[-test_n:]
    train_labels, test_labels = labels[:-test_n], labels[-test_n:]
    train_missing, test_missing = missing[:-test_n], missing[-test_n:]

    # Standardize the training and testing data.
    train_values, mean, std = standardize_kpi(train_values,
                                              excludes=np.logical_or(
                                                  train_labels, train_missing))
    test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)

    # We build the entire model within the scope of `model_vs`,
    # it should hold exactly all the variables of `model`, including
    # the variables created by Keras layers.
    # The hidden networks are built with Keras layers, as shown in the
    # sketch below. From the `Donut` constructor's docstring:
    #
    #     Args:
    #         h_for_p_x (Module or (tf.Tensor) -> tf.Tensor):
    #             The hidden network for :math:`p(x|z)`.
    #         h_for_q_z (Module or (tf.Tensor) -> tf.Tensor):
    #             The hidden network for :math:`q(z|x)`.
    #         x_dims (int): The number of `x` dimensions.
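The snippet breaks off at the docstring; the model construction it refers to appears in the other examples in this document and, following them, looks like this:

import tensorflow as tf
from donut import Donut
from tensorflow import keras as K
from tfsnippet.modules import Sequential

with tf.variable_scope('model') as model_vs:
    model = Donut(
        h_for_p_x=Sequential([
            K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                           activation=tf.nn.relu),
            K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                           activation=tf.nn.relu),
        ]),
        h_for_q_z=Sequential([
            K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                           activation=tf.nn.relu),
            K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                           activation=tf.nn.relu),
        ]),
        x_dims=120,   # window size; 120 in the other examples here
        z_dims=5,
    )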
Example #5
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from donut import Donut, DonutTrainer, DonutPredictor, standardize_kpi
from tensorflow import keras as K
from tfsnippet.modules import Sequential

# `get_size` and `compute_best_F1` are project-local helpers (not shown here).


def donut_test(src_dir, output_dir, file, batch):
    # Load the running performance table if it exists, otherwise create it.
    if os.path.exists(output_dir + "performance-donut-" + str(batch) + ".csv"):
        perform = pd.read_csv(output_dir + "performance-donut-" + str(batch) +
                              ".csv")
    else:
        perform = pd.DataFrame({
            "file": [],
            "storage": [],
            "train-time": [],
            "codisp-time": [],
            "test-time": [],
            "precision": [],
            "recall": [],
            "best-F1": [],
            "best-threshold": []
        })
    perform = perform.append([{
        'file': file,
        "storage": 0.0,
        "train-time": 0.0,
        "codisp-time": 0.0,
        "test-time": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "best-F1": 0.0,
        "best-threshold": 0.0
    }],
                             ignore_index=True)
    perform.index = perform["file"]

    data = pd.read_csv(src_dir + file)
    timestamp, value, labels = data["timestamp"], data["value"], data[
        "anomaly"]
    missing = np.zeros(len(timestamp))

    test_portion = 0.5
    test_n = int(len(value) * test_portion)
    train_values, test_values = value[:-test_n], value[-test_n:]
    train_labels, test_labels = labels[:-test_n], labels[-test_n:]
    train_time, test_time = timestamp[:-test_n], timestamp[-test_n:]
    train_missing, test_missing = missing[:-test_n], missing[-test_n:]

    train_values, mean, std = standardize_kpi(train_values,
                                              excludes=np.logical_or(
                                                  train_labels, train_missing))
    test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)

    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=120,
            z_dims=5,
        )
    trainer = DonutTrainer(model=model, model_vs=model_vs)
    predictor = DonutPredictor(model)
    with tf.Session().as_default():
        start = time.time()
        trainer.fit(train_values, train_labels, train_missing, mean, std)
        end = time.time()
        perform.loc[file, "train-time"] = end - start

        start = time.time()
        test_score = predictor.get_score(test_values, test_missing)
        end = time.time()
        perform.loc[file, "test-time"] = end - start

    storage = get_size(trainer) + get_size(predictor)
    perform.loc[file, "storage"] = storage

    pd.DataFrame({
        "timestamp": test_time[-len(test_score):],
        "score": test_score
    }).to_csv(output_dir + "test-donut" + file, index=False)
    best_F1, best_threshold, precision, recall = compute_best_F1(
        src_dir + file,
        output_dir + "test-donut" + file,
        reverse=True,
        mean_start=False)
    perform.loc[file, "best-F1"] = best_F1
    perform.loc[file, "best-threshold"] = best_threshold
    perform.loc[file, "precision"] = precision
    perform.loc[file, "recall"] = recall

    perform.to_csv(output_dir + "performance-donut-" + str(batch) + ".csv",
                   index=False)
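A hypothetical invocation of `donut_test` (the directory names are placeholders; the function joins paths with plain `+`, so both directories need a trailing slash, and `file` must name a CSV with `timestamp`, `value` and `anomaly` columns):

donut_test(src_dir="data/", output_dir="results/", file="kpi-01.csv", batch=0)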
Example #6
import os

import numpy as np
import pandas as pd


def generate_score(number):
    # Read the raw data.
    data_dir_path = 'C:/Users/Administrator/Downloads/research/donut-master/SMD/data_concat/data-' + number + '.csv'
    data = np.array(pd.read_csv(data_dir_path, header=None), dtype=np.float64)
    tag_dir_path = './SMD/test_label/machine-' + number + '.csv'
    tag = np.array(pd.read_csv(tag_dir_path, header=None), dtype=np.int32)
    # Prepend zeros for the first (unlabeled) half of `data` so the labels
    # align with the concatenated series.
    labels = np.append(np.zeros(int(len(data) / 2)), tag)
    # Pick one column.
    values = data[:, 1]
    timestamp = np.arange(len(data)) + 1

    # If there is no label, simply use all zeros.
    # labels = np.zeros_like(values, dtype=np.int32)

    # Complete the timestamp, and obtain the missing point indicators.
    timestamp, missing, (values, labels) = \
        complete_timestamp(timestamp, (values, labels))

    # Split the training and testing data.
    test_portion = 0.5
    test_n = int(len(values) * test_portion)
    train_values, test_values = values[:-test_n], values[-test_n:]
    train_labels, test_labels = labels[:-test_n], labels[-test_n:]
    train_missing, test_missing = missing[:-test_n], missing[-test_n:]
    # print(len(test_values), len(test_labels))

    # Standardize the training and testing data.
    train_values, mean, std = standardize_kpi(train_values,
                                              excludes=np.logical_or(
                                                  train_labels, train_missing))
    test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)

    import tensorflow as tf
    from donut import Donut
    from tensorflow import keras as K
    from tfsnippet.modules import Sequential

    # We build the entire model within the scope of `model_vs`,
    # it should hold exactly all the variables of `model`, including
    # the variables created by Keras layers.
    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=120,
            z_dims=5,
        )

    from donut import DonutTrainer, DonutPredictor

    trainer = DonutTrainer(model=model, model_vs=model_vs)
    predictor = DonutPredictor(model)

    with tf.Session().as_default():
        trainer.fit(train_values, train_labels, train_missing, mean, std)
        test_score = predictor.get_score(test_values, test_missing)

    if not os.path.exists('./score'):
        os.makedirs('./score')

    np.save('./score/' + number + '.npy', test_score)
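Hypothetical usage (the machine id is a placeholder and must match the CSV file names built inside the function):

generate_score("1-1")  # reads data-1-1.csv and machine-1-1.csv, writes ./score/1-1.npy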
Example #7
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from donut import Donut, DonutTrainer, DonutPredictor, \
    complete_timestamp, standardize_kpi
from donut.reconstruction import iterative_masked_reconstruct
from tensorflow import keras as K
from tfsnippet.modules import Sequential

# `ah`, `ch`, `my_func_float` and `my_func_int` are project-local helper
# modules/functions (not shown here).


def vae_donut(ts_obj,
              window_size,
              mcmc_iteration,
              latent_dim,
              gaussian_window_size,
              step_size,
              plot_reconstruction=False,
              plot_anomaly_score=False):
    # authors use window_size = 120
    # mcmc_iteration = 10

    # https://github.com/kratzert/finetune_alexnet_with_tensorflow/issues/8
    tf.reset_default_graph()

    start = time.time()

    # if there are missing time steps, we DO NOT fill them with NaNs because donut will replace them with 0s
    # using complete_timestamp
    # see line 6 in https://github.com/NetManAIOps/donut/blob/master/donut/preprocessing.py
    timestamp = ts_obj.dataframe["timestamp"].values
    values = ts_obj.dataframe["value"].values
    labels = np.zeros_like(values, dtype=np.int32)

    # print(len(timestamp))
    # print(len(values))
    # print(len(labels))

    # Complete the timestamp and obtain the missing point indicators;
    # missing values are replaced with 0s.

    # donut cannot handle this date format for some reason
    if ts_obj.dateformat == "%Y-%m":
        rng = pd.date_range('2000-01-01', periods=len(values), freq='T')
        timestamp, missing, (values, labels) = complete_timestamp(
            rng, (values, labels))
    else:
        timestamp, missing, (values, labels) = complete_timestamp(
            timestamp, (values, labels))

    # print(len(timestamp))
    # print(len(values))
    # print(len(labels))
    # print(sum(missing))

    # Standardize the training and testing data.
    values, mean, std = standardize_kpi(values,
                                        excludes=np.logical_or(
                                            labels, missing))

    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100,
                               kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=window_size,
            z_dims=latent_dim,
        )

        trainer = DonutTrainer(model=model, model_vs=model_vs)
        predictor = DonutPredictor(model)

        with tf.Session().as_default():
            trainer.fit(values, labels, missing, mean, std)
            score = predictor.get_score(values, missing)

            # If the time series is [1,2,3,4,...] and window_size is 3,
            # this gives us [[1,2,3],[2,3,4],...].
            ts_strided = ah.as_sliding_window(values, window_size)
            ts_strided = my_func_float(np.array(ts_strided, dtype=np.float32))
            missing_strided = ah.as_sliding_window(missing, window_size)
            missing_strided = my_func_int(
                np.array(missing_strided, dtype=np.int32))

            # print(ts_strided)
            # print(missing_strided)

            x = model.vae.reconstruct(
                iterative_masked_reconstruct(reconstruct=model.vae.reconstruct,
                                             x=ts_strided,
                                             mask=missing_strided,
                                             iter_count=mcmc_iteration,
                                             back_prop=False))

            # `x` is a :class:`tfsnippet.stochastic.StochasticTensor`, from which
            # you may derive many useful outputs, for example:
            # print(x.tensor.eval())  # the `x` samples
            # print(x.log_prob(group_ndims=0).eval())  # element-wise log p(x|z) of sampled x
            # print(x.distribution.log_prob(ts_strided).eval())  # the reconstruction probability
            # print(x.distribution.mean.eval(), x.distribution.std.eval())  # mean and std of p(x|z)

            tensor_reconstruction_probabilities = x.distribution.log_prob(
                ts_strided).eval()

            # Because of the way striding works, we take all `window_size`
            # scores from the first window, then the last point's score from
            # each remaining window.
            reconstruction_probabilities = list(
                tensor_reconstruction_probabilities[0])
            for i in range(1, len(tensor_reconstruction_probabilities)):
                slide = tensor_reconstruction_probabilities[i]
                reconstruction_probabilities.append(slide[-1])

    # print(len(reconstruction_probabilities))
    # print(len(ts_obj.dataframe))

    if ts_obj.miss:
        ref_date_range = ch.get_ref_date_range(ts_obj.dataframe,
                                               ts_obj.dateformat,
                                               ts_obj.timestep)
        filled_df = ch.fill_df(ts_obj.dataframe, ts_obj.timestep,
                               ref_date_range, "fill_nan")
        # print("NaNs exist?: ",filled_df['value'].isnull().values.any())
        filled_df[
            "reconstruction_probabilities"] = reconstruction_probabilities
        # remove nans
        filled_df = filled_df.dropna()
        reconstruction_probabilities = list(
            filled_df["reconstruction_probabilities"].values)

    # print(len(reconstruction_probabilities))
    # print(len(ts_obj.dataframe))

    reconstruction_probabilities = [
        abs(item) for item in reconstruction_probabilities
    ]

    anomaly_scores = ah.determine_anomaly_scores_error(
        reconstruction_probabilities,
        np.zeros_like(reconstruction_probabilities), ts_obj.get_length(),
        gaussian_window_size, step_size)

    end = time.time()

    if plot_reconstruction:
        plt.subplot(211)
        # see lines 98 to 100 of https://github.com/NetManAIOps/donut/blob/master/donut/prediction.py
        plt.title("Negative of Reconstruction Probabilities")
        plt.plot(reconstruction_probabilities)
        # plt.ylim([.99,1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    if plot_anomaly_score:
        plt.subplot(211)
        plt.title("Anomaly Scores")
        plt.plot(anomaly_scores)
        plt.ylim([.998, 1])
        plt.subplot(212)
        plt.title("Time Series")
        plt.plot(ts_obj.dataframe["value"].values)
        plt.axvline(ts_obj.get_probationary_index(),
                    color="black",
                    label="probationary line")
        plt.tight_layout()
        plt.show()

    return {
        "Anomaly Scores": anomaly_scores,
        "Time": end - start,
        "Reconstruction Probabilities": reconstruction_probabilities
    }
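A hypothetical call, following the hints in the comments above (the authors use window_size=120 and mcmc_iteration=10; the smoothing parameters and `ts_obj` are placeholders for the caller's own values):

result = vae_donut(ts_obj,               # caller's time-series wrapper object
                   window_size=120,      # value used by the authors
                   mcmc_iteration=10,    # value used by the authors
                   latent_dim=5,
                   gaussian_window_size=128,  # placeholder
                   step_size=64,              # placeholder
                   plot_anomaly_score=True)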