예제 #1
0
    def visualize_features(self, data, file_name, method='TSNE', show=False):
        """Project the feature vectors to 2-D and save them as a scatter plot.

        Normal rows are drawn in blue, anomalous rows in red.

        Parameters
        ----------
        data : DataFrame
            Feature table. Must contain the columns ``is_anomaly`` and
            ``window_label``; every other column is used as a feature.
        file_name : str
            Name of the feature file. Every ``.csv`` occurrence is removed
            and ``_<method>-features.png`` is appended to get the image path.
        method : str
            Embedding to use, either ``'TSNE'`` or ``'UMAP'``.
        show : bool
            Additionally display the figure.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported embeddings.
        """
        # Validate first: a bad argument must not leave an open figure behind
        # or surface later as a NameError on the missing embedding.
        if method not in ('TSNE', 'UMAP'):
            raise ValueError('Unknown method {}. '.format(method) +
                             'Only TSNE and UMAP are supported.')

        fig = plt.figure()
        tc.yellow('Visualize features using {}...'.format(method))
        features = data.drop(['is_anomaly', 'window_label'], axis=1)

        if method == 'TSNE':
            embedded = TSNE(n_components=2).fit_transform(features)
        else:
            embedded = umap.UMAP().fit_transform(features)

        # Boolean masks select embedding rows by position, so the plot stays
        # correct even when the DataFrame index is not 0..n-1.
        anomaly_mask = (data.is_anomaly == 1).values
        normal_mask = (data.is_anomaly == 0).values

        normal = plt.scatter(embedded[normal_mask, 0],
                             embedded[normal_mask, 1], c='blue', s=2)
        anomaly = plt.scatter(embedded[anomaly_mask, 0],
                              embedded[anomaly_mask, 1], c='red', s=2)

        plt.legend((normal, anomaly), ('Normal', 'Anomaly'), loc='lower right')
        plt.title('{} projection of the features\n'.format(method))
        fig.tight_layout()
        file_name = file_name.replace('.csv', '')
        file_path = '{}_{}-features.png'.format(file_name, method)
        fig.savefig(file_path)
        if show:
            plt.show()
        plt.close()
        tc.green('Saved visualized features using {} to {}'.format(
            method, file_path))
예제 #2
0
    def generate_features(self,
                          timeseries,
                          anomaly_labels,
                          window_size,
                          file_name,
                          method='ARMA',
                          order=(2, 2),
                          stride=0):
        """Process the complete timeseries. Create windows first and then
        encode each window to reduce the dimensionality.

        Parameters
        ----------
        timeseries : sequence
            Data to split into windows; must support ``len`` and slicing.
        anomaly_labels : sequence
            Per-sample labels, sliced with the same bounds as ``timeseries``.
            A window is flagged as anomalous when its labels sum to at least
            one (assumes 0/1 labels).
        window_size : int
            Nominal number of samples per window.
        file_name : str
            CSV file the features are written to.
        method : str
            ``'ARMA'`` or ``'ARIMA'``; selects ``get_arma_params`` or
            ``get_arima_params`` for fitting a window.
        order : tuple
            Passed unchanged to the selected fitting method.
        stride : int
            Distance between two window starts. ``0`` (the default) selects
            half of ``window_size``.

        Returns
        -------
        features: DataFrame
            Features with anomaly labels, as read back from ``file_name``.

        Raises
        ------
        ValueError
            If ``method`` is neither ``'ARMA'`` nor ``'ARIMA'``.
        """
        if stride == 0:
            # Floor division keeps the step integral: a fractional step
            # combined with dtype=int makes np.arange return wrong starts,
            # and a step of 0 (window_size == 1) is rejected outright.
            stride = max(1, int(window_size // 2))

        if method == 'ARMA':
            get_parameters = self.get_arma_params
        elif method == 'ARIMA':
            get_parameters = self.get_arima_params
        else:
            raise ValueError('Unknown method {}. '.format(method) +
                             'Only ARMA and ARIMA are supported.')

        features = pd.DataFrame()
        window_starts = np.arange(0, len(timeseries), step=stride, dtype=int)
        tc.yellow("Generating features...")

        for i, start in enumerate(tqdm(window_starts)):
            end = int(start + window_size - 1)
            # NOTE(review): with positional slicing `end` itself is excluded,
            # so a window holds window_size - 1 samples although the label
            # below reads like an inclusive range - confirm the intent.
            window_data = timeseries[start:end]
            window_is_anomaly = min(1, sum(anomaly_labels[start:end]))

            fitted = get_parameters(window_data, order)
            if i == 0:
                # Column names are only known after the first fit.
                feature_columns = np.append(fitted.data.param_names,
                                            ('is_anomaly', 'window_label'))
                features = pd.DataFrame(columns=feature_columns)
            window_label = '{}-{}'.format(start, end)
            # TODO: add fitted.sigma2
            new_row = np.append(fitted.params,
                                (window_is_anomaly, window_label))
            features.loc[i] = new_row

        features.is_anomaly = features.is_anomaly.astype(int)
        features.to_csv(file_name, index=False)  # Save features to file
        tc.green('Saved features in {}'.format(file_name))
        # Return what is on disk; presumably done so the column dtypes are
        # inferred again by the CSV parser.
        return pd.read_csv(file_name)
예제 #3
0
def detect_anomalies(train_features, test_features, test_labels):
    """Run the encoder once per regularization strength and store the scores.

    Relies on the module-level names ``folder`` (output directory) and
    ``encoder`` (object providing ``run``).

    Parameters
    ----------
    train_features
        Training data, forwarded unchanged to ``encoder.run``.
    test_features
        Test data, forwarded unchanged to ``encoder.run``.
    test_labels
        Labels of the test data, forwarded unchanged to ``encoder.run``.
    """
    # Only a single strength is evaluated; add further values to this list
    # to sweep over several regularization strengths.
    regularization_strengths = [0.0001]

    for regularization_strength in regularization_strengths:
        tc.yellow('Running with regularization_strength {}...'.format(
            regularization_strength))

        # Dots are replaced so the strength does not look like an extension.
        result_file_name = '{}/anomaly_scores_regularization_{}.csv'.format(
            folder,
            str(regularization_strength).replace('.', '_'))

        encoder.run(train_features, test_features, test_labels,
                    regularization_strength, result_file_name)
예제 #4
0
def detect_anomalies(train_features, test_features, test_labels, out_folder):
    """Run the encoder for a fixed sweep of regularization strengths.

    For every strength a score file is written below ``out_folder``; its name
    encodes the strength (dots replaced by underscores) and the epoch count.

    Parameters
    ----------
    train_features
        Training data, forwarded unchanged to ``encoder.run``.
    test_features
        Test data, forwarded unchanged to ``encoder.run``.
    test_labels
        Labels of the test data, forwarded unchanged to ``encoder.run``.
    out_folder : str
        Directory prefix of the generated score files.
    """
    epochs = 100
    name_template = '{}/anomaly_scores_regularization_{}_epochs_{}.csv'

    for strength in (0.0, 0.00001, 0.0001, 0.001, 0.01, 0.1):
        tc.yellow('Running with regularization_strength {}...'.format(
            strength))

        strength_tag = str(strength).replace('.', '_')
        scores_path = name_template.format(out_folder, strength_tag, epochs)

        encoder.run(train_features, test_features, test_labels,
                    strength, scores_path, epochs)
예제 #5
0
    def detect_anomalies(self, X, show=False):
        """Run every configured detector on ``X`` and plot the results.

        One subplot per entry of ``self.anomaly_algorithms`` is drawn and the
        whole figure is saved to ``self.file``.

        Parameters
        ----------
        X
            Data handed to each algorithm's ``fit`` / ``predict`` and to
            ``self.plot_anomalies``.
        show : bool
            Additionally display the figure.
        """
        n_algorithms = len(self.anomaly_algorithms)
        plt.figure(figsize=(n_algorithms * 2 + 3, 6))

        for plot_num, (name, algorithm) in enumerate(self.anomaly_algorithms,
                                                     start=1):
            tc.yellow('Detecting anomalies using {}...'.format(name))
            plt.subplot(1, n_algorithms, plot_num)
            plt.title(name, size=18)

            # Fit the data and tag outliers. Each estimator is fitted exactly
            # once here; an additional up-front fit would only repeat work.
            # Local Outlier Factor goes through fit_predict, presumably
            # because it offers no separate predict in its default mode.
            if name == "Local Outlier Factor":
                y_pred = algorithm.fit_predict(X)
            else:
                y_pred = algorithm.fit(X).predict(X)
            # Print and plot
            self.print_anomalies(name, y_pred)
            self.plot_anomalies(name, X, y_pred, plt)
        plt.tight_layout()
        plt.savefig(self.file)
        tc.green('Saved anomaly plot to {}'.format(self.file))
        if show:
            plt.show()
        # Release the figure so repeated calls do not pile up open figures.
        plt.close()
예제 #6
0
    def generate_timeseries(self, show=False, seed=12345):
        """Stitch together two time series with different ARMA parameters
        to generate one timeseries which contains anomalies.

        Both series have ``self.nsample`` samples. For every entry ``a`` of
        ``self.anomalies`` the rows ``a * window_size`` up to (excluding)
        ``(a + 1) * window_size`` of the normal series are replaced by the
        rows of the anomaly series. The result is plotted with
        ``self.create_data_plot`` and stored with ``self.save_data``.

        Parameters
        ----------
        show : bool
            Show generated data as plots.
        seed : int
            Seed for NumPy's global random number generator, set before any
            sample is drawn.

        Returns
        -------
        stitched_data : DataFrame
            Data containing anomalies, with the columns ``value`` and
            ``is_anomaly``.
        """
        np.random.seed(seed)

        # Generate the two timeseries (with different ARMA parameters)
        tc.yellow('Generating normal timeseries...')
        ar, ma = self.arma_generate_params([.75, -.25], [.65, .35])
        default_series = arima.arma_generate_sample(ar, ma, self.nsample)
        default_series = pd.DataFrame(default_series, columns=['value'])
        default_series['is_anomaly'] = 0

        tc.yellow('Generating anomaly timeseries...')
        ar, ma = self.arma_generate_params([.75, -.25], [-.65, .35])
        anomaly_series = arima.arma_generate_sample(ar, ma, self.nsample)
        anomaly_series = pd.DataFrame(anomaly_series, columns=['value'])
        anomaly_series['is_anomaly'] = 1

        # Plot the two timeseries
        if show:
            self.show_raw_data(default_series, anomaly_series)

        tc.yellow(
            'Combining the two timeseries to get one time series '
            'containing anomalies...'
        )
        # Work on a copy so the normal series itself is left untouched.
        stitched_data = default_series.copy()
        for anomaly in self.anomalies:
            start = anomaly * self.window_size
            end = (anomaly + 1) * self.window_size
            # Inject anomalies
            stitched_data[start:end] = anomaly_series[start:end]

        self.create_data_plot(stitched_data, show)

        self.save_data(stitched_data)
        return stitched_data
예제 #7
0
def run(training_data,
        test_data,
        test_labels,
        regularization_strength,
        file_name,
        epochs=100):
    """Train a dense autoencoder and score the test data by reconstruction
    error.

    The network halves the layer width until it would drop below two units
    and then widens again up to the input dimension. After training on
    ``training_data`` every test row gets an anomaly score: the RMSE between
    the row and its reconstruction. Scores and labels are written to
    ``file_name``, a picture of the network is saved next to that file and
    ``visualize_and_save`` is called with the scores.

    Parameters
    ----------
    training_data : 2-D array-like
        Data the autoencoder is fitted on (used as input and as target).
    test_data : 2-D array-like
        Data to score; must have as many columns as ``training_data``.
    test_labels
        Stored in the ``is_anomaly`` column next to the scores.
    regularization_strength : float
        Factor of the L2 activity regularizer on every hidden layer.
    file_name : str
        CSV file the anomaly scores are written to.
    epochs : int
        Number of training epochs.
    """
    assert training_data.shape[1] == test_data.shape[1], \
        'training_data and test_data need the same number of columns'

    # Train autoencoder network
    encoding_dim = 2
    model = Sequential()
    data_dim = test_data.shape[1]
    layers = [data_dim]
    hidden_dim = data_dim // 2
    # Input layer and first encoding layer
    model.add(
        Dense(hidden_dim,
              input_dim=data_dim,
              activation='relu',
              activity_regularizer=l2(regularization_strength),
              name='encoding_{}'.format(hidden_dim)))
    layers.append(hidden_dim)

    # Add layers with decreasing size
    hidden_dim = hidden_dim // 2
    while encoding_dim <= hidden_dim:
        model.add(
            Dense(hidden_dim,
                  activation='relu',
                  activity_regularizer=l2(regularization_strength),
                  name='encoding_{}'.format(hidden_dim)))
        layers.append(hidden_dim)
        hidden_dim = hidden_dim // 2

    # Add layers with increasing size
    layers.pop()  # remove smallest element, it is already the bottleneck
    for hidden_dim in sorted(layers):
        model.add(
            Dense(hidden_dim,
                  activation='relu',
                  activity_regularizer=l2(regularization_strength),
                  name='decoding_{}'.format(hidden_dim)))

    # Output layer
    model.add(Dense(data_dim, name='output'))  # Multiple output neurons
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(training_data, training_data, verbose=1, epochs=epochs)

    # Save network structure to png. os.path.join keeps the path relative
    # when file_name has no directory part instead of pointing at '/'.
    dirname = os.path.dirname(file_name)
    fn = os.path.join(dirname, 'auto_encoder_model.png')
    plot_model(model, to_file=fn, show_shapes=True)
    tc.green('Saved model image as {}'.format(fn))

    pred = model.predict(training_data)
    score = np.sqrt(metrics.mean_squared_error(pred, training_data))
    tc.yellow("Training Normal Score (RMSE): {}".format(score))

    pred = model.predict(test_data)
    score = np.sqrt(metrics.mean_squared_error(pred, test_data))
    tc.yellow("Test Normal Score (RMSE): {}".format(score))

    # Predict / create anomaly scores: per-row RMSE, computed from the test
    # prediction above in one step instead of one predict call per row.
    tc.yellow('Generating anomaly scores...')
    squared_errors = np.square(pred - np.asarray(test_data))
    scores = list(np.sqrt(squared_errors.mean(axis=1)))

    # Save scores (anomaly scores)
    df = pd.DataFrame({'anomaly_score': scores, 'is_anomaly': test_labels})
    df.to_csv(file_name, index=False)
    tc.green('Saved file {}'.format(file_name))

    visualize_and_save(scores, test_labels, file_name, regularization_strength)