def get_anom(magnetic, axis):
    """
    Axis anomaly detection helper function.

    :param magnetic: (dataframe) dataframe of magnetic data (series: X, Y, Z)
    :param axis: One of the three axes ('X', 'Y', 'Z')
    :return: Data frame containing timestamps, values for the anomalies in
             that axis.
    """
    print("Detecting anomalies for", axis, "axis", end='')
    start = process_time()

    # Preprocessing: copy the two-column slice so renaming its columns does
    # not act on a view of the caller's frame (SettingWithCopyWarning).
    df = magnetic[['Date', axis]].copy()
    df.columns = ["timestamp", "value"]

    # Using pyculiarity to detect anomalies.
    # BUG FIX: pyculiarity's keyword is `max_anoms`, not `maximum_anomalies`
    # (the latter raises TypeError); this matches the other detect_ts calls
    # in this codebase.
    # TODO: mess around with max_anoms and alpha to improve resulting plots
    eq_anom = pyc.detect_ts(df, max_anoms=0.025, direction='pos', alpha=0.05)
    print(" --- took", round(process_time() - start, 2), " s")
    return eq_anom['anoms']
def transform(self, X):
    """
    Run the pyculiarity anomaly detection routine on all columns of a
    dataset, replacing each column with a 0/1 anomaly flag.

    The input is coerced into a pandas DataFrame if it isn't already one.
    If a datetime-string column or an index column was specified at
    construction, it is used as the time index; otherwise a naive integer
    index is used.

    :param X: array-like or DataFrame of series to scan for anomalies.
    :return: DataFrame with each data column replaced by 1 where an anomaly
             was detected and 0 elsewhere.
    """
    if not isinstance(X, DataFrame):
        X = DataFrame(X)

    # Establish the '_index' timestamp column that detect_ts expects.
    if self.datetimestr_col is not None:
        X[self.datetimestr_col] = to_datetime(X[self.datetimestr_col])
        X.rename(columns={self.datetimestr_col: '_index'}, inplace=True)
    elif self.index_col is not None:
        X.rename(columns={self.index_col: '_index'}, inplace=True)
    else:
        X['_index'] = X.index.values

    for col in X.columns.values:
        # BUG FIX: the original tested `col is not '_index'` — string
        # *identity*, which is interning-dependent; use inequality.
        if col != '_index':
            df_col = X.reindex(columns=['_index', col])
            out = detect_ts(df_col,
                            max_anoms=self.max_anoms,
                            alpha=self.alpha,
                            direction=self.direction,
                            only_last=None)
            # Flag anomalous timestamps with 1, everything else with 0.
            X[col] = 0
            X.loc[X['_index'].isin(out['anoms']['timestamp'].values), col] = 1

    # Restore the caller's column naming (or drop the synthetic index).
    if self.datetimestr_col is not None:
        X.rename(columns={'_index': self.datetimestr_col}, inplace=True)
    elif self.index_col is not None:
        X.rename(columns={'_index': self.index_col}, inplace=True)
    else:
        # BUG FIX: drop the helper *column*; without axis=1, `drop` targets
        # row labels and raises KeyError on '_index'.
        X.drop(labels=['_index'], axis=1, inplace=True)

    return X
def test_both_directions_e_value_threshold_med_max(self):
    """Both-direction detection with a med_max threshold and expected
    values should return 3 columns and 4 anomalies.

    BUG FIX: pyculiarity's keyword is ``max_anoms``, not
    ``maximum_anomalies`` (the latter raises TypeError).
    """
    results = detect_ts(self.raw_data,
                        max_anoms=0.02,
                        direction='both',
                        threshold="med_max",
                        e_value=True)
    eq_(len(results['anoms'].columns), 3)
    eq_(len(results['anoms'].iloc[:, 1]), 4)
def test_both_directions_e_value_longterm(self):
    """Both-direction detection in longterm mode with expected values
    should return 3 columns and 131 anomalies.

    BUG FIX: pyculiarity's keywords are ``max_anoms`` and ``longterm``,
    not ``maximum_anomalies`` / ``long_term`` (both raise TypeError).
    """
    results = detect_ts(self.raw_data,
                        max_anoms=0.02,
                        direction='both',
                        longterm=True,
                        plot=False,
                        e_value=True)
    eq_(len(results['anoms'].columns), 3)
    eq_(len(results['anoms'].iloc[:, 1]), 131)
def test_both_directions_with_plot(self):
    """Both-direction detection restricted to the last day should return
    2 columns and 25 anomalies.

    BUG FIX: pyculiarity's keyword is ``max_anoms``, not
    ``maximum_anomalies`` (the latter raises TypeError).
    """
    results = detect_ts(self.raw_data,
                        max_anoms=0.02,
                        direction='both',
                        only_last='day',
                        plot=False)
    eq_(len(results['anoms'].columns), 2)
    eq_(len(results['anoms'].iloc[:, 1]), 25)
Rosner, B., (May 1983), "Percentage Points for a Generalized ESD Many-Outlier Procedure" , Technometrics, 25(2), pp. 165-172. ''' # First prepare data from truncated series my_df = pd.DataFrame({'timestamp':ts.values, 'observation':ts.index}) results = detect_ts(df=my_df, max_anoms=0.1, direction="pos", alpha=0.05, only_last=None, threshold=None, e_value=False, longterm=False, piecewise_median_period_weeks=2, plot=False, y_log=False, xlabel=XLABEL, ylabel=YLABEL, title='Google Trends Data - Twitter + IQR Method', verbose=False) plt.title(KEYWORD + ' - Google Trends Data - Twitter + GES') #plt.subtitle('United States search volume') plt.xlabel(XLABEL) plt.tick_params(axis='x', rotation=-45) plt.ylabel(YLABEL) plt.tight_layout() plt.autoscale()