def test_hampel_scale():
    """A non-default ``scale`` changes the resulting outlier mask."""
    np.random.seed(1000)
    data = pd.Series(np.random.uniform(-1, 1, size=100))
    # Plant three spikes well outside the [-1, 1) range of the noise.
    for position, spike in ((20, -25), (40, 15), (60, 5)):
        data.iloc[position] = spike
    default_mask = outliers.hampel(data)
    scaled_mask = outliers.hampel(data, scale=0.1)
    # The two masks must disagree in at least one position.
    assert not (default_mask == scaled_mask).all()
def test_hampel_max_deviation():
    """Raising max_deviation leaves fewer values flagged as outliers."""
    np.random.seed(1000)
    data = pd.Series(np.random.uniform(-1, 1, size=100))
    # Plant three spikes of decreasing magnitude in the noise.
    for position, spike in ((20, -25), (40, 15), (60, 5)):
        data.iloc[position] = spike
    # Each case pairs the extra keyword arguments passed to hampel with
    # the positions that are expected to be flagged for that call.
    cases = (
        ({}, [20, 40, 60]),
        ({'max_deviation': 10}, [20, 40]),
        ({'max_deviation': 12}, [20]),
    )
    for extra_kwargs, flagged in cases:
        expected = pd.Series(False, index=data.index)
        expected.iloc[flagged] = True
        detected = outliers.hampel(data, window=11, **extra_kwargs)
        assert_series_equal(data[detected], data[expected])
def test_hampel_all_same():
    """outliers.hampel flags nothing when every value is identical."""
    index = range(50)
    constant = pd.Series(1, index=index)
    no_outliers = pd.Series(False, index=index)
    assert_series_equal(outliers.hampel(constant), no_outliers)
def test_hampel_one_outlier():
    """A single spike in uniform noise is the only value flagged.

    The data is uniform random noise on [0, 1) with one value replaced
    by 10; outliers.hampel with ``window=11`` should flag exactly that
    position.
    """
    spike_position = 20
    np.random.seed(1000)
    data = pd.Series(np.random.uniform(0, 1, size=50))
    data.iloc[spike_position] = 10
    # Boolean mask that is True only at the planted spike.
    expected = pd.Series(
        [position == spike_position for position in range(len(data))],
        index=data.index,
    )
    assert_series_equal(outliers.hampel(data, window=11), expected)
import pathlib

# %%
# First, we read in the ac_power_inv_7539_outliers example. Min-max
# normalized AC power is stored in the "value_normalized" column. The
# boolean column "outlier" labels the inserted outliers as True and all
# other values as False. These outlier values were inserted manually into
# the data set to illustrate outlier detection by each of the functions.
# We use a normalized time series example provided by the PV Fleets
# Initiative. This example is adapted from the DuraMAT DataHub
# clipping data set:
# https://datahub.duramat.org/dataset/inverter-clipping-ml-training-set-real-data
pvanalytics_dir = pathlib.Path(pvanalytics.__file__).parent
ac_power_file_1 = pvanalytics_dir.joinpath(
    'data', 'ac_power_inv_7539_outliers.csv'
)
data = pd.read_csv(ac_power_file_1, index_col=0, parse_dates=True)
print(data.head(10))

# %%
# We then use :py:func:`pvanalytics.quality.outliers.hampel` to identify
# outliers in the time series, and plot the data with the hampel outlier
# mask overlaid as markers.
normalized_power = data['value_normalized']
hampel_outlier_mask = hampel(data=normalized_power, window=10)
normalized_power.plot()
# Draw only the flagged points, as markers with no connecting line.
normalized_power[hampel_outlier_mask].plot(ls='', marker='o')
plt.legend(labels=["AC Power", "Detected Outlier"])
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()