def test_zscore_zmax():
    """Raising zmax progressively drops the outliers nearest the mean."""
    data = pd.Series([1] * 13 + [6, 10])

    # Default zmax: both trailing values are expected to be flagged.
    default_mask = outliers.zscore(data)
    assert_series_equal(data[-2:], data[default_mask])

    # zmax=3: only the final, most extreme value is expected to be flagged.
    zmax3_mask = outliers.zscore(data, zmax=3)
    assert_series_equal(data[-1:], data[zmax3_mask])

    # zmax=5: no value is expected to be flagged.
    not_flagged = ~outliers.zscore(data, zmax=5)
    assert not_flagged.all()
def test_zscore_all_same():
    """If all data is identical there are no outliers."""
    data = pd.Series([1] * 20)
    expected = pd.Series([False] * 20)
    # Silence floating-point 'invalid' warnings only for the call under test.
    # np.errstate restores whatever settings were active before, even when an
    # exception is raised, so no global NumPy state leaks into other tests.
    # (Presumably constant data has zero spread, which makes the z-score an
    # invalid 0/0 operation -- confirm against the zscore implementation.)
    with np.errstate(invalid='ignore'):
        result = outliers.zscore(data)
    assert_series_equal(expected, result)
def test_zscore_outlier_below():
    """Correctly identifies an outlier below the mean."""
    data = pd.Series([1, 0, -1, 0, 1, -1, -10])
    # Only the last value (-10) is expected to be flagged.
    expected = pd.Series([False] * 6 + [True])
    flagged = outliers.zscore(data)
    assert_series_equal(expected, flagged)
import matplotlib.pyplot as plt
import pandas as pd
import pathlib

# %%
# First, we read in the ac_power_inv_7539_outliers example. Min-max normalized
# AC power is stored in the "value_normalized" column. The boolean column
# "outlier" is True for the inserted outliers and False for every other
# value. These outliers were inserted manually into the data set to
# illustrate outlier detection by each of the functions.
# We use a normalized time series example provided by the PV Fleets
# Initiative. This example is adapted from the DuraMAT DataHub
# clipping data set:
# https://datahub.duramat.org/dataset/inverter-clipping-ml-training-set-real-data

# NOTE(review): `pvanalytics` and `zscore` are not imported in this part of
# the file; presumably they are imported earlier -- confirm.
pvanalytics_dir = pathlib.Path(pvanalytics.__file__).parent
ac_power_file = pvanalytics_dir / 'data' / 'ac_power_inv_7539_outliers.csv'
data = pd.read_csv(ac_power_file, index_col=0, parse_dates=True)
print(data.head(10))

# %%
# We then use :py:func:`pvanalytics.quality.outliers.zscore` to identify
# outliers in the time series, and plot the data with the z-score outlier mask.

normalized_power = data['value_normalized']
zscore_outlier_mask = zscore(data=normalized_power)

# Draw the full series as a line, then overlay the flagged points as markers.
normalized_power.plot()
normalized_power.loc[zscore_outlier_mask].plot(ls='', marker='o')
plt.legend(labels=["AC Power", "Detected Outlier"])
plt.xlabel("Date")
plt.ylabel("Normalized AC Power")
plt.tight_layout()
plt.show()
def test_zscore_omit_nan_input():
    """With nan_policy='omit' a NaN is not flagged and the outlier still is."""
    # np.nan rather than np.NaN: the capitalized alias was removed in
    # NumPy 2.0.
    data = pd.Series([1, 0, -1, 0, np.nan, 1, -1, 10])
    expected = pd.Series(
        [False, False, False, False, False, False, False, True]
    )
    # Call zscore exactly once so the assertion checks the 'omit' result
    # itself, not a second pass over the returned boolean mask.
    result = outliers.zscore(data, nan_policy='omit')
    assert_series_equal(expected, result)
def test_zscore_invalid_nan_policy():
    """An unrecognized nan_policy raises ValueError for input with a NaN."""
    # np.nan rather than np.NaN: the capitalized alias was removed in
    # NumPy 2.0.
    data = pd.Series([1, 0, -1, 0, np.nan, 1, -1, 10])
    with pytest.raises(ValueError):
        outliers.zscore(data, nan_policy='incorrect_str')
def test_zscore_raise_nan_input():
    """With nan_policy='raise', input containing a NaN raises ValueError."""
    # np.nan rather than np.NaN: the capitalized alias was removed in
    # NumPy 2.0.
    data = pd.Series([1, 0, -1, 0, np.nan, 1, -1, 10])
    with pytest.raises(ValueError):
        outliers.zscore(data, nan_policy='raise')