def test_diff_detector_threshold(n_features_y: int, n_features_x: int):
    """
    Check when the threshold attributes of DiffBasedAnomalyDetector appear.

    The four threshold attributes must be absent right after construction
    and after ``fit``; they must exist once ``cross_validate`` has run.
    """
    threshold_attrs = (
        "feature_thresholds_",
        "aggregate_threshold_",
        "feature_thresholds_per_fold_",
        "aggregate_thresholds_per_fold_",
    )

    X = np.random.random((100, n_features_x))
    y = np.random.random((100, n_features_y))

    regressor = MultiOutputRegressor(estimator=LinearRegression())
    model = DiffBasedAnomalyDetector(base_estimator=regressor)

    # The detector exposes its own cross_validate method.
    assert hasattr(model, "cross_validate")

    # Freshly constructed: no thresholds have been calculated.
    for attr in threshold_attrs:
        assert not hasattr(model, attr)

    model.fit(X, y)

    # Fitting alone does not calculate thresholds either.
    for attr in threshold_attrs:
        assert not hasattr(model, attr)

    # Cross validation is what sets the thresholds.
    model.cross_validate(X=X, y=y)
    for attr in threshold_attrs:
        assert hasattr(model, attr)

    # One non-null feature threshold per target column.
    feature_thresholds = model.feature_thresholds_
    assert isinstance(feature_thresholds, pd.Series)
    assert len(feature_thresholds) == y.shape[1]
    assert all(feature_thresholds.notna())

    assert isinstance(model.feature_thresholds_per_fold_, pd.DataFrame)
    assert isinstance(model.aggregate_thresholds_per_fold_, dict)
def test_diff_detector_require_thresholds(require_threshold: bool):
    """
    With required thresholds, ``anomaly`` must fail until ``cross_validate``
    has been called; without the requirement it works directly after ``fit``.
    """
    X = pd.DataFrame(np.random.random((100, 5)))
    y = pd.DataFrame(np.random.random((100, 2)))

    regressor = MultiOutputRegressor(LinearRegression())
    model = DiffBasedAnomalyDetector(
        base_estimator=regressor,
        require_thresholds=require_threshold,
    )
    model.fit(X, y)

    if not require_threshold:
        # Thresholds are not required, so this call is expected to succeed.
        model.anomaly(X, y)
        return

    # FAIL: .cross_validate has not been called, so no thresholds exist yet.
    with pytest.raises(AttributeError):
        model.anomaly(X, y)

    # After cross validation the same call is expected to go through.
    model.cross_validate(X=X, y=y)
    model.anomaly(X, y)
def test_diff_detector(scaler, index, lookback, with_thresholds: bool):
    """
    Check the anomaly dataframe produced by DiffBasedAnomalyDetector.

    The scaled and unscaled error columns are recomputed here from the base
    prediction dataframe and compared with the output of ``model.anomaly``.
    The confidence columns are expected only when cross validation was run.
    """
    anomaly_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )
    confidence_columns = ("anomaly-confidence", "total-anomaly-confidence")

    # Some dataset.
    X = pd.DataFrame(np.random.random((10, 3)))
    y = pd.DataFrame(np.random.random((10, 3)))

    base_estimator = MultiOutputRegressor(estimator=LinearRegression())
    model = DiffBasedAnomalyDetector(
        base_estimator=base_estimator,
        scaler=scaler,
        require_thresholds=False,
    )
    assert isinstance(model, AnomalyDetectorBase)

    expected_params = {"base_estimator": base_estimator, "scaler": scaler}
    assert model.get_params() == expected_params

    if with_thresholds:
        model.cross_validate(X=X, y=y)

    model.fit(X, y)

    predictions: np.ndarray = model.predict(X)
    base_df = model_utils.make_base_dataframe(
        tags=["A", "B", "C"],
        model_input=X,
        model_output=predictions,
        index=index,
    )

    # The base prediction dataframe carries none of the anomaly columns.
    for column in anomaly_columns:
        assert column not in base_df.columns

    # Run the anomaly detection logic.
    anomaly_df = model.anomaly(X, y, timedelta(days=1))

    # All error columns should have been added now.
    for column in anomaly_columns:
        assert column in anomaly_df.columns

    model_output = base_df["model-output"].values

    # Unscaled: absolute error per tag, mean of squared errors per row.
    tag_error_unscaled = np.abs(model_output - y.values)
    total_error_unscaled = np.square(tag_error_unscaled).mean(axis=1)
    assert np.allclose(
        tag_error_unscaled, anomaly_df["tag-anomaly-unscaled"].values
    )
    assert np.allclose(
        total_error_unscaled, anomaly_df["total-anomaly-unscaled"].values
    )

    # Scaled: same calculation after passing both sides through the scaler.
    tag_error_scaled = np.abs(
        scaler.transform(model_output) - scaler.transform(y)
    )
    total_error_scaled = np.square(tag_error_scaled).mean(axis=1)
    assert np.allclose(
        tag_error_scaled, anomaly_df["tag-anomaly-scaled"].values
    )
    assert np.allclose(
        total_error_scaled, anomaly_df["total-anomaly-scaled"].values
    )

    # Confidence columns are present only if thresholds were calculated.
    for column in confidence_columns:
        if with_thresholds:
            assert column in anomaly_df.columns
        else:
            assert column not in anomaly_df.columns
def test_diff_detector_with_window(
    scaler,
    len_x_y: int,
    time_index: bool,
    lookback: int,
    with_thresholds: bool,
):
    """
    Check DiffBasedAnomalyDetector output when a smoothing window is set.

    Besides the plain error columns, the ``smooth-*`` columns are compared
    against a rolling median (over ``model.window`` rows) of the errors
    recomputed in this test, and the number of leading NA rows is checked.
    """
    raw_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )
    smooth_columns = (
        "smooth-total-anomaly-scaled",
        "smooth-total-anomaly-unscaled",
        "smooth-tag-anomaly-scaled",
        "smooth-tag-anomaly-unscaled",
    )
    all_columns = raw_columns + smooth_columns
    confidence_columns = ("anomaly-confidence", "total-anomaly-confidence")

    # Some dataset.
    X = pd.DataFrame(np.random.random((len_x_y, 3)))
    y = pd.DataFrame(np.random.random((len_x_y, 3)))

    tags = ["A", "B", "C"]
    index = (
        pd.date_range("2019-01-01", "2019-01-11", periods=len_x_y)
        if time_index
        else range(len_x_y)
    )

    base_estimator = MultiOutputRegressor(estimator=LinearRegression())
    model = DiffBasedAnomalyDetector(
        base_estimator=base_estimator,
        scaler=scaler,
        require_thresholds=False,
        window=144,
    )
    assert isinstance(model, AnomalyDetectorBase)

    expected_params = {
        "base_estimator": base_estimator,
        "scaler": scaler,
        "window": 144,
    }
    assert model.get_params() == expected_params

    if with_thresholds:
        model.cross_validate(X=X, y=y)

    model.fit(X, y)

    predictions: np.ndarray = model.predict(X)
    base_df = model_utils.make_base_dataframe(
        tags=tags,
        model_input=X,
        model_output=predictions,
        index=index,
    )

    # The base prediction dataframe carries none of the anomaly columns.
    for column in all_columns:
        assert column not in base_df.columns

    # Run the anomaly detection logic.
    anomaly_df = model.anomaly(X, y)

    # All error columns, raw and smoothed, should have been added now.
    for column in all_columns:
        assert column in anomaly_df.columns

    def rolling_median(values):
        """Rolling median over the model window, incomplete windows dropped."""
        return values.rolling(model.window).median().dropna()

    # Verify calculation for unscaled data
    tag_error_unscaled = pd.DataFrame(
        data=np.abs(base_df["model-output"].to_numpy() - y.to_numpy()),
        index=index,
        columns=tags,
    )
    total_error_unscaled = pd.Series(
        data=np.square(tag_error_unscaled).mean(axis=1)
    )
    assert np.allclose(
        tag_error_unscaled.to_numpy(),
        anomaly_df["tag-anomaly-unscaled"].to_numpy(),
    )
    assert np.allclose(
        total_error_unscaled.to_numpy(),
        anomaly_df["total-anomaly-unscaled"].to_numpy(),
    )

    smooth_tag_error_unscaled = rolling_median(tag_error_unscaled)
    smooth_total_error_unscaled = rolling_median(total_error_unscaled)
    assert np.allclose(
        smooth_tag_error_unscaled.to_numpy(),
        anomaly_df["smooth-tag-anomaly-unscaled"].dropna().to_numpy(),
    )
    assert np.allclose(
        smooth_total_error_unscaled.to_numpy(),
        anomaly_df["smooth-total-anomaly-unscaled"].dropna().to_numpy(),
    )

    # Verify calculations for scaled data
    scaled_output = scaler.transform(base_df["model-output"].to_numpy())
    scaled_target = scaler.transform(y)
    tag_error_scaled = pd.DataFrame(
        data=np.abs(scaled_output - scaled_target),
        index=index,
        columns=tags,
    )
    total_error_scaled = pd.Series(
        data=np.square(tag_error_scaled).mean(axis=1)
    )
    assert np.allclose(
        tag_error_scaled.to_numpy(),
        anomaly_df["tag-anomaly-scaled"].to_numpy(),
    )
    assert np.allclose(
        total_error_scaled,
        anomaly_df["total-anomaly-scaled"].to_numpy(),
    )

    smooth_tag_error_scaled = rolling_median(tag_error_scaled)
    smooth_total_error_scaled = rolling_median(total_error_scaled)
    assert np.allclose(
        smooth_tag_error_scaled.to_numpy(),
        anomaly_df["smooth-tag-anomaly-scaled"].dropna().to_numpy(),
    )
    assert np.allclose(
        smooth_total_error_scaled.to_numpy(),
        anomaly_df["smooth-total-anomaly-scaled"].dropna().to_numpy(),
    )

    # Check number of NA's is consistent with window size
    if len_x_y >= model.window:
        leading_na_rows = model.window - 1
        smooth_tags = anomaly_df["smooth-tag-anomaly-scaled"]
        smooth_total = anomaly_df["smooth-total-anomaly-scaled"]
        assert (
            smooth_tags.isna().sum().sum()
            == leading_na_rows * smooth_tags.shape[1]
        )
        assert smooth_total.isna().sum() == leading_na_rows

    # Confidence columns are present only if thresholds were calculated.
    for column in confidence_columns:
        if with_thresholds:
            assert column in anomaly_df.columns
        else:
            assert column not in anomaly_df.columns