def test_ml_server_dataframe_to_dict_and_back(sensors_str, use_test_project_tags):
    """
    Round-trip a model dataframe through the server's dict serialization.

    The server builds a dataframe from the model's input and output and
    converts it to a dict; the client then rebuilds a dataframe from that
    dict. The rebuilt frame must hold the same values as the original under
    every top level column name (less the second level names).
    """
    if use_test_project_tags:
        # Use the tag names of the test project
        tags = sensors_str
    else:
        # Otherwise make up one upper-case letter per sensor
        tags = [string.ascii_uppercase[pos] for pos, _ in enumerate(sensors_str)]

    # Synthetic model input and output, one column per tag
    n_tags = len(tags)
    original_input = np.random.random((10, n_tags))
    model_output = np.random.random((10, n_tags))

    # Server side: multi index column dataframe -> dict
    df = model_utils.make_base_dataframe(tags, original_input, model_output)
    serialized = server_utils.dataframe_to_dict(df)

    # Client side: dict -> dataframe
    df_clone = server_utils.dataframe_from_dict(serialized)

    # Each group of columns under a top level name must survive the round trip;
    # "start" and "end" are excluded from the numeric comparison.
    for top_lvl_name in df.columns.get_level_values(0):
        if top_lvl_name in ("start", "end"):
            continue
        assert np.allclose(df[top_lvl_name].values, df_clone[top_lvl_name].values)
def test_base_dataframe_creation(dates, tags, target_tag_list, output_offset):
    """
    Check that ``make_base_dataframe`` yields a valid frame for model outputs
    whose length and width may differ from the model input.
    """
    # Number of rows follows the supplied dates, defaulting to 10 without them
    n_rows = len(dates) if dates is not None else 10
    n_tags = len(tags)

    # Model input has one column per tag
    model_input = np.random.random(size=n_rows * n_tags).reshape(-1, n_tags)

    # Model output has one column per target tag, or 20 columns if none given
    n_targets = len(target_tag_list) if target_tag_list else 20
    model_output = np.random.random(size=n_rows * n_targets).reshape(
        (len(model_input), -1)
    )

    # Simulate a model whose output is shorter than its input,
    # e.g. an LSTM which drops the rows covered by its lookback window
    model_output = model_output[output_offset:]

    # Regardless of the length / shape mismatch, a valid dataframe is expected
    df = model_utils.make_base_dataframe(
        tags=tags,
        model_input=model_input,
        model_output=model_output,
        target_tag_list=target_tag_list,
        index=dates,
    )
    n_kept = len(df)

    # The input is trimmed to the trailing rows that have a matching output
    assert np.array_equal(df["model-input"].values, model_input[-n_kept:, :])

    # Model input columns are always labelled by the given tags
    assert df["model-input"].columns.tolist() == tags

    # Model output values are carried over untouched
    assert np.array_equal(df["model-output"].values, model_output[-n_kept:, :])

    # Second level names of the model output:
    output_columns = df["model-output"].columns.tolist()
    if target_tag_list is not None:
        # explicit target tags win
        assert target_tag_list == output_columns
    elif model_output.shape[1] == len(tags):
        # same width as the input -> reuse the input tags
        assert tags == output_columns
    else:
        # anything else -> stringified feature positions
        assert [str(pos) for pos in range(model_output.shape[1])] == output_columns

    # The index follows the supplied dates, or is a plain range without them
    if dates is not None:
        expected_index = dates.values[output_offset:]
    else:
        expected_index = np.arange(0, len(df))
    assert np.array_equal(df.index.values, expected_index)
def anomaly(
    self, X: pd.DataFrame, y: pd.DataFrame, frequency: Optional[timedelta] = None
) -> pd.DataFrame:
    """
    Create an anomaly dataframe from the base provided dataframe.

    Parameters
    ----------
    X: pd.DataFrame
        Dataframe representing the data to go into the model.
    y: pd.DataFrame
        Dataframe representing the target output of the model.
    frequency: Optional[timedelta]
        Passed through unchanged to ``model_utils.make_base_dataframe``.

    Returns
    -------
    pd.DataFrame
        A superset of the original base dataframe with added anomaly specific
        features

    Raises
    ------
    AttributeError
        If ``require_thresholds`` is truthy and the instance has neither
        ``feature_thresholds_`` nor ``aggregate_threshold_``.
    """
    # Get the model output, falling back to transform if 'predict' doesn't exist
    model_output = self.predict(X) if hasattr(self, "predict") else self.transform(X)

    # Create the basic dataframe with 'model-output' & 'model-input'
    data = model_utils.make_base_dataframe(
        tags=X.columns,
        model_input=getattr(X, "values", X),
        model_output=model_output,
        target_tag_list=y.columns,
        index=getattr(X, "index", None),
        frequency=frequency,
    )

    model_out_scaled = pd.DataFrame(
        self.scaler.transform(data["model-output"]),
        columns=data["model-output"].columns,
        index=data.index,
    )

    # Calculate the absolute scaled tag anomaly
    # Ensure to offset the y to match model out, which could be less if it is a LSTM
    scaled_y = self.scaler.transform(y)
    tag_anomaly_scaled = np.abs(model_out_scaled - scaled_y[-len(data):, :])
    tag_anomaly_scaled.columns = pd.MultiIndex.from_product(
        (("tag-anomaly-scaled",), tag_anomaly_scaled.columns)
    )
    data = data.join(tag_anomaly_scaled)

    # Calculate scaled total anomaly: mean of the squared per-tag anomalies
    data["total-anomaly-scaled"] = np.square(data["tag-anomaly-scaled"]).mean(axis=1)

    # Calculate the absolute unscaled tag anomalies
    unscaled_abs_diff = pd.DataFrame(
        data=np.abs(data["model-output"].to_numpy() - y.to_numpy()[-len(data):, :]),
        index=data.index,
        columns=pd.MultiIndex.from_product(
            (("tag-anomaly-unscaled",), y.columns.tolist())
        ),
    )
    data = data.join(unscaled_abs_diff)

    # Calculate the unscaled total anomaly
    data["total-anomaly-unscaled"] = np.square(data["tag-anomaly-unscaled"]).mean(
        axis=1
    )

    # Smoothed variants are only produced when both a window and a method are set
    if self.window is not None and self.smoothing_method is not None:
        # Calculate scaled tag-level smoothed anomaly scores
        smooth_tag_anomaly_scaled = self._smoothing(tag_anomaly_scaled)
        smooth_tag_anomaly_scaled.columns = (
            smooth_tag_anomaly_scaled.columns.set_levels(
                ["smooth-tag-anomaly-scaled"], level=0
            )
        )
        data = data.join(smooth_tag_anomaly_scaled)

        # Calculate scaled smoothed total anomaly score
        data["smooth-total-anomaly-scaled"] = self._smoothing(
            data["total-anomaly-scaled"]
        )

        # Calculate unscaled tag-level smoothed anomaly scores
        smooth_tag_anomaly_unscaled = self._smoothing(unscaled_abs_diff)
        smooth_tag_anomaly_unscaled.columns = (
            smooth_tag_anomaly_unscaled.columns.set_levels(
                ["smooth-tag-anomaly-unscaled"], level=0
            )
        )
        data = data.join(smooth_tag_anomaly_unscaled)

        # Calculate unscaled smoothed total anomaly score
        data["smooth-total-anomaly-unscaled"] = self._smoothing(
            data["total-anomaly-unscaled"]
        )

    # If we have `feature_thresholds_`, then we can calculate per-tag anomaly
    # confidence: the (unsmoothed) scaled tag anomaly divided by its threshold.
    if hasattr(self, "feature_thresholds_"):
        confidence = tag_anomaly_scaled.values / self.feature_thresholds_.values
        anomaly_confidence_scores = pd.DataFrame(
            confidence,
            index=tag_anomaly_scaled.index,
            columns=pd.MultiIndex.from_product(
                (("anomaly-confidence",), data["model-output"].columns)
            ),
        )
        data = data.join(anomaly_confidence_scores)

    # Likewise the total confidence is the scaled total anomaly over its threshold
    if hasattr(self, "aggregate_threshold_"):
        data["total-anomaly-confidence"] = (
            data["total-anomaly-scaled"] / self.aggregate_threshold_
        )

    # Explicitly raise error if we were required to do threshold based calculations
    # which would have required a call to .cross_validate before .anomaly
    if self.require_thresholds and not any(
        hasattr(self, attr) for attr in ("feature_thresholds_", "aggregate_threshold_")
    ):
        # The trailing space after "these" matters: the adjacent f-strings are
        # concatenated verbatim.
        raise AttributeError(
            f"`require_thresholds={self.require_thresholds}` however "
            f"`.cross_validate` needs to be called in order to calculate these "
            f"thresholds before calling `.anomaly`"
        )

    return data
def test_diff_detector(scaler, index, lookback, with_thresholds: bool):
    """
    Test the functionality of the DiffBasedAnomalyDetector
    """
    # Columns which only the anomaly dataframe is expected to carry
    anomaly_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )
    confidence_columns = ("anomaly-confidence", "total-anomaly-confidence")

    # Some dataset.
    X = pd.DataFrame(np.random.random((10, 3)))
    y = pd.DataFrame(np.random.random((10, 3)))

    base_estimator = MultiOutputRegressor(estimator=LinearRegression())
    model = DiffBasedAnomalyDetector(
        base_estimator=base_estimator, scaler=scaler, require_thresholds=False
    )

    assert isinstance(model, AnomalyDetectorBase)
    assert model.get_params() == {"base_estimator": base_estimator, "scaler": scaler}

    if with_thresholds:
        model.cross_validate(X=X, y=y)

    model.fit(X, y)
    output: np.ndarray = model.predict(X)

    base_df = model_utils.make_base_dataframe(
        tags=["A", "B", "C"], model_input=X, model_output=output, index=index
    )

    # The plain prediction dataframe carries no anomaly columns at all
    for column in anomaly_columns:
        assert column not in base_df.columns

    # Run the anomaly detection logic
    anomaly_df = model.anomaly(X, y, timedelta(days=1))

    # Now every error column must be there
    for column in anomaly_columns:
        assert column in anomaly_df.columns

    # Verify calculation for unscaled data
    predicted = base_df["model-output"].values
    feature_error_unscaled = np.abs(predicted - y.values)
    total_anomaly_unscaled = np.square(feature_error_unscaled).mean(axis=1)
    assert np.allclose(
        feature_error_unscaled, anomaly_df["tag-anomaly-unscaled"].values
    )
    assert np.allclose(
        total_anomaly_unscaled, anomaly_df["total-anomaly-unscaled"].values
    )

    # Verify calculations for scaled data
    feature_error_scaled = np.abs(scaler.transform(predicted) - scaler.transform(y))
    total_anomaly_scaled = np.square(feature_error_scaled).mean(axis=1)
    assert np.allclose(feature_error_scaled, anomaly_df["tag-anomaly-scaled"].values)
    assert np.allclose(total_anomaly_scaled, anomaly_df["total-anomaly-scaled"].values)

    # Confidence columns exist exactly when thresholds were calculated
    for column in confidence_columns:
        if with_thresholds:
            assert column in anomaly_df.columns
        else:
            assert column not in anomaly_df.columns
def post(self):
    """
    Process a POST request by using provided user data

    The model output for ``g.X`` is computed, combined with the input into a
    base dataframe and returned either as parquet bytes (when the request has
    ``format=parquet``) or as JSON under the ``data`` key. Any failure while
    computing the model output yields a 400 response with an ``error`` key.

    A typical response might look like this

    .. code-block:: python

        {
            'data': [
                {
                    'end': ['2016-01-01T00:10:00+00:00'],
                    'model-output': [0.0005317790200933814,
                                     -0.0001525811239844188,
                                     0.0008310950361192226,
                                     0.0015755111817270517],
                    'original-input': [0.9135588550070414,
                                       0.3472517774179448,
                                       0.8994921857179736,
                                       0.11982773108991263],
                    'start': ['2016-01-01T00:00:00+00:00'],
                },
                ...
            ],
            'tags': [
                {'asset': None, 'name': 'tag-0'},
                {'asset': None, 'name': 'tag-1'},
                {'asset': None, 'name': 'tag-2'},
                {'asset': None, 'name': 'tag-3'}
            ],
            'time-seconds': '0.1937'
        }

    NOTE(review): this method itself only sets ``data`` (or ``error``) in the
    response; ``tags`` and ``time-seconds`` are presumably added elsewhere, and
    the ``original-input`` key may be outdated - confirm against the actual
    response.
    """
    context: typing.Dict[typing.Any, typing.Any] = dict()
    X = g.X
    process_request_start_time_s = timeit.default_timer()

    try:
        output = model_io.get_model_output(model=g.model, X=X)
    except ValueError as err:
        tb = traceback.format_exc()
        logger.error(
            f"Failed to predict or transform; error: {err} - \nTraceback: {tb}"
        )
        context["error"] = f"ValueError: {str(err)}"
        return make_response((jsonify(context), 400))
    # Model may only be a transformer, probably an AttributeError, but catch all to avoid logging other
    # exceptions twice if it happens.
    except Exception as exc:
        tb = traceback.format_exc()
        logger.error(
            f"Failed to predict or transform; error: {exc} - \nTraceback: {tb}"
        )
        context["error"] = "Something unexpected happened; check your input data"
        return make_response((jsonify(context), 400))
    else:
        get_model_output_time_s = timeit.default_timer()
        logger.debug(
            f"Calculating model output took "
            f"{get_model_output_time_s-process_request_start_time_s} s"
        )

    # X is not guaranteed to be a DataFrame; only a DataFrame contributes
    # its values and its index, anything else is passed as-is with no index.
    x_is_frame = isinstance(X, pd.DataFrame)
    data = model_utils.make_base_dataframe(
        tags=self.tags,
        model_input=X.values if x_is_frame else X,
        model_output=output,
        target_tag_list=self.target_tags,
        index=X.index if x_is_frame else None,
    )

    if request.args.get("format") == "parquet":
        return send_file(
            io.BytesIO(server_utils.dataframe_into_parquet_bytes(data)),
            mimetype="application/octet-stream",
        )

    context["data"] = server_utils.dataframe_to_dict(data)
    return make_response((jsonify(context), context.pop("status-code", 200)))
def test_diff_detector_with_window(
    scaler,
    len_x_y: int,
    time_index: bool,
    with_thresholds: bool,
    shuffle: bool,
    window,
    smoothing_method,
):
    """
    Test the functionality of the DiffBasedAnomalyDetector with window

    Covers the parameters reported by ``get_params``, the presence or absence
    of the (smoothed) anomaly columns, the values of those columns against a
    reference calculation, the number of NA's introduced by rolling windows
    and the threshold based confidence columns.
    """
    anomaly_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )
    smooth_columns = (
        "smooth-total-anomaly-scaled",
        "smooth-total-anomaly-unscaled",
        "smooth-tag-anomaly-scaled",
        "smooth-tag-anomaly-unscaled",
    )

    # Some dataset.
    X, y = (
        pd.DataFrame(np.random.random((len_x_y, 3))),
        pd.DataFrame(np.random.random((len_x_y, 3))),
    )

    tags = ["A", "B", "C"]
    if time_index:
        index = pd.date_range("2019-01-01", "2019-01-11", periods=len_x_y)
    else:
        index = range(len_x_y)

    base_estimator = MultiOutputRegressor(estimator=LinearRegression())
    model = DiffBasedAnomalyDetector(
        base_estimator=base_estimator,
        scaler=scaler,
        require_thresholds=with_thresholds,
        shuffle=shuffle,
        window=window,
        smoothing_method=smoothing_method,
    )

    assert isinstance(model, AnomalyDetectorBase)

    # Window related parameters are only reported when a window is set, and
    # a missing smoothing method is then reported as "smm".
    expected_params = dict(
        base_estimator=base_estimator, scaler=scaler, shuffle=shuffle
    )
    if window is not None:
        expected_params.update(
            window=window,
            smoothing_method="smm" if smoothing_method is None else smoothing_method,
        )
    assert model.get_params() == expected_params

    def expected_smoothing(values):
        """Reference smoothing of a Series / DataFrame for the method under test."""
        if smoothing_method is None or smoothing_method == "smm":
            return values.rolling(model.window).median().dropna()
        if smoothing_method == "sma":
            return values.rolling(model.window).mean().dropna()
        if smoothing_method == "ewma":
            return values.ewm(span=model.window).mean()
        raise ValueError(f"Unexpected smoothing method: {smoothing_method!r}")

    if with_thresholds:
        model.cross_validate(X=X, y=y)

    model.fit(X, y)

    output: np.ndarray = model.predict(X)
    base_df = model_utils.make_base_dataframe(
        tags=tags, model_input=X, model_output=output, index=index
    )

    # Base prediction dataframe has none of these columns
    assert not any(
        col in base_df.columns for col in anomaly_columns + smooth_columns
    )

    # Apply the anomaly detection logic on the base prediction df
    anomaly_df = model.anomaly(X, y)

    # Should have these added error calculated columns now.
    assert all(col in anomaly_df.columns for col in anomaly_columns)
    if window is not None:
        assert all(col in anomaly_df.columns for col in smooth_columns)
    else:
        # Without a window no smoothing may have been applied. This has to be
        # checked on the anomaly dataframe; the base dataframe was already
        # verified above and says nothing about what `.anomaly` added.
        assert not any(col in anomaly_df.columns for col in smooth_columns)

    # Verify calculation for unscaled data
    feature_error_unscaled = pd.DataFrame(
        data=np.abs(base_df["model-output"].to_numpy() - y.to_numpy()),
        index=index,
        columns=tags,
    )
    total_anomaly_unscaled = pd.Series(
        data=np.square(feature_error_unscaled).mean(axis=1)
    )
    assert np.allclose(
        feature_error_unscaled.to_numpy(),
        anomaly_df["tag-anomaly-unscaled"].to_numpy(),
    )
    assert np.allclose(
        total_anomaly_unscaled.to_numpy(),
        anomaly_df["total-anomaly-unscaled"].to_numpy(),
    )

    if window is not None:
        smooth_feature_error_unscaled = expected_smoothing(feature_error_unscaled)
        smooth_total_anomaly_unscaled = expected_smoothing(total_anomaly_unscaled)
        assert np.allclose(
            smooth_feature_error_unscaled.to_numpy(),
            anomaly_df["smooth-tag-anomaly-unscaled"].dropna().to_numpy(),
        )
        assert np.allclose(
            smooth_total_anomaly_unscaled.to_numpy(),
            anomaly_df["smooth-total-anomaly-unscaled"].dropna().to_numpy(),
        )

    # Verify calculations for scaled data
    feature_error_scaled = pd.DataFrame(
        data=np.abs(
            scaler.transform(base_df["model-output"].to_numpy())
            - scaler.transform(y)
        ),
        index=index,
        columns=tags,
    )
    total_anomaly_scaled = pd.Series(
        data=np.square(feature_error_scaled).mean(axis=1)
    )
    assert np.allclose(
        feature_error_scaled.to_numpy(), anomaly_df["tag-anomaly-scaled"].to_numpy()
    )
    assert np.allclose(
        total_anomaly_scaled, anomaly_df["total-anomaly-scaled"].to_numpy()
    )

    if window is not None:
        smooth_feature_error_scaled = expected_smoothing(feature_error_scaled)
        smooth_total_anomaly_scaled = expected_smoothing(total_anomaly_scaled)
        assert np.allclose(
            smooth_feature_error_scaled.to_numpy(),
            anomaly_df["smooth-tag-anomaly-scaled"].dropna().to_numpy(),
        )
        assert np.allclose(
            smooth_total_anomaly_scaled.to_numpy(),
            anomaly_df["smooth-total-anomaly-scaled"].dropna().to_numpy(),
        )

    # Check number of NA's is consistent with window size: a rolling window
    # leaves `window - 1` leading NA's per column, ewma leaves none.
    if (
        smoothing_method != "ewma"
        and model.window is not None
        and len_x_y >= model.window
    ):
        assert (
            anomaly_df["smooth-tag-anomaly-scaled"].isna().sum().sum()
            == (model.window - 1) * anomaly_df["smooth-tag-anomaly-scaled"].shape[1]
        )
        assert (
            anomaly_df["smooth-total-anomaly-scaled"].isna().sum()
            == model.window - 1
        )

    if with_thresholds:
        assert "anomaly-confidence" in anomaly_df.columns
        assert "total-anomaly-confidence" in anomaly_df.columns
        assert anomaly_df["anomaly-confidence"].notnull().to_numpy().all()
        assert anomaly_df["total-anomaly-confidence"].notnull().to_numpy().all()
    else:
        assert "anomaly-confidence" not in anomaly_df.columns
        assert "total-anomaly-confidence" not in anomaly_df.columns
def test_diff_kfcv_detector(
    scaler,
    index,
    with_thresholds: bool,
    shuffle: bool,
    window: int,
    smoothing_method: str,
    threshold_percentile: float,
):
    """
    Test the functionality of the DiffBasedKFCVAnomalyDetector
    """
    # Columns which only the anomaly dataframe is expected to carry
    anomaly_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )

    # Some dataset.
    X = pd.DataFrame(np.random.random((300, 3)))
    y = pd.DataFrame(np.random.random((300, 3)))

    base_estimator = MultiOutputRegressor(estimator=LinearRegression())
    model = DiffBasedKFCVAnomalyDetector(
        base_estimator=base_estimator,
        scaler=scaler,
        require_thresholds=with_thresholds,
        shuffle=shuffle,
        window=window,
        smoothing_method=smoothing_method,
        threshold_percentile=threshold_percentile,
    )

    assert isinstance(model, AnomalyDetectorBase)

    expected_params = {
        "base_estimator": base_estimator,
        "scaler": scaler,
        "window": window,
        "smoothing_method": smoothing_method,
        "shuffle": shuffle,
        "threshold_percentile": threshold_percentile,
    }
    assert model.get_params() == expected_params

    if with_thresholds:
        model.cross_validate(X=X, y=y)

    model.fit(X, y)
    output: np.ndarray = model.predict(X)

    base_df = model_utils.make_base_dataframe(
        tags=["A", "B", "C"], model_input=X, model_output=output, index=index
    )

    # The plain prediction dataframe carries no anomaly columns at all
    for column in anomaly_columns:
        assert column not in base_df.columns

    # Run the anomaly detection logic
    anomaly_df = model.anomaly(X, y, timedelta(days=1))

    # Now every error column must be there
    for column in anomaly_columns:
        assert column in anomaly_df.columns

    # Verify calculation for unscaled data
    predicted = base_df["model-output"].values
    feature_error_unscaled = np.abs(predicted - y.values)
    total_anomaly_unscaled = np.square(feature_error_unscaled).mean(axis=1)
    assert np.allclose(
        feature_error_unscaled, anomaly_df["tag-anomaly-unscaled"].values
    )
    assert np.allclose(
        total_anomaly_unscaled, anomaly_df["total-anomaly-unscaled"].values
    )

    # Verify calculations for scaled data
    feature_error_scaled = np.abs(scaler.transform(predicted) - scaler.transform(y))
    total_anomaly_scaled = np.square(feature_error_scaled).mean(axis=1)
    assert np.allclose(feature_error_scaled, anomaly_df["tag-anomaly-scaled"].values)
    assert np.allclose(total_anomaly_scaled, anomaly_df["total-anomaly-scaled"].values)

    if not with_thresholds:
        # No thresholds -> no confidence columns
        assert "anomaly-confidence" not in anomaly_df.columns
        assert "total-anomaly-confidence" not in anomaly_df.columns
    else:
        # Thresholds must be reported as real numbers in the metadata
        metadata = model.get_metadata()
        assert not any(np.isnan(metadata["feature-thresholds"]))
        assert not np.isnan(metadata["aggregate-threshold"])

        # ... and result in fully populated confidence columns
        assert "anomaly-confidence" in anomaly_df.columns
        assert "total-anomaly-confidence" in anomaly_df.columns
        assert anomaly_df["anomaly-confidence"].notnull().to_numpy().all()
        assert anomaly_df["total-anomaly-confidence"].notnull().to_numpy().all()