Пример #1
0
def test_ml_server_dataframe_to_dict_and_back(sensors_str, use_test_project_tags):
    """
    Round-trip a base dataframe through the server's dict serialization.

    A dataframe is built with ``make_base_dataframe``, serialized with
    ``dataframe_to_dict`` and rebuilt with ``dataframe_from_dict``. The values
    found under every top level column name except "start" and "end" must be
    (numerically) equal before and after the round trip.
    """
    # Use the test project's tag names, or else one uppercase letter per sensor
    tags = (
        sensors_str
        if use_test_project_tags
        else [string.ascii_uppercase[i] for i, _ in enumerate(sensors_str)]
    )

    # Synthetic model input and output, one column per tag
    n_tags = len(tags)
    original_input = np.random.random((10, n_tags))
    model_output = np.random.random((10, n_tags))

    # Dataframe with multi index columns, as the server would create it
    df = model_utils.make_base_dataframe(tags, original_input, model_output)

    # Server side serialization followed by client side reconstruction
    df_clone = server_utils.dataframe_from_dict(server_utils.dataframe_to_dict(df))

    # Compare the block of columns found under each top level name
    for top_lvl_name in df.columns.get_level_values(0):
        if top_lvl_name in ("start", "end"):
            continue
        assert np.allclose(df[top_lvl_name].values, df_clone[top_lvl_name].values)
Пример #2
0
def test_base_dataframe_creation(dates, tags, target_tag_list, output_offset):
    """
    Check ``make_base_dataframe`` for different combinations of index, tags,
    target tags and model output lengths.

    The model output is shortened by ``output_offset`` rows to simulate a model
    (e.g. an LSTM) whose output is shorter than its input; the resulting
    dataframe must still line up with the tail of the input and output arrays.
    """
    # Ten rows are used when no dates are supplied
    n_rows = len(dates) if dates is not None else 10
    n_tags = len(tags)

    # Model input: one column per tag
    model_input = np.random.random(size=n_rows * n_tags).reshape(-1, n_tags)

    # Model output: one column per target tag, or 20 columns without target tags
    n_targets = len(target_tag_list) if target_tag_list else 20
    model_output = np.random.random(size=n_rows * n_targets).reshape(
        (len(model_input), -1)
    )

    # Drop the leading rows so the output may be shorter than the input
    model_output = model_output[output_offset:]

    # Arrays of differing lengths / shapes should still give a valid dataframe
    df = model_utils.make_base_dataframe(
        tags=tags,
        model_input=model_input,
        model_output=model_output,
        target_tag_list=target_tag_list,
        index=dates,
    )
    n_kept = len(df)

    # The input is expected to be trimmed to the trailing rows of the frame
    assert np.array_equal(df["model-input"].values, model_input[-n_kept:, :])

    # Model input columns are always labelled by the given tags
    assert df["model-input"].columns.tolist() == tags

    # Model output values must match the trailing rows as well
    assert np.array_equal(df["model-output"].values, model_output[-n_kept:, :])

    # Expected second level names of the model output:
    #   1. the target tags, when they are defined
    #   2. the tags, when the output has as many features as there are tags
    #   3. otherwise the stringified feature positions
    if target_tag_list is not None:
        expected_names = target_tag_list
    elif model_output.shape[1] == len(tags):
        expected_names = tags
    else:
        expected_names = [str(i) for i in range(model_output.shape[1])]
    assert expected_names == df["model-output"].columns.tolist()

    # The index is the (offset) dates when supplied, else a simple range
    if dates is not None:
        expected_index = dates.values[output_offset:]
    else:
        expected_index = np.arange(0, n_kept)
    assert np.array_equal(df.index.values, expected_index)
Пример #3
0
    def anomaly(self,
                X: pd.DataFrame,
                y: pd.DataFrame,
                frequency: Optional[timedelta] = None) -> pd.DataFrame:
        """
        Create an anomaly dataframe from the base provided dataframe.

        Parameters
        ----------
        X: pd.DataFrame
            Dataframe representing the data to go into the model.
        y: pd.DataFrame
            Dataframe representing the target output of the model.
        frequency: Optional[timedelta]
            Passed on unchanged to ``model_utils.make_base_dataframe``.

        Returns
        -------
        pd.DataFrame
            A superset of the original base dataframe with added anomaly specific
            features

        Raises
        ------
        AttributeError
            If ``require_thresholds`` is set but neither ``feature_thresholds_``
            nor ``aggregate_threshold_`` exists on the instance.
        """

        # Get the model output, falling back to transform if 'predict' doesn't exist
        model_output = (self.predict(X)
                        if hasattr(self, "predict") else self.transform(X))

        # Create the basic dataframe with 'model-output' & 'model-input'
        data = model_utils.make_base_dataframe(
            tags=X.columns,
            model_input=getattr(X, "values", X),
            model_output=model_output,
            target_tag_list=y.columns,
            index=getattr(X, "index", None),
            frequency=frequency,
        )

        model_out_scaled = pd.DataFrame(
            self.scaler.transform(data["model-output"]),
            columns=data["model-output"].columns,
            index=data.index,
        )

        # Calculate the absolute scaled tag anomaly
        # Ensure to offset the y to match model out, which could be less if it is a LSTM
        scaled_y = self.scaler.transform(y)
        tag_anomaly_scaled = np.abs(model_out_scaled -
                                    scaled_y[-len(data):, :])
        tag_anomaly_scaled.columns = pd.MultiIndex.from_product(
            (("tag-anomaly-scaled", ), tag_anomaly_scaled.columns))
        data = data.join(tag_anomaly_scaled)

        # Calculate scaled total anomaly (mean of the squared tag anomalies)
        data["total-anomaly-scaled"] = np.square(
            data["tag-anomaly-scaled"]).mean(axis=1)

        # Calculate the absolute unscaled tag anomalies
        unscaled_abs_diff = pd.DataFrame(
            data=np.abs(data["model-output"].to_numpy() -
                        y.to_numpy()[-len(data):, :]),
            index=data.index,
            columns=pd.MultiIndex.from_product(
                (("tag-anomaly-unscaled", ), y.columns.tolist())),
        )
        data = data.join(unscaled_abs_diff)

        # Calculate the unscaled total anomaly
        data["total-anomaly-unscaled"] = np.square(
            data["tag-anomaly-unscaled"]).mean(axis=1)

        if self.window is not None and self.smoothing_method is not None:
            # Calculate scaled tag-level smoothed anomaly scores
            smooth_tag_anomaly_scaled = self._smoothing(tag_anomaly_scaled)
            smooth_tag_anomaly_scaled.columns = smooth_tag_anomaly_scaled.columns.set_levels(
                ["smooth-tag-anomaly-scaled"], level=0)
            data = data.join(smooth_tag_anomaly_scaled)

            # Calculate scaled smoothed total anomaly score
            data["smooth-total-anomaly-scaled"] = self._smoothing(
                data["total-anomaly-scaled"])

            # Calculate unscaled tag-level smoothed anomaly scores
            smooth_tag_anomaly_unscaled = self._smoothing(unscaled_abs_diff)
            smooth_tag_anomaly_unscaled.columns = smooth_tag_anomaly_unscaled.columns.set_levels(
                ["smooth-tag-anomaly-unscaled"], level=0)
            data = data.join(smooth_tag_anomaly_unscaled)

            # Calculate unscaled smoothed total anomaly score
            data["smooth-total-anomaly-unscaled"] = self._smoothing(
                data["total-anomaly-unscaled"])

        # If we have per-feature thresholds, then we can calculate anomaly confidence
        if hasattr(self, "feature_thresholds_"):
            # Ratio of the (unsmoothed) scaled tag anomaly to its threshold
            anomaly_confidence_scores = pd.DataFrame(
                tag_anomaly_scaled.values / self.feature_thresholds_.values,
                index=tag_anomaly_scaled.index,
                columns=pd.MultiIndex.from_product(
                    (("anomaly-confidence", ), data["model-output"].columns)),
            )
            data = data.join(anomaly_confidence_scores)

        # Likewise for the aggregate threshold and the total scaled anomaly
        if hasattr(self, "aggregate_threshold_"):
            data["total-anomaly-confidence"] = (data["total-anomaly-scaled"] /
                                                self.aggregate_threshold_)

        # Explicitly raise error if we were required to do threshold based calculations
        # which would have required a call to .cross_validate before .anomaly
        if self.require_thresholds and not any(
                hasattr(self, attr)
                for attr in ("feature_thresholds_", "aggregate_threshold_")):
            raise AttributeError(
                f"`require_thresholds={self.require_thresholds}` however "
                "`.cross_validate` needs to be called in order to calculate these "
                "thresholds before calling `.anomaly`")

        return data
Пример #4
0
def test_diff_detector(scaler, index, lookback, with_thresholds: bool):
    """
    Exercise DiffBasedAnomalyDetector end to end: parameters, fitting, and the
    scaled / unscaled anomaly columns produced by ``.anomaly``.
    """
    # Columns which only the anomaly dataframe is expected to contain
    anomaly_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )

    # Some dataset.
    X = pd.DataFrame(np.random.random((10, 3)))
    y = pd.DataFrame(np.random.random((10, 3)))

    base_estimator = MultiOutputRegressor(estimator=LinearRegression())
    model = DiffBasedAnomalyDetector(
        base_estimator=base_estimator,
        scaler=scaler,
        require_thresholds=False,
    )

    assert isinstance(model, AnomalyDetectorBase)
    assert model.get_params() == {
        "base_estimator": base_estimator,
        "scaler": scaler,
    }

    # Thresholds are only available after cross validation
    if with_thresholds:
        model.cross_validate(X=X, y=y)

    model.fit(X, y)

    output: np.ndarray = model.predict(X)
    base_df = model_utils.make_base_dataframe(
        tags=["A", "B", "C"],
        model_input=X,
        model_output=output,
        index=index,
    )

    # The plain prediction dataframe carries no anomaly columns
    assert all(col not in base_df.columns for col in anomaly_columns)

    # The anomaly dataframe adds the error calculated columns
    anomaly_df = model.anomaly(X, y, timedelta(days=1))
    assert all(col in anomaly_df.columns for col in anomaly_columns)

    # Verify calculation for unscaled data
    predicted = base_df["model-output"].values
    feature_error_unscaled = np.abs(predicted - y.values)
    total_anomaly_unscaled = np.square(feature_error_unscaled).mean(axis=1)
    assert np.allclose(feature_error_unscaled,
                       anomaly_df["tag-anomaly-unscaled"].values)
    assert np.allclose(total_anomaly_unscaled,
                       anomaly_df["total-anomaly-unscaled"].values)

    # Verify calculations for scaled data
    feature_error_scaled = np.abs(
        scaler.transform(predicted) - scaler.transform(y))
    total_anomaly_scaled = np.square(feature_error_scaled).mean(axis=1)
    assert np.allclose(feature_error_scaled,
                       anomaly_df["tag-anomaly-scaled"].values)
    assert np.allclose(total_anomaly_scaled,
                       anomaly_df["total-anomaly-scaled"].values)

    # Confidence columns exist exactly when thresholds were calculated
    for confidence_column in ("anomaly-confidence", "total-anomaly-confidence"):
        if with_thresholds:
            assert confidence_column in anomaly_df.columns
        else:
            assert confidence_column not in anomaly_df.columns
Пример #5
0
    def post(self):
        """
        Process a POST request by using provided user data

        The model output for ``g.X`` is calculated and combined with the input
        into a base dataframe, which is returned either as parquet bytes (when
        the request has ``format=parquet``) or as JSON under the ``data`` key.
        If calculating the model output fails, a 400 response with an ``error``
        key is returned instead.

        A typical response might look like this

        .. code-block:: python

            {
                'data': [
                    {
                        'end': ['2016-01-01T00:10:00+00:00'],
                        'model-output': [0.0005317790200933814,
                                         -0.0001525811239844188,
                                         0.0008310950361192226,
                                         0.0015755111817270517],
                        'original-input': [0.9135588550070414,
                                           0.3472517774179448,
                                           0.8994921857179736,
                                           0.11982773108991263],
                        'start': ['2016-01-01T00:00:00+00:00'],
                    },
                    ...
                ],

                'tags': [
                    {'asset': None, 'name': 'tag-0'},
                    {'asset': None, 'name': 'tag-1'},
                    {'asset': None, 'name': 'tag-2'},
                    {'asset': None, 'name': 'tag-3'}
                ],
                'time-seconds': '0.1937'
            }

        NOTE(review): this method itself only sets ``data`` / ``error`` in the
        JSON body; the other keys and the exact column names in the example
        are presumably produced elsewhere and may be out of date - confirm.
        """
        context: typing.Dict[typing.Any, typing.Any] = dict()
        X = g.X
        process_request_start_time_s = timeit.default_timer()

        try:
            output = model_io.get_model_output(model=g.model, X=X)
        except ValueError as err:
            tb = traceback.format_exc()
            logger.error(
                f"Failed to predict or transform; error: {err} - \nTraceback: {tb}"
            )
            context["error"] = f"ValueError: {str(err)}"
            return make_response((jsonify(context), 400))

        # Model may only be a transformer, probably an AttributeError, but catch all to avoid logging other
        # exceptions twice if it happens.
        except Exception as exc:
            tb = traceback.format_exc()
            logger.error(
                f"Failed to predict or transform; error: {exc} - \nTraceback: {tb}"
            )
            context[
                "error"] = "Something unexpected happened; check your input data"
            return make_response((jsonify(context), 400))

        # Both handlers above return, so from here on the output is available
        get_model_output_time_s = timeit.default_timer()
        logger.debug(
            f"Calculating model output took "
            f"{get_model_output_time_s-process_request_start_time_s} s")

        data = model_utils.make_base_dataframe(
            tags=self.tags,
            model_input=X.values if isinstance(X, pd.DataFrame) else X,
            model_output=output,
            target_tag_list=self.target_tags,
            # X is allowed to be something other than a DataFrame (see
            # model_input above), in which case it may not have an index.
            index=getattr(X, "index", None),
        )

        if request.args.get("format") == "parquet":
            return send_file(
                io.BytesIO(server_utils.dataframe_into_parquet_bytes(data)),
                mimetype="application/octet-stream",
            )

        context["data"] = server_utils.dataframe_to_dict(data)
        return make_response(
            (jsonify(context), context.pop("status-code", 200)))
Пример #6
0
def test_diff_detector_with_window(
    scaler,
    len_x_y: int,
    time_index: bool,
    with_thresholds: bool,
    shuffle: bool,
    window,
    smoothing_method,
):
    """
    Test the functionality of the DiffBasedAnomalyDetector with window

    Besides the plain scaled / unscaled anomaly columns, this verifies the
    smoothed columns against an independent pandas calculation: rolling median
    ("smm", also used when no method is given), rolling mean ("sma") or
    exponentially weighted mean ("ewma").
    """
    base_anomaly_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )
    smooth_anomaly_columns = (
        "smooth-total-anomaly-scaled",
        "smooth-total-anomaly-unscaled",
        "smooth-tag-anomaly-scaled",
        "smooth-tag-anomaly-unscaled",
    )

    # Some dataset.
    X, y = (
        pd.DataFrame(np.random.random((len_x_y, 3))),
        pd.DataFrame(np.random.random((len_x_y, 3))),
    )
    tags = ["A", "B", "C"]
    if time_index:
        index = pd.date_range("2019-01-01", "2019-01-11", periods=len_x_y)
    else:
        index = range(len_x_y)

    base_estimator = MultiOutputRegressor(estimator=LinearRegression())
    model = DiffBasedAnomalyDetector(
        base_estimator=base_estimator,
        scaler=scaler,
        require_thresholds=with_thresholds,
        shuffle=shuffle,
        window=window,
        smoothing_method=smoothing_method,
    )

    assert isinstance(model, AnomalyDetectorBase)

    # Window related parameters are only reported when a window is given, and
    # the smoothing method is then expected to default to "smm".
    expected_params = dict(
        base_estimator=base_estimator,
        scaler=scaler,
        shuffle=shuffle,
    )
    if window is not None:
        expected_params.update(
            window=window,
            smoothing_method="smm"
            if smoothing_method is None else smoothing_method,
        )
    assert model.get_params() == expected_params

    if with_thresholds:
        model.cross_validate(X=X, y=y)

    model.fit(X, y)

    output: np.ndarray = model.predict(X)
    base_df = model_utils.make_base_dataframe(tags=tags,
                                              model_input=X,
                                              model_output=output,
                                              index=index)

    # Base prediction dataframe has none of these columns
    assert not any(col in base_df.columns
                   for col in base_anomaly_columns + smooth_anomaly_columns)

    # Apply the anomaly detection logic on the base prediction df
    anomaly_df = model.anomaly(X, y)

    # Should have these added error calculated columns now.
    assert all(col in anomaly_df.columns for col in base_anomaly_columns)
    if window is not None:
        assert all(col in anomaly_df.columns for col in smooth_anomaly_columns)
    else:
        # Without a window no smoothed columns may be added. This must be
        # checked on the anomaly dataframe; the base dataframe was already
        # verified above and would make the assertion vacuous.
        assert not any(col in anomaly_df.columns
                       for col in smooth_anomaly_columns)

    def expected_smoothing(values):
        """Smooth a Series/DataFrame the way the configured method should."""
        if smoothing_method is None or smoothing_method == "smm":
            return values.rolling(model.window).median().dropna()
        if smoothing_method == "sma":
            return values.rolling(model.window).mean().dropna()
        if smoothing_method == "ewma":
            return values.ewm(span=model.window).mean()
        raise ValueError(f"Unexpected smoothing method: {smoothing_method!r}")

    # Verify calculation for unscaled data
    feature_error_unscaled = pd.DataFrame(
        data=np.abs(base_df["model-output"].to_numpy() - y.to_numpy()),
        index=index,
        columns=tags,
    )
    total_anomaly_unscaled = pd.Series(
        data=np.square(feature_error_unscaled).mean(axis=1))
    assert np.allclose(feature_error_unscaled.to_numpy(),
                       anomaly_df["tag-anomaly-unscaled"].to_numpy())
    assert np.allclose(
        total_anomaly_unscaled.to_numpy(),
        anomaly_df["total-anomaly-unscaled"].to_numpy(),
    )

    if window is not None:
        assert np.allclose(
            expected_smoothing(feature_error_unscaled).to_numpy(),
            anomaly_df["smooth-tag-anomaly-unscaled"].dropna().to_numpy(),
        )
        assert np.allclose(
            expected_smoothing(total_anomaly_unscaled).to_numpy(),
            anomaly_df["smooth-total-anomaly-unscaled"].dropna().to_numpy(),
        )

    # Verify calculations for scaled data
    feature_error_scaled = pd.DataFrame(
        data=np.abs(
            scaler.transform(base_df["model-output"].to_numpy()) -
            scaler.transform(y)),
        index=index,
        columns=tags,
    )
    total_anomaly_scaled = pd.Series(
        data=np.square(feature_error_scaled).mean(axis=1))
    assert np.allclose(feature_error_scaled.to_numpy(),
                       anomaly_df["tag-anomaly-scaled"].to_numpy())
    assert np.allclose(total_anomaly_scaled.to_numpy(),
                       anomaly_df["total-anomaly-scaled"].to_numpy())

    if window is not None:
        assert np.allclose(
            expected_smoothing(feature_error_scaled).to_numpy(),
            anomaly_df["smooth-tag-anomaly-scaled"].dropna().to_numpy(),
        )
        assert np.allclose(
            expected_smoothing(total_anomaly_scaled).to_numpy(),
            anomaly_df["smooth-total-anomaly-scaled"].dropna().to_numpy(),
        )

    # Check number of NA's is consistent with window size
    # (the "ewma" expectation above has no dropna, so it is excluded here)
    if (smoothing_method != "ewma" and model.window is not None
            and len_x_y >= model.window):
        assert (anomaly_df["smooth-tag-anomaly-scaled"].isna().sum().sum() ==
                (model.window - 1) *
                anomaly_df["smooth-tag-anomaly-scaled"].shape[1])
        assert (anomaly_df["smooth-total-anomaly-scaled"].isna().sum() ==
                model.window - 1)

    if with_thresholds:
        assert "anomaly-confidence" in anomaly_df.columns
        assert "total-anomaly-confidence" in anomaly_df.columns
        assert anomaly_df["anomaly-confidence"].notnull().to_numpy().all()
        assert anomaly_df["total-anomaly-confidence"].notnull().to_numpy().all()
    else:
        assert "anomaly-confidence" not in anomaly_df.columns
        assert "total-anomaly-confidence" not in anomaly_df.columns
Пример #7
0
def test_diff_kfcv_detector(
    scaler,
    index,
    with_thresholds: bool,
    shuffle: bool,
    window: int,
    smoothing_method: str,
    threshold_percentile: float,
):
    """
    Exercise DiffBasedKFCVAnomalyDetector end to end: parameters, fitting, the
    scaled / unscaled anomaly columns and, when cross validated, the thresholds.
    """
    # Columns which only the anomaly dataframe is expected to contain
    anomaly_columns = (
        "total-anomaly-scaled",
        "total-anomaly-unscaled",
        "tag-anomaly-scaled",
        "tag-anomaly-unscaled",
    )
    confidence_columns = ("anomaly-confidence", "total-anomaly-confidence")

    # Some dataset.
    X = pd.DataFrame(np.random.random((300, 3)))
    y = pd.DataFrame(np.random.random((300, 3)))

    base_estimator = MultiOutputRegressor(estimator=LinearRegression())
    model = DiffBasedKFCVAnomalyDetector(
        base_estimator=base_estimator,
        scaler=scaler,
        require_thresholds=with_thresholds,
        shuffle=shuffle,
        window=window,
        smoothing_method=smoothing_method,
        threshold_percentile=threshold_percentile,
    )

    assert isinstance(model, AnomalyDetectorBase)
    assert model.get_params() == {
        "base_estimator": base_estimator,
        "scaler": scaler,
        "window": window,
        "smoothing_method": smoothing_method,
        "shuffle": shuffle,
        "threshold_percentile": threshold_percentile,
    }

    # Thresholds are only available after cross validation
    if with_thresholds:
        model.cross_validate(X=X, y=y)

    model.fit(X, y)

    output: np.ndarray = model.predict(X)
    base_df = model_utils.make_base_dataframe(
        tags=["A", "B", "C"], model_input=X, model_output=output, index=index
    )

    # The plain prediction dataframe carries no anomaly columns
    assert all(col not in base_df.columns for col in anomaly_columns)

    # The anomaly dataframe adds the error calculated columns
    anomaly_df = model.anomaly(X, y, timedelta(days=1))
    assert all(col in anomaly_df.columns for col in anomaly_columns)

    # Verify calculation for unscaled data
    predicted = base_df["model-output"].values
    feature_error_unscaled = np.abs(predicted - y.values)
    total_anomaly_unscaled = np.square(feature_error_unscaled).mean(axis=1)
    assert np.allclose(
        feature_error_unscaled, anomaly_df["tag-anomaly-unscaled"].values
    )
    assert np.allclose(
        total_anomaly_unscaled, anomaly_df["total-anomaly-unscaled"].values
    )

    # Verify calculations for scaled data
    feature_error_scaled = np.abs(
        scaler.transform(predicted) - scaler.transform(y)
    )
    total_anomaly_scaled = np.square(feature_error_scaled).mean(axis=1)
    assert np.allclose(feature_error_scaled, anomaly_df["tag-anomaly-scaled"].values)
    assert np.allclose(total_anomaly_scaled, anomaly_df["total-anomaly-scaled"].values)

    if not with_thresholds:
        # No cross validation, hence no confidence columns
        for name in confidence_columns:
            assert name not in anomaly_df.columns
    else:
        # Thresholds reported in the metadata must be actual numbers
        metadata = model.get_metadata()
        assert not any(np.isnan(metadata["feature-thresholds"]))
        assert not np.isnan(metadata["aggregate-threshold"])

        # Confidence columns must exist and be fully populated
        for name in confidence_columns:
            assert name in anomaly_df.columns
        for name in confidence_columns:
            assert anomaly_df[name].notnull().to_numpy().all()