def __init__(
    self,
    base_estimator: BaseEstimator = None,
    scaler: TransformerMixin = None,
):
    """
    Classifier which wraps a ``base_estimator`` and provides a diff error
    based approach to anomaly detection.

    It fits a ``scaler`` to the target **after** training, purely for error
    calculations. The underlying ``base_estimator`` is trained with the
    original, unscaled, ``y``.

    Parameters
    ----------
    base_estimator: sklearn.base.BaseEstimator
        The model whose normal ``.fit`` and ``.predict`` methods will be
        used. When ``None`` (the default), a new
        :py:class:`gordo_components.model.models.KerasAutoEncoder` with
        ``kind='feedforward_hourglass'`` is created for this instance.
    scaler: sklearn.base.TransformerMixin
        Used for transforming model output and the original ``y`` to
        calculate the difference/error in model output vs expected.
        When ``None`` (the default), a new
        ``sklearn.preprocessing.RobustScaler`` is created for this instance.
    """
    # Default values in a signature are evaluated once, when the function is
    # defined. Instantiating the defaults there would make every detector
    # created without arguments share one estimator and one scaler object,
    # so they are built here, per instance, instead.
    if base_estimator is None:
        base_estimator = KerasAutoEncoder(kind="feedforward_hourglass")
    if scaler is None:
        scaler = RobustScaler()

    self.base_estimator = base_estimator
    self.scaler = scaler
def test_captures_kwarg_to_init():
    """
    Our models accept kwargs which are passed on to the underlying keras
    model or used to construct the underlying model.

    The definition produced by ``pipeline_into_definition`` must capture
    kwargs which are part of the model parameters even though they are not
    part of the ``__init__`` signature.
    """
    model = KerasAutoEncoder(kind="feedforward_hourglass", some_fancy_param="Howdy!")

    # Parameters are looked up under the fully qualified class name
    class_path = f"{KerasAutoEncoder.__module__}.{KerasAutoEncoder.__name__}"
    captured = pipeline_into_definition(model)[class_path]

    assert "some_fancy_param" in captured
    assert captured["some_fancy_param"] == "Howdy!"

    # And make sure the captured parameters can initialise the model again
    KerasAutoEncoder(**captured)
def test_dump_load_keras_directly(self):
    """
    Fit a ``KerasAutoEncoder``, dump it to a temporary directory, load it
    back and check that both models give matching predictions.
    """
    autoencoder = KerasAutoEncoder(kind="feedforward_hourglass")
    data = np.random.random(size=100).reshape(10, 10)
    autoencoder.fit(data.copy(), data.copy())

    with TemporaryDirectory() as workdir:
        serializer.dump(autoencoder, workdir)
        restored = serializer.load(workdir)

        # Compare flattened predictions of the fitted and the reloaded model
        expected_out = autoencoder.predict(data.copy()).flatten()
        restored_out = restored.predict(data.copy()).flatten()
        self.assertTrue(np.allclose(expected_out, restored_out))
def setUp(self):
    """
    Build ``self.variations_of_same_pipeline``: four pipelines with the same
    steps which differ only in how the innermost ``MinMaxScaler`` and
    ``Pipeline`` are constructed.
    """

    def inner_steps(minmax):
        # Steps of the innermost pipeline, around the given scaler instance
        return [
            ("minmax", minmax),
            ("truncsvd", TruncatedSVD(n_components=2)),
        ]

    def wrap(inner_pipeline):
        # Outer pipeline shared by every variation
        union = FeatureUnion(
            [
                ("pca2", PCA(n_components=3)),
                ("pipe", inner_pipeline),
            ]
        )
        return Pipeline(
            [
                ("pca1", PCA(n_components=2)),
                ("fu", union),
                ("ae", KerasAutoEncoder(kind="feedforward_hourglass")),
            ]
        )

    inner_variations = [
        # Normal
        Pipeline(inner_steps(MinMaxScaler())),
        # MinMax initialized (wrongly) with a list
        Pipeline(inner_steps(MinMaxScaler([0, 1]))),
        # MinMax initialized with tuple
        Pipeline(inner_steps(MinMaxScaler((0, 1)))),
        # First pipeline without explicit steps param, other with.
        Pipeline(steps=inner_steps(MinMaxScaler((0, 1)))),
    ]

    self.variations_of_same_pipeline = [
        wrap(inner_pipeline) for inner_pipeline in inner_variations
    ]
def test_pipeline_serialization(self):
    """
    Fit a nested pipeline and check that it survives serialization, both to
    a directory (``dump``/``load``, including metadata) and in memory
    (``dumps``/``loads``), by comparing predictions before and after.
    """
    inner_pipeline = Pipeline(
        [
            ("minmax", MinMaxScaler()),
            ("truncsvd", TruncatedSVD(n_components=7)),
        ]
    )
    union = FeatureUnion(
        [
            ("pca2", PCA(n_components=3)),
            ("pipe", inner_pipeline),
        ]
    )
    original = Pipeline(
        [
            ("pca1", PCA(n_components=10)),
            ("fu", union),
            ("ae", KerasAutoEncoder(kind="feedforward_hourglass")),
        ]
    )

    data = np.random.random(size=100).reshape(10, 10)
    original.fit(data.copy(), data.copy())

    with TemporaryDirectory() as workdir:

        # Test dump
        metadata = {"key": "value"}
        serializer.dump(original, workdir, metadata=metadata)

        # Assert that dirs are created for each step in the Pipeline.
        # NOTE(review): several keys below appear twice. When an OrderedDict
        # is built from pairs, a repeated key keeps only its last value, so
        # the "metadata.json", FeatureUnion "params.json" and "model.h5"
        # entries are not part of the resulting mapping - confirm whether
        # that is intended.
        inner_pipeline_layout = OrderedDict(
            [
                (
                    "n_step=000-class=sklearn.preprocessing.data.MinMaxScaler",
                    "minmax.pkl.gz",
                ),
                (
                    "n_step=001-class=sklearn.decomposition.truncated_svd.TruncatedSVD",
                    "truncsvd.pkl.gz",
                ),
            ]
        )
        union_layout = OrderedDict(
            [
                ("n_step=000-class=sklearn.decomposition.pca.PCA", "pca2.pkl.gz"),
                ("n_step=001-class=sklearn.pipeline.Pipeline", inner_pipeline_layout),
            ]
        )
        pipeline_layout = OrderedDict(
            [
                ("n_step=000-class=sklearn.decomposition.pca.PCA", "pca1.pkl.gz"),
                ("n_step=001-class=sklearn.pipeline.FeatureUnion", "params.json"),
                ("n_step=001-class=sklearn.pipeline.FeatureUnion", union_layout),
                (
                    "n_step=002-class=gordo_components.model.models.KerasAutoEncoder",
                    "model.h5",
                ),
                (
                    "n_step=002-class=gordo_components.model.models.KerasAutoEncoder",
                    "params.json",
                ),
            ]
        )
        expected_structure = OrderedDict(
            [
                ("n_step=000-class=sklearn.pipeline.Pipeline", "metadata.json"),
                ("n_step=000-class=sklearn.pipeline.Pipeline", pipeline_layout),
            ]
        )

        self._structure_verifier(prefix_dir=workdir, structure=expected_structure)

        # Test load from the serialized pipeline above
        restored = serializer.load(workdir)
        restored_metadata = serializer.load_metadata(workdir)

        # Ensure the metadata was saved and loaded back
        self.assertEqual(metadata, restored_metadata)

        # Verify same state for both pipelines
        original_out = original.predict(data.copy()).flatten()
        restored_out = restored.predict(data.copy()).flatten()
        self.assertTrue(np.allclose(original_out, restored_out))

        # Now use dumps/loads
        payload = serializer.dumps(original)
        restored = serializer.loads(payload)

        # Verify same state for both pipelines
        original_out = original.predict(data.copy()).flatten()
        restored_out = restored.predict(data.copy()).flatten()
        self.assertTrue(np.allclose(original_out, restored_out))
self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2)) # Now use dumps/loads serialized = serializer.dumps(pipe) pipe_clone = serializer.loads(serialized) # Verify same state for both pipelines y_hat_pipe1 = pipe.predict(X.copy()).flatten() y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten() self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2)) @pytest.mark.parametrize( "model", [ KerasAutoEncoder(kind="feedforward_hourglass"), DiffBasedAnomalyDetector(base_estimator=TransformedTargetRegressor( regressor=KerasAutoEncoder(kind="feedforward_symmetric"), transformer=MinMaxScaler(), )), TransformedTargetRegressor(regressor=Pipeline(steps=[ ("stp1", MinMaxScaler()), ("stp2", KerasAutoEncoder(kind="feedforward_symmetric")), ])), ], ) def test_dump_load_models(model): X = np.random.random(size=100).reshape(10, 10) model.fit(X.copy(), X.copy()) model_out = model.predict(X.copy())