Пример #1
0
def test_pipe_summary():
    """
    Build a six-step pipenet and call ``summary()`` on it.

    Nothing is asserted about what ``summary()`` produces; the test only
    fails if constructing the pipenet or calling ``summary()`` raises.
    """
    # One regressor per regression step.
    ab_regressor = LinearRegression()
    ac_regressor = LinearRegression()

    # Steps are registered one by one, in the order they feed each other.
    steps = {}
    steps["A-B-regression-ad"] = {
        "model": detector.RegressionAD(regressor=ab_regressor, target="B"),
        "input": "original",
        "subset": ["A", "B"],
    }
    steps["A-C-regression-error"] = {
        "model": transformer.RegressionResidual(
            regressor=ac_regressor, target="C"
        ),
        "input": "original",
        "subset": ["A", "C"],
    }
    # Consumes the output of the "A-C-regression-error" step.
    steps["A-C-regression-ad"] = {
        "model": detector.InterQuartileRangeAD(),
        "input": "A-C-regression-error",
        "subset": "all",
    }
    steps["ABC-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["A-B-regression-ad", "A-C-regression-ad"],
    }
    steps["D-ad"] = {
        "model": detector.QuantileAD(high=0.9, low=0.1),
        "input": "original",
        "subset": ["D"],
    }
    # Final step: combines the "ABC-ad" and "D-ad" outputs.
    steps["ABCD-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["ABC-ad", "D-ad"],
    }

    pipe = Pipenet(steps)
    pipe.summary()
Пример #2
0
# Transformer instances grouped under the "one2many" label.
# NOTE(review): the name suggests each model turns a single input series into
# several output columns -- the code consuming this list is not visible here,
# confirm against the tests that parametrize over it.
one2many_models = [
    # Rolling aggregate using agg="quantile" with three q values.
    transformer.RollingAggregate(
        agg="quantile", agg_params={"q": [0.1, 0.5, 0.9]}
    ),
    # Rolling aggregate using agg="hist" with three bin values.
    transformer.RollingAggregate(
        agg="hist", agg_params={"bins": [20, 50, 80]}
    ),
    transformer.Retrospect(n_steps=3),
]

# Detector and transformer instances grouped under the "many2one" label.
# NOTE(review): the name suggests each model reduces a multi-column input to
# a single output series -- the code consuming this list is not visible here,
# confirm against the tests that parametrize over it.
many2one_models = [
    # Detectors wrapping scikit-learn estimators.
    detector.MinClusterDetector(KMeans(n_clusters=2)),
    detector.OutlierDetector(
        LocalOutlierFactor(n_neighbors=20, contamination=0.1)
    ),
    detector.RegressionAD(regressor=LinearRegression()),
    detector.PcaAD(),
    # Transformers.
    transformer.SumAll(),
    transformer.RegressionResidual(LinearRegression()),
    transformer.PcaReconstructionError(),
]


@pytest.mark.parametrize("model", one2one_models)
def test_one2one_s2s_w_name(model):
    """
    if a one-to-one model is applied to a Series, it should keep the Series
    name unchanged
    """
    s_name = pd.Series(
        np.arange(100),
Пример #3
0
def test_skip_fit():
    """
    Test the ``skip_fit`` argument of ``Pipenet.fit``

    The pipenet is first fitted on data where B == 10 * A and C == 100 * A,
    then fitted again on data with one perturbed B value and one perturbed
    C value while naming some steps in ``skip_fit``. The assertions check
    the regressor coefficients and the bounds of the "A-C-regression-ad"
    model after each of the later fits.
    """
    ab_regressor = LinearRegression()
    ac_regressor = LinearRegression()

    steps = {}
    steps["A-B-regression-ad"] = {
        "model": detector.RegressionAD(regressor=ab_regressor, target="B"),
        "input": "original",
        "subset": ["A", "B"],
    }
    steps["A-C-regression-error"] = {
        "model": transformer.RegressionResidual(
            regressor=ac_regressor, target="C"
        ),
        "input": "original",
        "subset": ["A", "C"],
    }
    steps["A-C-regression-ad"] = {
        "model": detector.InterQuartileRangeAD(),
        "input": "A-C-regression-error",
        "subset": "all",
    }
    steps["ABC-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["A-B-regression-ad", "A-C-regression-ad"],
    }
    steps["D-ad"] = {
        "model": detector.QuantileAD(high=0.9, low=0.1),
        "input": "original",
        "subset": ["D"],
    }
    steps["ABCD-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["ABC-ad", "D-ad"],
    }
    pipe = Pipenet(steps)

    def build_frame(b_values, c_values):
        # Columns A and D are the same in both frames; only B and C vary.
        rows = [
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            b_values,
            c_values,
            [0, 0, 0, 0, 0, 0, 0, 100, 0, 0],
        ]
        return pd.DataFrame(
            np.array(rows).T,
            index=pd.date_range(start="2017-1-1", periods=10, freq="D"),
            columns=["A", "B", "C", "D"],
        )

    # Initial fit of every step on exactly linear data.
    linear_frame = build_frame(
        [0, 10, 20, 30, 40, 50, 60, 70, 80, 90],
        [0, 100, 200, 300, 400, 500, 600, 700, 800, 900],
    )
    pipe.fit(linear_frame)

    # Same data except B is 41 (not 40) at row 4 and C is 601 (not 600) at
    # row 6.
    perturbed_frame = build_frame(
        [0, 10, 20, 30, 41, 50, 60, 70, 80, 90],
        [0, 100, 200, 300, 400, 500, 601, 700, 800, 900],
    )

    # Both regression steps skipped: coefficients must still be 10 and 100.
    pipe.fit(
        perturbed_frame,
        skip_fit=["A-B-regression-ad", "A-C-regression-error"],
    )
    assert ab_regressor.coef_[0] == pytest.approx(10)
    assert ac_regressor.coef_[0] == pytest.approx(100)
    iqr_model = pipe.steps["A-C-regression-ad"]["model"]
    assert iqr_model.abs_high_ == 0
    iqr_model = pipe.steps["A-C-regression-ad"]["model"]
    assert iqr_model.abs_low_ == 0

    # Only the A-B step skipped: the A-C regressor is expected to change.
    pipe.fit(perturbed_frame, skip_fit=["A-B-regression-ad"])
    assert ab_regressor.coef_[0] == pytest.approx(10)
    assert ac_regressor.coef_[0] != pytest.approx(100)
    iqr_model = pipe.steps["A-C-regression-ad"]["model"]
    assert iqr_model.abs_high_ != 0
    iqr_model = pipe.steps["A-C-regression-ad"]["model"]
    assert iqr_model.abs_low_ != 0
Пример #4
0
def test_pipenet_return_list_return_intermediate():
    """
    Test pipenet with return_list=True and return_intermediate=True

    ``fit_detect`` is expected to return one entry per step plus
    "original", and every detector/aggregator entry is compared against a
    list of (start, end) timestamp pairs.
    """
    readings = np.array(
        [
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [0, 10, 20, 30, 41, 50, 60, 70, 80, 90],
            [0, 100, 200, 300, 400, 500, 601, 700, 800, 900],
            [0, 0, 0, 0, 0, 0, 0, 100, 0, 0],
        ]
    ).T
    frame = pd.DataFrame(
        readings,
        index=pd.date_range(start="2017-1-1", periods=10, freq="D"),
        columns=["A", "B", "C", "D"],
    )

    ab_regressor = LinearRegression()
    ac_regressor = LinearRegression()

    steps = {}
    steps["A-B-regression-ad"] = {
        "model": detector.RegressionAD(regressor=ab_regressor, target="B"),
        "input": "original",
        "subset": ["A", "B"],
    }
    steps["A-C-regression-error"] = {
        "model": transformer.RegressionResidual(
            regressor=ac_regressor, target="C"
        ),
        "input": "original",
        "subset": ["A", "C"],
    }
    steps["A-C-regression-ad"] = {
        "model": detector.InterQuartileRangeAD(),
        "input": "A-C-regression-error",
        "subset": "all",
    }
    steps["ABC-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["A-B-regression-ad", "A-C-regression-ad"],
    }
    steps["D-ad"] = {
        "model": detector.QuantileAD(high=0.9, low=0.1),
        "input": "original",
        "subset": ["D"],
    }
    steps["ABCD-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["ABC-ad", "D-ad"],
    }
    pipe = Pipenet(steps)

    outputs = pipe.fit_detect(
        frame, return_list=True, return_intermediate=True
    )
    assert set(outputs.keys()) == {"original", *pipe.steps.keys()}

    # Expected anomalous windows, each a (start, end) pair.
    jan_5 = (
        pd.Timestamp("2017-01-05 00:00:00"),
        pd.Timestamp("2017-01-05 23:59:59.999999999"),
    )
    jan_7 = (
        pd.Timestamp("2017-01-07 00:00:00"),
        pd.Timestamp("2017-01-07 23:59:59.999999999"),
    )
    jan_8 = (
        pd.Timestamp("2017-01-08 00:00:00"),
        pd.Timestamp("2017-01-08 23:59:59.999999999"),
    )
    # In the final output Jan 7 and Jan 8 are expected as a single window.
    jan_7_through_8 = (
        pd.Timestamp("2017-01-07 00:00:00"),
        pd.Timestamp("2017-01-08 23:59:59.999999999"),
    )

    expected_windows = {
        "A-B-regression-ad": [jan_5],
        "A-C-regression-ad": [jan_7],
        "ABC-ad": [jan_5, jan_7],
        "D-ad": [jan_8],
        "ABCD-ad": [jan_5, jan_7_through_8],
    }
    for step_name, windows in expected_windows.items():
        assert outputs[step_name] == windows
Пример #5
0
def test_pipenet_return_intermediate():
    """
    Test pipenet with return_intermediate=True

    Checks the per-step results of ``fit`` (only the
    "A-C-regression-error" entry is expected to be non-None among the
    steps) and then the per-step label series of ``fit_detect``.
    """
    readings = np.array(
        [
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [0, 10, 20, 30, 41, 50, 60, 70, 80, 90],
            [0, 100, 200, 300, 400, 500, 601, 700, 800, 900],
            [0, 0, 0, 0, 0, 0, 0, 100, 0, 0],
        ]
    ).T
    frame = pd.DataFrame(
        readings,
        index=pd.date_range(start="2017-1-1", periods=10, freq="D"),
        columns=["A", "B", "C", "D"],
    )

    ab_regressor = LinearRegression()
    ac_regressor = LinearRegression()

    steps = {}
    steps["A-B-regression-ad"] = {
        "model": detector.RegressionAD(regressor=ab_regressor, target="B"),
        "input": "original",
        "subset": ["A", "B"],
    }
    steps["A-C-regression-error"] = {
        "model": transformer.RegressionResidual(
            regressor=ac_regressor, target="C"
        ),
        "input": "original",
        "subset": ["A", "C"],
    }
    steps["A-C-regression-ad"] = {
        "model": detector.InterQuartileRangeAD(),
        "input": "A-C-regression-error",
        "subset": "all",
    }
    steps["ABC-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["A-B-regression-ad", "A-C-regression-ad"],
    }
    steps["D-ad"] = {
        "model": detector.QuantileAD(high=0.9, low=0.1),
        "input": "original",
        "subset": ["D"],
    }
    steps["ABCD-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["ABC-ad", "D-ad"],
    }
    pipe = Pipenet(steps)

    # fit(): one entry per step plus "original".
    fit_outputs = pipe.fit(frame, return_intermediate=True)
    assert set(fit_outputs.keys()) == {"original", *pipe.steps.keys()}
    assert fit_outputs["A-B-regression-ad"] is None
    assert fit_outputs["A-C-regression-error"] is not None
    for step_name in ("A-C-regression-ad", "ABC-ad", "D-ad", "ABCD-ad"):
        assert fit_outputs[step_name] is None

    # fit_detect(): each detector/aggregator step yields a 0/1 series.
    detect_outputs = pipe.fit_detect(frame, return_intermediate=True)
    assert set(detect_outputs.keys()) == {"original", *pipe.steps.keys()}

    expected_labels = {
        "A-B-regression-ad": [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        "A-C-regression-ad": [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        "ABC-ad": [0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
        "D-ad": [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        "ABCD-ad": [0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
    }
    for step_name, labels in expected_labels.items():
        # dtype and series name are deliberately not compared.
        pd.testing.assert_series_equal(
            detect_outputs[step_name],
            pd.Series(labels, index=frame.index),
            check_dtype=False,
            check_names=False,
        )
Пример #6
0
def test_pipenet_default():
    """
    Test default setting of pipenet

    ``fit_detect`` with default arguments is compared against the expected
    label series, then ``score`` is required to equal 1 for the "recall",
    "precision", "iou" and "f1" scorings against those same labels.
    """
    readings = np.array(
        [
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
            [0, 10, 20, 30, 41, 50, 60, 70, 80, 90],
            [0, 100, 200, 300, 400, 500, 601, 700, 800, 900],
            [0, 0, 0, 0, 0, 0, 0, 100, 0, 0],
        ]
    ).T
    frame = pd.DataFrame(
        readings,
        index=pd.date_range(start="2017-1-1", periods=10, freq="D"),
        columns=["A", "B", "C", "D"],
    )

    ab_regressor = LinearRegression()
    ac_regressor = LinearRegression()

    steps = {}
    steps["A-B-regression-ad"] = {
        "model": detector.RegressionAD(regressor=ab_regressor, target="B"),
        "input": "original",
        "subset": ["A", "B"],
    }
    steps["A-C-regression-error"] = {
        "model": transformer.RegressionResidual(
            regressor=ac_regressor, target="C"
        ),
        "input": "original",
        "subset": ["A", "C"],
    }
    steps["A-C-regression-ad"] = {
        "model": detector.InterQuartileRangeAD(),
        "input": "A-C-regression-error",
        "subset": "all",
    }
    steps["ABC-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["A-B-regression-ad", "A-C-regression-ad"],
    }
    steps["D-ad"] = {
        "model": detector.QuantileAD(high=0.9, low=0.1),
        "input": "original",
        "subset": ["D"],
    }
    steps["ABCD-ad"] = {
        "model": aggregator.OrAggregator(),
        "input": ["ABC-ad", "D-ad"],
    }
    pipe = Pipenet(steps)

    def expected_labels():
        # A fresh series per use, so no call can see another call's object.
        return pd.Series([0, 0, 0, 0, 1, 0, 1, 1, 0, 0], index=frame.index)

    detected = pipe.fit_detect(frame)
    pd.testing.assert_series_equal(
        detected,
        expected_labels(),
        check_dtype=False,
    )

    for metric in ("recall", "precision", "iou", "f1"):
        assert pipe.score(frame, expected_labels(), scoring=metric) == 1
import numpy as np
import pandas as pd
import pytest
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor

import adtk.detector as detector
import adtk.transformer as transformer

# Detector and transformer instances shared at module level.
# NOTE(review): the code that consumes this list is outside this view;
# presumably tests parametrize over it -- confirm.
models = [
    # Detectors wrapping scikit-learn estimators.
    detector.MinClusterDetector(KMeans(n_clusters=2)),
    detector.OutlierDetector(
        LocalOutlierFactor(n_neighbors=20, contamination=0.1)),
    # Regression-based models use "A" as the target.
    detector.RegressionAD(target="A", regressor=LinearRegression()),
    detector.PcaAD(),
    transformer.RegressionResidual(target="A", regressor=LinearRegression()),
    # PCA-based transformers.
    transformer.PcaReconstructionError(),
    transformer.PcaProjection(),
    transformer.PcaReconstruction(),
]

# Two-column frame: the integers 0..39 laid out row by row over 20 daily
# timestamps starting 2017-01-01.
df_train = pd.DataFrame(
    data=np.arange(40).reshape(-1, 2),
    index=pd.date_range(start="2017-1-1", periods=20, freq="D"),
    columns=["A", "B"],
)

df_test_ok = pd.DataFrame(
    np.arange(0, -60, -1).reshape(20, 3),