예제 #1
0
def test_pmdarima_constancy_validation(data):
    """Check that no group in the fixture data is flagged as constant.

    Builds a ``PmdarimaAnalyzer`` over ``data.df`` grouped by
    ``data.key_columns`` and asserts that ``calculate_is_constant`` returns
    ``SERIES_TEST_COUNT`` entries, every one of them falsy.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    constancy = analyzer.calculate_is_constant()

    assert len(constancy) == SERIES_TEST_COUNT
    for is_constant in constancy.values():
        assert not is_constant
예제 #2
0
def test_pmdarima_calculate_pacf_minimal_args(data):
    """Check the ``calculate_pacf`` payload when called without arguments.

    For every group the payload must expose ``"pacf"`` with 32 values and
    must not expose ``"confidence_intervals"``.
    """
    pacf_data = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    ).calculate_pacf()

    for payload in pacf_data.values():
        assert "pacf" in payload
        # Asserting on a list comprehension is always true (a non-empty list
        # is truthy regardless of its contents), so check the key's absence
        # directly to make this assertion actually able to fail.
        assert "confidence_intervals" not in payload
        assert len(payload.get("pacf")) == 32
예제 #3
0
def test_pmdarima_utils_nsdiffs_calculation(data):
    """Check the per-group result of ``calculate_nsdiffs``.

    Calls ``calculate_nsdiffs(m=7, test="ocsb", max_D=7)`` and asserts that
    the result has ``SERIES_TEST_COUNT`` entries, that every key is a tuple
    and that every value does not exceed the requested ``max_D`` of 7.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    nsdiffs = analyzer.calculate_nsdiffs(m=7, test="ocsb", max_D=7)

    assert len(nsdiffs) == SERIES_TEST_COUNT
    for group_key, seasonal_diff in nsdiffs.items():
        assert isinstance(group_key, tuple)
        assert seasonal_diff <= 7
예제 #4
0
def test_pmdarima_generate_diff(data):
    """Check the payload produced by ``generate_diff(lag=2, differences=1)``.

    For every group the ``"diff"`` entry must have ``(365 * 4) - 2`` values
    and ``"series_start"`` must be a positive ``float``.
    """
    diff = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    ).generate_diff(lag=2, differences=1)

    # Use a dedicated loop name instead of rebinding the ``data`` fixture
    # argument, so the fixture stays usable for the rest of the test.
    for payload in diff.values():
        # presumably each fixture series holds 365 * 4 points and the lag of
        # 2 removes two of them -- verify against the fixture definition
        assert len(payload["diff"]) == (365 * 4) - 2
        assert payload["series_start"] > 0
        assert isinstance(payload["series_start"], float)
예제 #5
0
def test_pmdarima_calculate_pacf_full_args(data):
    """Check the ``calculate_pacf`` payload when every argument is supplied.

    With ``nlags=90``, ``method="yw"`` and ``alpha=0.05`` each group's payload
    must contain both ``"pacf"`` and ``"confidence_intervals"``, each holding
    91 entries.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    pacf_data = analyzer.calculate_pacf(nlags=90, method="yw", alpha=0.05)

    required_keys = {"pacf", "confidence_intervals"}
    for group_payload in pacf_data.values():
        assert required_keys.issubset(group_payload.keys())
        assert len(group_payload.get("pacf")) == 91
        assert len(group_payload.get("confidence_intervals")) == 91
예제 #6
0
def pipeline_override_d(data):
    """Fit a ``GroupedPmdarima`` pipeline using analyzer-derived differencing.

    The ``ndiffs`` and ``nsdiffs`` overrides are computed with a
    ``PmdarimaAnalyzer`` over the same frame and grouping columns, then
    handed to ``GroupedPmdarima.fit``.

    Returns:
        Whatever ``GroupedPmdarima(pipeline).fit(...)`` returns.
    """
    arima_pipeline = Pipeline(steps=[("arima", AutoARIMA(out_of_sample_size=30))])

    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    d_overrides = analyzer.calculate_ndiffs(alpha=0.2, test="kpss", max_d=7)
    seasonal_d_overrides = analyzer.calculate_nsdiffs(m=7, test="ocsb", max_D=7)

    grouped_model = GroupedPmdarima(arima_pipeline)
    return grouped_model.fit(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
        ndiffs=d_overrides,
        nsdiffs=seasonal_d_overrides,
        silence_warnings=True,
    )
예제 #7
0
def test_pmdarima_stationarity_optimized_overrides(data, pipeline_override_d):
    """Check that fitted models carry the analyzer-derived ``d`` values.

    Recomputes ``ndiffs`` per group and compares it with the ``d`` column of
    ``pipeline_override_d.get_model_params()``; the ``D`` column must be 0 for
    every row.
    """
    # NOTE(review): ``pipeline_override_d`` computes its ndiffs with alpha=0.2
    # while this test uses alpha=0.5 -- confirm the mismatch is intentional.
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    ndiffs = analyzer.calculate_ndiffs(alpha=0.5, test="kpss", max_d=7)

    params = pipeline_override_d.get_model_params()

    for _, model_row in params.iterrows():
        group_key = (model_row["key1"], model_row["key0"])
        expected_d = ndiffs.get(group_key)
        assert expected_d == model_row["d"]
        # this isn't a seasonal model so the override shouldn't populate for 'D'
        assert model_row["D"] == 0
예제 #8
0
def test_pmdarima_diff_inv_fails_with_invalid_data(data):
    """Check how ``generate_diff_inversion`` reacts to incomplete payloads.

    A payload lacking ``"diff"`` must raise ``DivinerException``; a payload
    lacking ``"series_start"`` must emit a ``UserWarning`` when recentering
    is requested.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    diff = analyzer.generate_diff(lag=1, differences=1)

    # Case 1: only ``series_start`` is kept, so ``diff`` is missing.
    with pytest.raises(
        DivinerException,
        match="group_diff_data does not contain the key `diff`",
    ):
        start_only = {
            group: {"series_start": payload.get("series_start")}
            for group, payload in diff.items()
        }
        analyzer.generate_diff_inversion(
            group_diff_data=start_only,
            lag=1,
            differences=1,
            recenter=True,
        )

    # Case 2: only ``diff`` is kept, so ``series_start`` is missing.
    with pytest.warns(
        UserWarning,
        match="Recentering is not possible due to `series_start` missing",
    ):
        diff_only = {
            group: {"diff": payload.get("diff")}
            for group, payload in diff.items()
        }
        analyzer.generate_diff_inversion(
            group_diff_data=diff_only,
            lag=1,
            differences=1,
            recenter=True,
        )
예제 #9
0
def test_pmdarima_reconstruct_series_from_diff_inv(data):
    """Check that inverting a diff reproduces the original ``y`` series.

    Differences each group with ``lag=2, differences=1``, inverts the result
    with recentering enabled, and compares every group's ``"y"`` column with
    the inverted values using a relative tolerance of 0.1.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    diff = analyzer.generate_diff(lag=2, differences=1)

    group_dfs = analyzer._group_df

    inverted = analyzer.generate_diff_inversion(
        diff,
        lag=2,
        differences=1,
        recenter=True,
    )

    # Use a dedicated loop name instead of rebinding the ``data`` fixture
    # argument while iterating over the grouped frames.
    for group, group_df in group_dfs:
        assert_allclose(group_df["y"], inverted.get(group), rtol=0.1)
예제 #10
0
def test_pmdarima_calculate_acf_minimal_args(data):
    """Check the ``calculate_acf`` payload with all optional outputs disabled.

    With ``qstat=False`` and ``alpha=None`` each group's payload must contain
    ``"acf"`` with 91 values (``nlags=90``) and must not contain ``"qstat"``,
    ``"pvalues"`` or ``"confidence_intervals"``.
    """
    acf_data = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    ).calculate_acf(
        unbiased=False,
        nlags=90,
        qstat=False,
        fft=False,
        alpha=None,
    )

    for payload in acf_data.values():
        assert "acf" in payload
        # Asserting on a list comprehension is always true (a non-empty list
        # is truthy regardless of its contents), so check each optional key's
        # absence individually to make the assertion able to fail.
        for key in ("qstat", "pvalues", "confidence_intervals"):
            assert key not in payload
        assert len(payload.get("acf")) == 91
예제 #11
0
def test_pmdarima_calculate_acf_full_args(data):
    """Check the ``calculate_acf`` payload with all optional outputs enabled.

    With ``qstat=True`` and ``alpha=0.1`` each group's payload must contain
    ``"acf"``, ``"qstat"``, ``"pvalues"`` and ``"confidence_intervals"``; the
    acf and interval entries hold 91 values, the qstat and p-value entries 90.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    acf_data = analyzer.calculate_acf(
        unbiased=True,
        nlags=90,
        qstat=True,
        fft=True,
        alpha=0.1,
    )

    required_keys = {"acf", "qstat", "pvalues", "confidence_intervals"}
    for group_payload in acf_data.values():
        assert required_keys.issubset(group_payload.keys())
        assert len(group_payload.get("acf")) == 91
        assert len(group_payload.get("qstat")) == 90
        assert len(group_payload.get("confidence_intervals")) == 91
        assert len(group_payload.get("pvalues")) == 90
예제 #12
0
def test_pmdarima_utils_trend_decomposition(data, type_):
    """Check the frame returned by ``decompose_groups(m=7, type_=type_)``.

    The result must expose the decomposition, datetime and grouping columns
    listed below and must have as many rows as ``data.df``.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    decomposed = analyzer.decompose_groups(m=7, type_=type_)

    expected_columns = {
        "x",
        "trend",
        "seasonal",
        "random",
        "ds",
        "key1",
        "key0",
        "grouping_key_columns",
    }
    for column_name in expected_columns:
        assert column_name in decomposed.columns

    assert len(decomposed) == len(data.df)
예제 #13
0
def test_pmdarima_ndiffs_override_class_args(data):
    """Check that supplied ``ndiffs`` win over the template's own ``d``.

    The ``AutoARIMA`` template is built with ``d=10`` while ``ndiffs`` is
    computed with ``max_d=4``; after fitting, every model's ``d`` parameter
    must be at most 4.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    ndiffs = analyzer.calculate_ndiffs(alpha=0.4, max_d=4)

    base_template = AutoARIMA(d=10, out_of_sample_size=7)
    grouped = GroupedPmdarima(base_template)
    model = grouped.fit(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
        ndiffs=ndiffs,
        silence_warnings=True,
    )

    params = model.get_model_params()

    for _, model_row in params.iterrows():
        assert model_row["d"] <= 4
        steps=[
            (
                "arima",
                AutoARIMA(
                    max_order=14,
                    out_of_sample_size=90,
                    suppress_warnings=True,
                    error_action="ignore",
                ),
            )
        ]
    )

    diff_analyzer = PmdarimaAnalyzer(
        df=training_data,
        group_key_columns=group_key_columns,
        y_col="y",
        datetime_col="ds",
    )
    ndiff = diff_analyzer.calculate_ndiffs(
        alpha=0.05,
        test="kpss",
        max_d=4,
    )

    grouped_model = GroupedPmdarima(model_template=pipeline).fit(
        df=training_data,
        group_key_columns=group_key_columns,
        y_col="y",
        datetime_col="ds",
        ndiffs=ndiff,
        silence_warnings=True,
    generated_data = generate_example_data(
        column_count=4,
        series_count=3,
        series_size=365 * 12,
        start_dt="2010-01-01",
        days_period=1,
    )
    training_data = generated_data.df
    group_key_columns = generated_data.key_columns

    # Create a utility object for performing analyses
    # We reuse this object because the grouped data set collection is lazily evaluated and can be
    # reused for subsequent analytics operations on the data set.
    analyzer = PmdarimaAnalyzer(
        df=training_data,
        group_key_columns=group_key_columns,
        y_col="y",
        datetime_col="ds",
    )

    # Decompose the trends of each group
    decomposed_trends = analyzer.decompose_groups(m=7, type_="additive")

    print("Decomposed trend data for the groups")
    print("-" * 100, "\n")
    print(decomposed_trends[:50].to_string())

    # Calculate optimal differencing for ARMA terms
    ndiffs = analyzer.calculate_ndiffs(alpha=0.1, test="kpss", max_d=5)

    _print_dict(ndiffs, "Differencing")