def test_pmdarima_constancy_validation(data):
    """Verify that none of the test series is flagged as constant.

    ``data`` must expose ``df`` and ``key_columns``. The analyzer is expected
    to return one entry per series (``SERIES_TEST_COUNT``), each of them falsy.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    constant_flags = analyzer.calculate_is_constant()

    assert len(constant_flags) == SERIES_TEST_COUNT
    # A single truthy flag means a series was reported as constant.
    assert not any(constant_flags.values())
def test_pmdarima_calculate_pacf_minimal_args(data):
    """Check the PACF payload produced when ``calculate_pacf`` gets no arguments.

    Each group payload must contain ``"pacf"`` with 32 values and must not
    contain ``"confidence_intervals"``.
    """
    pacf_data = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    ).calculate_pacf()

    for payload in pacf_data.values():
        assert "pacf" in payload
        # The previous form asserted a list comprehension; a non-empty list is
        # always truthy, so the absence of this key was never actually checked.
        assert "confidence_intervals" not in payload
        assert len(payload.get("pacf")) == 32
def test_pmdarima_utils_nsdiffs_calculation(data):
    """Verify the per-group seasonal differencing estimates.

    Expects one entry per series, keyed by a tuple, with every value no
    larger than the ``max_D`` limit passed to ``calculate_nsdiffs``.
    """
    max_seasonal_diff = 7
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    seasonal_diffs = analyzer.calculate_nsdiffs(
        m=7, test="ocsb", max_D=max_seasonal_diff
    )

    assert len(seasonal_diffs) == SERIES_TEST_COUNT
    for group_key, diff_order in seasonal_diffs.items():
        assert isinstance(group_key, tuple)
        assert diff_order <= max_seasonal_diff
def test_pmdarima_generate_diff(data):
    """Verify the shape and start value of the differenced series per group.

    Each payload must hold a ``"diff"`` entry of length ``(365 * 4) - 2`` and a
    positive float ``"series_start"``.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    diff_by_group = analyzer.generate_diff(lag=2, differences=1)

    # NOTE(review): presumably the series length minus the lag of 2 - confirm
    # against the data used for this test.
    expected_length = (365 * 4) - 2
    for group_diff in diff_by_group.values():
        assert len(group_diff["diff"]) == expected_length
        start_value = group_diff["series_start"]
        assert start_value > 0
        assert isinstance(start_value, float)
def test_pmdarima_calculate_pacf_full_args(data):
    """Check the PACF payload when ``nlags``, ``method`` and ``alpha`` are given.

    With ``nlags=90`` and ``alpha=0.05`` every group payload must contain both
    ``"pacf"`` and ``"confidence_intervals"``, each with 91 entries.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    pacf_by_group = analyzer.calculate_pacf(nlags=90, method="yw", alpha=0.05)

    expected_lengths = {"pacf": 91, "confidence_intervals": 91}
    for payload in pacf_by_group.values():
        assert set(expected_lengths).issubset(payload.keys())
        for key, expected_length in expected_lengths.items():
            assert len(payload.get(key)) == expected_length
def pipeline_override_d(data):
    """Fit a grouped AutoARIMA pipeline using per-group differencing overrides.

    The ``ndiffs`` and ``nsdiffs`` values are calculated with a
    ``PmdarimaAnalyzer`` over the same data and handed to
    ``GroupedPmdarima.fit``.

    NOTE(review): presumably registered as a pytest fixture (it is requested by
    name as a test argument); the decorator is not visible here.

    :param data: object exposing ``df`` and ``key_columns``.
    :return: whatever ``GroupedPmdarima.fit`` returns.
    """
    pipeline = Pipeline(steps=[("arima", AutoARIMA(out_of_sample_size=30))])

    # The analyzer and the model are built from the same data description.
    data_kwargs = {
        "df": data.df,
        "group_key_columns": data.key_columns,
        "y_col": "y",
        "datetime_col": "ds",
    }
    analyzer = PmdarimaAnalyzer(**data_kwargs)
    diff_overrides = analyzer.calculate_ndiffs(alpha=0.2, test="kpss", max_d=7)
    seasonal_diff_overrides = analyzer.calculate_nsdiffs(m=7, test="ocsb", max_D=7)

    grouped_model = GroupedPmdarima(pipeline)
    return grouped_model.fit(
        ndiffs=diff_overrides,
        nsdiffs=seasonal_diff_overrides,
        silence_warnings=True,
        **data_kwargs,
    )
def test_pmdarima_stationarity_optimized_overrides(data, pipeline_override_d):
    """Verify that the fitted models use the calculated ``d`` for each group.

    Compares the ``d`` column of the fitted model parameters against freshly
    calculated ``ndiffs`` values and checks that ``D`` stays at 0.

    NOTE(review): ``alpha=0.5`` is used here while the ``pipeline_override_d``
    body in this file uses ``alpha=0.2`` - confirm this difference is intended.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    expected_d = analyzer.calculate_ndiffs(alpha=0.5, test="kpss", max_d=7)
    fitted_params = pipeline_override_d.get_model_params()

    for _, param_row in fitted_params.iterrows():
        # The ndiffs mapping is looked up by a (key1, key0) tuple.
        group_key = (param_row["key1"], param_row["key0"])
        assert expected_d.get(group_key) == param_row["d"]
        # this isn't a seasonal model so the override shouldn't populate for 'D'
        assert param_row["D"] == 0
def test_pmdarima_diff_inv_fails_with_invalid_data(data):
    """Verify that diff inversion rejects or warns about incomplete payloads.

    * Payloads without ``"diff"`` must raise ``DivinerException``.
    * Payloads without ``"series_start"`` must emit a ``UserWarning`` when
      ``recenter=True`` is requested.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    diff = analyzer.generate_diff(lag=1, differences=1)

    # The malformed payloads are built outside the context managers so that
    # only the call under test can satisfy the raises/warns expectation.
    missing_diff = {
        group: {"series_start": payload.get("series_start")}
        for group, payload in diff.items()
    }
    with pytest.raises(
        DivinerException, match="group_diff_data does not contain the key `diff`"
    ):
        analyzer.generate_diff_inversion(
            group_diff_data=missing_diff, lag=1, differences=1, recenter=True
        )

    missing_series_start = {
        group: {"diff": payload.get("diff")} for group, payload in diff.items()
    }
    with pytest.warns(
        UserWarning, match="Recentering is not possible due to `series_start` missing"
    ):
        analyzer.generate_diff_inversion(
            group_diff_data=missing_series_start, lag=1, differences=1, recenter=True
        )
def test_pmdarima_reconstruct_series_from_diff_inv(data):
    """Verify that inverting a diff reproduces the original ``y`` values.

    The differenced output is fed back through ``generate_diff_inversion`` with
    ``recenter=True`` and compared per group to the analyzer's grouped data
    within a relative tolerance of 0.1.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    diff_payloads = analyzer.generate_diff(lag=2, differences=1)
    # Read only after generate_diff has run, matching the original ordering.
    grouped_frames = analyzer._group_df
    reconstructed = analyzer.generate_diff_inversion(
        diff_payloads, lag=2, differences=1, recenter=True
    )

    for group_key, group_frame in grouped_frames:
        assert_allclose(group_frame["y"], reconstructed.get(group_key), rtol=0.1)
def test_pmdarima_calculate_acf_minimal_args(data):
    """Check the ACF payload when the optional outputs are switched off.

    With ``qstat=False`` and ``alpha=None`` each group payload must contain
    ``"acf"`` with 91 values (``nlags=90``) and none of ``"qstat"``,
    ``"pvalues"`` or ``"confidence_intervals"``.
    """
    acf_data = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    ).calculate_acf(unbiased=False, nlags=90, qstat=False, fft=False, alpha=None)

    optional_keys = ("qstat", "pvalues", "confidence_intervals")
    for payload in acf_data.values():
        assert "acf" in payload
        # The previous form asserted a list comprehension; a non-empty list is
        # always truthy, so the absence of these keys was never actually checked.
        assert all(key not in payload for key in optional_keys)
        assert len(payload.get("acf")) == 91
def test_pmdarima_calculate_acf_full_args(data):
    """Check the ACF payload when every optional output is requested.

    With ``nlags=90``, ``qstat=True`` and ``alpha=0.1`` each group payload must
    contain ``"acf"`` and ``"confidence_intervals"`` with 91 entries and
    ``"qstat"`` and ``"pvalues"`` with 90 entries.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    acf_by_group = analyzer.calculate_acf(
        unbiased=True, nlags=90, qstat=True, fft=True, alpha=0.1
    )

    # Expected number of entries for each key of a payload.
    expected_lengths = {
        "acf": 91,
        "qstat": 90,
        "confidence_intervals": 91,
        "pvalues": 90,
    }
    for payload in acf_by_group.values():
        assert set(expected_lengths).issubset(payload.keys())
        for key, expected_length in expected_lengths.items():
            assert len(payload.get(key)) == expected_length
def test_pmdarima_utils_trend_decomposition(data, type_):
    """Verify the columns and row count of the decomposed group data.

    ``type_`` is forwarded unchanged to ``decompose_groups`` (presumably
    supplied through test parametrization; the decorator is not visible here).
    The result must expose the expected columns and have as many rows as
    ``data.df``.
    """
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    decomposed = analyzer.decompose_groups(m=7, type_=type_)

    expected_columns = {
        "x",
        "trend",
        "seasonal",
        "random",
        "ds",
        "key1",
        "key0",
        "grouping_key_columns",
    }
    assert expected_columns.issubset(decomposed.columns)
    assert len(decomposed) == len(data.df)
def test_pmdarima_ndiffs_override_class_args(data):
    """Verify that supplied ``ndiffs`` win over the template's own ``d`` value.

    The template is built with ``d=10`` while ``ndiffs`` is calculated with
    ``max_d=4``; every fitted model must report a ``d`` of at most 4.
    """
    max_d = 4
    analyzer = PmdarimaAnalyzer(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
    diff_overrides = analyzer.calculate_ndiffs(alpha=0.4, max_d=max_d)

    template = AutoARIMA(d=10, out_of_sample_size=7)
    fitted_model = GroupedPmdarima(template).fit(
        df=data.df,
        group_key_columns=data.key_columns,
        y_col="y",
        datetime_col="ds",
        ndiffs=diff_overrides,
        silence_warnings=True,
    )

    fitted_params = fitted_model.get_model_params()
    for _, param_row in fitted_params.iterrows():
        assert param_row["d"] <= max_d
steps=[ ( "arima", AutoARIMA( max_order=14, out_of_sample_size=90, suppress_warnings=True, error_action="ignore", ), ) ] ) diff_analyzer = PmdarimaAnalyzer( df=training_data, group_key_columns=group_key_columns, y_col="y", datetime_col="ds", ) ndiff = diff_analyzer.calculate_ndiffs( alpha=0.05, test="kpss", max_d=4, ) grouped_model = GroupedPmdarima(model_template=pipeline).fit( df=training_data, group_key_columns=group_key_columns, y_col="y", datetime_col="ds", ndiffs=ndiff, silence_warnings=True,
# Build the example data set used by the analyses below.
generated_data = generate_example_data(
    column_count=4,
    series_count=3,
    series_size=365 * 12,
    start_dt="2010-01-01",
    days_period=1,
)
training_data = generated_data.df
group_key_columns = generated_data.key_columns

# Create a utility object for performing analyses.
# We reuse this object because the grouped data set collection is lazily evaluated and can be
# reused for subsequent analytics operations on the data set.
analyzer = PmdarimaAnalyzer(
    df=training_data,
    group_key_columns=group_key_columns,
    y_col="y",
    datetime_col="ds",
)

# Decompose the trends of each group and print the first 50 rows of the result.
decomposed_trends = analyzer.decompose_groups(m=7, type_="additive")

print("Decomposed trend data for the groups")
print("-" * 100, "\n")
print(decomposed_trends[:50].to_string())

# Calculate optimal differencing for ARMA terms and print it via the
# `_print_dict` helper (defined outside this section).
ndiffs = analyzer.calculate_ndiffs(alpha=0.1, test="kpss", max_d=5)
_print_dict(ndiffs, "Differencing")