def test_min_n_obs_shrinkage_too_little_obs(shrinkage_data):
    """Fitting must fail when ``min_n_obs`` exceeds the number of rows.

    ``min_n_obs`` is set to one more than the row count of ``X`` and the
    global model is disabled, so ``fit`` is expected to raise a
    ``ValueError`` whose message names the requested minimum.
    """
    df, _ = shrinkage_data
    X, y = df.drop(columns="Target"), df["Target"]

    too_big_n_obs = X.shape[0] + 1

    shrink_est = GroupedEstimator(
        DummyRegressor(),
        ["Planet", "Country", "City"],
        shrinkage="min_n_obs",
        use_global_model=False,
        min_n_obs=too_big_n_obs,
    )

    with pytest.raises(ValueError) as e:
        shrink_est.fit(X, y)

    # The message is checked after the ``with`` block and on ``e.value``:
    # a statement placed after the raising call inside the block never runs,
    # and ``str(e)`` formats the ExceptionInfo wrapper, not the exception.
    assert (
        f"There is no group with size greater than or equal to {too_big_n_obs}"
        in str(e.value)
    )
def test_global_model_shrinkage(shrinkage_data):
    """Compare an explicit top-level group against ``use_global_model=True``.

    One estimator groups on Planet/Country/City without a global model, the
    other groups on Country/City with a global model. Both use ``min_n_obs``
    shrinkage with a minimum of 2, and their predictions on the training
    data must be equal series.
    """
    df, _ = shrinkage_data
    X = df.drop(columns="Target")
    y = df["Target"]

    shrinkage_kwargs = {"shrinkage": "min_n_obs", "min_n_obs": 2}

    shrink_est_without_global = GroupedEstimator(
        DummyRegressor(),
        ["Planet", "Country", "City"],
        use_global_model=False,
        **shrinkage_kwargs,
    )
    shrink_est_with_global = GroupedEstimator(
        DummyRegressor(),
        ["Country", "City"],
        value_columns=[],
        use_global_model=True,
        **shrinkage_kwargs,
    )

    shrink_est_without_global.fit(X, y)
    shrink_est_with_global.fit(X, y)

    predicted_with_global = shrink_est_with_global.predict(X)
    predicted_without_global = shrink_est_without_global.predict(X)

    pd.testing.assert_series_equal(predicted_with_global, predicted_without_global)
def test_shrinkage_single_group(shrinkage_data):
    """Check constant shrinkage with a single grouping column plus global model.

    The estimator groups on ``"Country"`` with ``alpha=0.1``. Each expected
    prediction blends the ``"Earth"`` mean and the country mean with weights
    0.1 and 0.9; the expected order is two ``"NL"`` rows followed by two
    ``"BE"`` rows.
    """
    df, means = shrinkage_data
    X, y = df.drop(columns="Target"), df["Target"]

    shrink_est = GroupedEstimator(
        DummyRegressor(),
        "Country",
        value_columns=[],
        shrinkage="constant",
        use_global_model=True,
        alpha=0.1,
    )
    shrink_est.fit(X, y)

    shrinkage_factors = np.array([0.1, 0.9])
    nl_prediction = np.array([means["Earth"], means["NL"]]) @ shrinkage_factors
    be_prediction = np.array([means["Earth"], means["BE"]]) @ shrinkage_factors
    expected_prediction = [nl_prediction, nl_prediction, be_prediction, be_prediction]

    # Compare with a tolerance: the values are floating-point weighted sums,
    # so exact equality would depend on the order of the arithmetic.
    np.testing.assert_allclose(shrink_est.predict(X).tolist(), expected_prediction)
def test_chickweight_np_keys():
    """Grouping a plain array by column positions yields 50 fitted estimators."""
    # NOTE(review): a test with this same name appears again later in this
    # file; if both live in one module the later definition shadows this one
    # and pytest will not collect it — confirm.
    chickens = load_chicken(give_pandas=True)
    features = chickens[["time", "chick", "diet"]].values
    weights = chickens["weight"].values

    model = GroupedEstimator(estimator=LinearRegression(), groups=[1, 2])
    model.fit(features, weights)

    # Positions 1 and 2 are the chick and diet columns; the dataset is still
    # expected to produce only 50 groups.
    n_groups = len(model.estimators_.keys())
    assert n_groups == 50
def test_chickweight_raise_error_cols_missing2():
    """Predicting on columns that differ from the fitted ones raises ``ValueError``.

    The model is fitted on ``time`` and ``diet`` but asked to predict on
    ``diet`` and ``chick``.
    """
    chickens = load_chicken(give_pandas=True)

    model = GroupedEstimator(estimator=LinearRegression(), groups="diet")
    train_features = chickens[["time", "diet"]]
    model.fit(train_features, chickens["weight"])

    with pytest.raises(ValueError):
        model.predict(chickens[["diet", "chick"]])
def test_chickweight_df2_keys():
    """Grouping a DataFrame on ``chick`` gives one estimator per id 1..50."""
    # NOTE(review): a test with this same name appears again later in this
    # file; if both live in one module the later definition shadows this one
    # and pytest will not collect it — confirm.
    chickens = load_chicken(give_pandas=True)

    model = GroupedEstimator(estimator=LinearRegression(), groups="chick")
    model.fit(chickens[["time", "chick"]], chickens["weight"])

    expected_keys = set(range(1, 51))
    fitted_keys = set(model.estimators_.keys())
    assert fitted_keys == expected_keys
def test_chickweight_df1_keys():
    """Grouping a DataFrame on ``diet`` gives estimators keyed 1 through 4."""
    # NOTE(review): a test with this same name appears again later in this
    # file; if both live in one module the later definition shadows this one
    # and pytest will not collect it — confirm.
    chickens = load_chicken(give_pandas=True)

    model = GroupedEstimator(estimator=LinearRegression(), groups="diet")
    model.fit(chickens[["time", "diet"]], chickens["weight"])

    fitted_keys = set(model.estimators_.keys())
    assert fitted_keys == {1, 2, 3, 4}
def test_chickweight_df2_keys():
    """Grouping a DataFrame on ``chick`` gives one estimator per id 1..50."""
    chickens = load_chicken(as_frame=True)

    model = GroupedEstimator(estimator=LinearRegression(), groups="chick")
    model.fit(chickens[["time", "chick"]], chickens["weight"])

    expected_keys = set(range(1, 51))
    fitted_keys = set(model.estimators_.keys())
    assert fitted_keys == expected_keys
def test_chickweight_df1_keys():
    """Grouping a DataFrame on ``diet`` gives estimators keyed 1 through 4."""
    chickens = load_chicken(as_frame=True)

    model = GroupedEstimator(estimator=LinearRegression(), groups="diet")
    model.fit(chickens[["time", "diet"]], chickens["weight"])

    fitted_keys = set(model.estimators_.keys())
    assert fitted_keys == {1, 2, 3, 4}
def test_chickweight_np_keys():
    """Grouping a plain array by column positions yields 50 fitted estimators."""
    chickens = load_chicken(as_frame=True)
    features = chickens[["time", "chick", "diet"]].values
    weights = chickens["weight"].values

    model = GroupedEstimator(estimator=LinearRegression(), groups=[1, 2])
    model.fit(features, weights)

    # Positions 1 and 2 are the chick and diet columns; the dataset is still
    # expected to produce only 50 groups.
    n_groups = len(model.estimators_.keys())
    assert n_groups == 50