def test_chickweight_raise_error_group_col_missing():
    """Predicting on a frame that lacks the grouping column raises ValueError.

    The model is fitted with ``groups="diet"``; the frame handed to
    ``predict`` holds ``time`` and ``chick`` but no ``diet`` column.
    """
    df = load_chicken(as_frame=True)
    mod = GroupedPredictor(estimator=LinearRegression(), groups="diet")
    mod.fit(df[["time", "diet"]], df["weight"])
    with pytest.raises(ValueError) as e:
        mod.predict(df[["time", "chick"]])
    # The message is checked after the ``with`` block: anything placed after
    # the raising call inside the block would never execute. ``e.value`` is
    # the raised exception itself rather than the ExceptionInfo wrapper.
    assert "not in columns" in str(e.value)
def test_chickweight_raise_error_cols_missing2():
    """Predicting on a frame with different columns than at fit raises ValueError.

    The model is fitted on ``time`` and ``diet``; ``predict`` receives
    ``diet`` and ``chick`` instead.
    """
    df = load_chicken(give_pandas=True)
    mod = GroupedEstimator(estimator=LinearRegression(), groups="diet")
    mod.fit(df[['time', 'diet']], df['weight'])
    with pytest.raises(ValueError) as e:
        mod.predict(df[['diet', 'chick']])
    # Checked outside the ``with`` block so the assertion is actually
    # reached; ``e.value`` is the raised exception instance.
    assert "not in columns" in str(e.value)
def test_bad_shrinkage_value_error():
    """An unknown shrinkage name must be rejected with a ValueError."""
    # Loading the data is not the behaviour under test, so it stays outside
    # the ``raises`` block and cannot satisfy the expectation by accident.
    df = load_chicken(as_frame=True)
    with pytest.raises(ValueError) as e:
        mod = GroupedPredictor(
            estimator=LinearRegression(), groups="diet", shrinkage="dinosaurhead"
        )
        mod.fit(df[["time", "diet"]], df["weight"])
    # Asserted after the block so it runs; inside the block it would sit
    # behind the raising call and be skipped.
    assert "shrinkage function" in str(e.value)
def test_has_decision_function():
    """A fitted GroupedPredictor exposes a working ``decision_function``.

    Needed because, for example,
    ``cross_val_score(pipe, X, y, cv=5, scoring="roc_auc", error_score='raise')``
    may fail otherwise; see https://github.com/koaning/scikit-lego/issues/511
    """
    chickens = load_chicken(as_frame=True)
    X = chickens.drop(columns='weight')
    y = chickens['weight']

    # The whole chain is expected to complete without raising.
    grouped = GroupedPredictor(LogisticRegression(), groups=["diet"])
    grouped.fit(X, y).decision_function(X)
def test_chickweight_can_do_fallback():
    """Group values unseen at fit time still get a prediction.

    The fitted group keys are {1, 2, 3, 4}; rows with diet 5 and 6 must
    both be predicted, and (having the same ``time``) identically.
    """
    chickens = load_chicken(give_pandas=True)
    grouped = GroupedEstimator(estimator=LinearRegression(), groups="diet")
    grouped.fit(chickens[['time', 'diet']], chickens['weight'])
    fitted_groups = set(grouped.estimators_.keys())
    assert fitted_groups == {1, 2, 3, 4}

    unseen = pd.DataFrame({"time": [21, 21], "diet": [5, 6]})
    assert grouped.predict(unseen).shape == (2, )
    first_pred = grouped.predict(unseen)[0]
    second_pred = grouped.predict(unseen)[1]
    assert first_pred == second_pred
def test_chickweight_raise_error_value_col_missing():
    """Predicting with only the group column present raises ValueError."""
    chickens = load_chicken(as_frame=True)
    grouped = GroupedPredictor(estimator=LinearRegression(), groups="diet")
    grouped.fit(chickens[["time", "diet"]], chickens["weight"])

    # An earlier version of this test predicted on ["diet", "chick"]; per the
    # original author's note that case is no longer valid because value
    # columns are not checked. A frame holding only "diet" is used instead.
    with pytest.raises(ValueError):
        grouped.predict(chickens[["diet"]])
def test_fallback_can_raise_error():
    """With ``use_fallback=False``, predicting for diets 5 and 6 raises ValueError."""
    chickens = load_chicken(give_pandas=True)
    grouped = GroupedEstimator(
        estimator=LinearRegression(),
        groups="diet",
        use_fallback=False,
    )
    grouped.fit(chickens[['time', 'diet']], chickens['weight'])

    unseen = pd.DataFrame({"time": [21, 21], "diet": [5, 6]})
    with pytest.raises(ValueError):
        grouped.predict(unseen)
def test_bad_shrinkage_value_error():
    """An unknown shrinkage name must be rejected with a ValueError."""
    # Data loading is kept outside the ``raises`` block so that only the
    # estimator construction and fit can satisfy the expectation.
    df = load_chicken(give_pandas=True)
    with pytest.raises(ValueError) as e:
        mod = GroupedEstimator(
            estimator=LinearRegression(),
            groups="diet",
            shrinkage="dinosaurhead",
        )
        mod.fit(df[['time', 'diet']], df['weight'])
    # Asserted after the block so the check is reachable; ``e.value`` is the
    # raised exception rather than the ExceptionInfo wrapper.
    assert "shrinkage function" in str(e.value)
def test_chickweight_can_do_fallback_proba():
    """``predict_proba`` also works for group values unseen at fit time.

    The target is binarised as "weight above the mean". Rows with diet 5
    and 6 (not among the fitted keys {1, 2, 3, 4}) must receive identical
    two-column probability rows.
    """
    chickens = load_chicken(as_frame=True)
    y = np.where(chickens.weight > chickens.weight.mean(), 1, 0)
    clf = GroupedPredictor(estimator=LogisticRegression(), groups="diet")
    clf.fit(chickens[["time", "diet"]], y)
    fitted_groups = set(clf.estimators_.keys())
    assert fitted_groups == {1, 2, 3, 4}

    unseen = pd.DataFrame({"time": [21, 21], "diet": [5, 6]})
    assert clf.predict_proba(unseen).shape == (2, 2)
    first_row = clf.predict_proba(unseen)[0]
    second_row = clf.predict_proba(unseen)[1]
    assert (first_row == second_row).all()
def test_fallback_can_raise_error():
    """Without a global model, predicting for unseen groups raises ValueError.

    The estimator is built with ``use_global_model=False`` and
    ``shrinkage=None``; the rows to predict carry diet values 5 and 6.
    """
    df = load_chicken(give_pandas=True)
    mod = GroupedEstimator(
        estimator=LinearRegression(),
        groups="diet",
        use_global_model=False,
        shrinkage=None,
    )
    mod.fit(df[['time', 'diet']], df['weight'])
    to_predict = pd.DataFrame({"time": [21, 21], "diet": [5, 6]})
    with pytest.raises(ValueError) as e:
        mod.predict(to_predict)
    # Checked after the ``with`` block so the assertion is reached, and
    # against ``e.value`` (the exception) rather than the ExceptionInfo.
    assert "found a group" in str(e.value)
def test_chickweigt_string_groups():
    """Fit and predict work when the group column holds strings.

    Exercised with a DataFrame (group given by column name) and with a
    NumPy array (group given by column index 1).
    """
    chickens = load_chicken(give_pandas=True)
    chickens['diet'] = ['omgomgomg' + label for label in chickens['diet'].astype(str)]
    X = chickens[['time', 'diet']]
    y = chickens['weight']
    X_np = np.array(X)

    # Neither chain below is expected to raise.
    by_name = GroupedEstimator(LinearRegression(), groups=['diet'])
    by_name.fit(X, y).predict(X)
    by_index = GroupedEstimator(LinearRegression(), groups=1)
    by_index.fit(X_np, y).predict(X_np)
def test_chickweigt_string_groups():
    """Fit and predict work when the group column holds strings.

    Exercised with a DataFrame (group given by column name) and with a
    NumPy array (group given by column index 1).
    """
    chickens = load_chicken(as_frame=True)
    chickens["diet"] = ["omgomgomg" + label for label in chickens["diet"].astype(str)]
    X = chickens[["time", "diet"]]
    y = chickens["weight"]
    X_np = np.array(X)

    # Neither chain below is expected to raise.
    by_name = GroupedPredictor(LinearRegression(), groups=["diet"])
    by_name.fit(X, y).predict(X)
    by_index = GroupedPredictor(LinearRegression(), groups=1)
    by_index.fit(X_np, y).predict(X_np)
def test_missing_check():
    """``check_X=False`` lets NaN through; the default setting rejects it.

    A NaN is injected into the ``chick`` column. With ``check_X=False``
    an imputing pipeline can be fitted and used for prediction; without
    that flag, fitting must raise a ValueError.
    """
    df = load_chicken(as_frame=True)
    X, y = df.drop(columns='weight'), df['weight']
    # create missing value
    X.loc[0, 'chick'] = np.nan
    model = make_pipeline(SimpleImputer(), LinearRegression())

    # Should not raise error, check is disabled
    m = GroupedPredictor(model, groups=['diet'], check_X=False).fit(X, y)
    m.predict(X)

    # Should raise error, check is still enabled
    with pytest.raises(ValueError) as e:
        GroupedPredictor(model, groups=['diet']).fit(X, y)
    # Asserted after the block so it is reachable; ``e.value`` is the raised
    # exception instance rather than the ExceptionInfo wrapper.
    assert "contains NaN" in str(e.value)
def test_chickweight1():
    """``load_chicken(return_X_y=True)`` yields 578 samples with 3 features."""
    expected_rows = 578
    features, target = load_chicken(return_X_y=True)
    assert features.shape == (expected_rows, 3)
    assert target.shape[0] == expected_rows
def test_chickweight_df2_keys():
    """Grouping on ``chick`` yields one fitted estimator per key 1..50."""
    chickens = load_chicken(give_pandas=True)
    grouped = GroupedEstimator(estimator=LinearRegression(), groups="chick")
    grouped.fit(chickens[['time', 'chick']], chickens['weight'])

    expected_keys = set(range(1, 50 + 1))
    fitted_keys = set(grouped.estimators_.keys())
    assert fitted_keys == expected_keys
def test_chickweight_df1_keys():
    """Grouping on ``diet`` yields fitted estimators keyed 1 through 4."""
    chickens = load_chicken(give_pandas=True)
    grouped = GroupedEstimator(estimator=LinearRegression(), groups="diet")
    grouped.fit(chickens[['time', 'diet']], chickens['weight'])

    fitted_keys = set(grouped.estimators_.keys())
    assert fitted_keys == {1, 2, 3, 4}
def test_chickweight_np_keys():
    """Grouping NumPy input by column indices 1 and 2 yields 50 estimators."""
    chickens = load_chicken(as_frame=True)
    X_values = chickens[["time", "chick", "diet"]].values
    y_values = chickens["weight"].values

    grouped = GroupedPredictor(estimator=LinearRegression(), groups=[1, 2])
    grouped.fit(X_values, y_values)

    # there should still only be 50 groups on this dataset
    n_groups = len(grouped.estimators_.keys())
    assert n_groups == 50
def test_chickweight2():
    """``load_chicken(as_frame=True)`` returns an object of shape (578, 4)."""
    frame_shape = load_chicken(as_frame=True).shape
    assert frame_shape == (578, 4)
def test_chickweight2():
    """``load_chicken(give_pandas=True)`` returns an object of shape (578, 4)."""
    frame_shape = load_chicken(give_pandas=True).shape
    assert frame_shape == (578, 4)
def test_chickweight_df2_keys():
    """Grouping on ``chick`` yields one fitted estimator per key 1..50."""
    chickens = load_chicken(as_frame=True)
    grouped = GroupedPredictor(estimator=LinearRegression(), groups="chick")
    grouped.fit(chickens[["time", "chick"]], chickens["weight"])

    expected_keys = set(range(1, 50 + 1))
    fitted_keys = set(grouped.estimators_.keys())
    assert fitted_keys == expected_keys
def test_chickweight_df1_keys():
    """Grouping on ``diet`` yields fitted estimators keyed 1 through 4."""
    chickens = load_chicken(as_frame=True)
    grouped = GroupedPredictor(estimator=LinearRegression(), groups="diet")
    grouped.fit(chickens[["time", "diet"]], chickens["weight"])

    fitted_keys = set(grouped.estimators_.keys())
    assert fitted_keys == {1, 2, 3, 4}
def test_chickweight1():
    """``load_chicken()`` unpacks into 578 samples with 3 features and a target."""
    expected_rows = 578
    features, target = load_chicken()
    assert features.shape == (expected_rows, 3)
    assert target.shape[0] == expected_rows
def test_chickweight_np_keys():
    """Grouping NumPy input by column indices 1 and 2 yields 50 estimators."""
    chickens = load_chicken(give_pandas=True)
    X_values = chickens[['time', 'chick', 'diet']].values
    y_values = chickens['weight'].values

    grouped = GroupedEstimator(estimator=LinearRegression(), groups=[1, 2])
    grouped.fit(X_values, y_values)

    # there should still only be 50 groups on this dataset
    n_groups = len(grouped.estimators_.keys())
    assert n_groups == 50
def test_chickweight_raise_error_cols_missing1():
    """Predicting on a frame without the ``diet`` group column raises KeyError."""
    chickens = load_chicken(give_pandas=True)
    grouped = GroupedEstimator(estimator=LinearRegression(), groups="diet")
    grouped.fit(chickens[['time', 'diet']], chickens['weight'])

    with pytest.raises(KeyError):
        grouped.predict(chickens[['time', 'chick']])