def test_default_fit_transform_no_selected_variables():
    """Check fit_transform output when the selection criteria keep no column.

    A ``quality_score`` minimum of 0.99 is used as selection criterion and the
    transformed output is compared with an empty array that has
    ``X.shape[0]`` rows and zero columns.
    """
    criteria = {"quality_score": {"min": 0.99}}
    process = BinningProcess(variable_names, selection_criteria=criteria)

    X_transform = process.fit_transform(X, y, metric="event_rate")

    expected = np.empty((X.shape[0], 0))
    assert X_transform == approx(expected)
def test_incorrect_target_type():
    """Fitting with a two-dimensional target array must raise ValueError."""
    n_variables = 2
    names = ["var_{}".format(idx) for idx in range(n_variables)]
    features = np.zeros((2, 2))
    target = np.array([[1, 2], [3, 1]])

    process = BinningProcess(names)
    with raises(ValueError):
        process.fit(features, target)
def test_default_fit_transform():
    """Compare fit_transform on column 5 with a standalone OptimalBinning.

    Both transformations use the ``event_rate`` metric and must agree to a
    relative tolerance of 1e-6.
    """
    column = 5
    process = BinningProcess(variable_names)
    X_transform = process.fit_transform(X, y, metric="event_rate")

    x = X[:, column]
    reference = OptimalBinning()
    reference.fit(x, y)

    expected = approx(X_transform[:, column], rel=1e-6)
    assert reference.transform(x, metric="event_rate") == expected
def test_fit_params():
    """Fit with a per-variable ``max_n_bins`` override and inspect the result.

    The binned variable for ``"mean radius"`` must report an ``OPTIMAL``
    status and hold at most four split points.
    """
    name = "mean radius"
    process = BinningProcess(
        variable_names=variable_names,
        binning_fit_params={name: {"max_n_bins": 4}},
    )
    process.fit(X, y)

    optb = process.get_binned_variable(name)
    assert optb.status == "OPTIMAL"
    assert len(optb.splits) <= 4
def test_categorical_variables():
    """Declare ``CHAS`` as categorical and verify the summary reports it."""
    boston = load_boston()
    process = BinningProcess(boston.feature_names,
                             categorical_variables=["CHAS"])
    process.fit(boston.data, boston.target, check_input=True)

    summary = process.summary()
    chas_rows = summary[summary.name == "CHAS"]
    assert chas_rows["dtype"].values[0] == "categorical"
def test_scaling_params():
    """Invalid ``scaling_method_params`` must lead to a ValueError.

    Cases covered: ``pdo_odds`` with only ``pdo`` supplied, ``pdo_odds`` with
    negative ``odds`` and ``scorecard_points``, ``min_max`` with a string
    ``min``, and ``min_max`` with ``min`` larger than ``max``.
    """
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    df = pd.DataFrame(dataset.data, columns=variable_names)
    df["target"] = dataset.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    invalid_cases = [
        ("pdo_odds", {"pdo": 20}),
        ("pdo_odds", {"pdo": 20, "odds": -2, "scorecard_points": -22}),
        ("min_max", {"min": "a", "max": 600}),
        ("min_max", {"min": 900, "max": 600}),
    ]
    for method, params in invalid_cases:
        # Construction and fit both stay inside the raises block.
        with raises(ValueError):
            scorecard = Scorecard(target="target",
                                  binning_process=binning_process,
                                  estimator=estimator,
                                  scaling_method=method,
                                  scaling_method_params=params)
            scorecard.fit(df)
def test_params():
    """Invalid Counterfactual inputs must raise the expected exceptions.

    Covers a ``None`` scorecard, a scorecard that has not been fitted, and
    invalid ``special_missing``, ``n_jobs`` and ``verbose`` values.
    """
    with raises(TypeError):
        Counterfactual(scorecard=None).fit(X_binary)

    with raises(NotFittedError):
        unfitted = Scorecard(
            binning_process=BinningProcess(feature_names_binary),
            estimator=LogisticRegression(),
        )
        Counterfactual(scorecard=unfitted).fit(X_binary)

    bad_options = [
        (TypeError, {"special_missing": 1}),
        (ValueError, {"n_jobs": -1}),
        (TypeError, {"verbose": 1}),
    ]
    for expected_error, options in bad_options:
        with raises(expected_error):
            Counterfactual(scorecard_binary, **options).fit(X_binary)
def test_information():
    """Exercise ``BinningProcess.information`` at every print level.

    A negative print level must raise ValueError. Output for levels 0, 1 and
    2 is redirected into ``tests/test_binning_process_information.txt``.
    """
    dataset = load_breast_cancer()
    process = BinningProcess(dataset.feature_names)
    process.fit(dataset.data, dataset.target, check_input=True)

    with raises(ValueError):
        process.information(print_level=-1)

    output_path = "tests/test_binning_process_information.txt"
    with open(output_path, "w") as f, redirect_stdout(f):
        for level in (0, 1, 2):
            process.information(print_level=level)
def test_params():
    """Invalid Scorecard arguments must raise when built and fitted.

    Each case pairs the expected exception with the keyword arguments given
    to ``Scorecard``; construction and ``fit(X, y)`` both run inside the
    ``raises`` block.
    """
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    X = pd.DataFrame(dataset.data, columns=variable_names)
    y = dataset.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()
    valid = {"binning_process": binning_process, "estimator": estimator}

    cases = [
        # Wrong object types for the two main components.
        (TypeError, {"binning_process": estimator, "estimator": estimator}),
        (TypeError, {"binning_process": binning_process,
                     "estimator": binning_process}),
        # Scaling configuration problems.
        (ValueError, dict(valid, scaling_method="new_method",
                          scaling_method_params={})),
        (ValueError, dict(valid, scaling_method="min_max",
                          scaling_method_params=None)),
        (TypeError, dict(valid, scaling_method="min_max",
                         scaling_method_params=[])),
        # Integer values passed to the remaining options.
        (TypeError, dict(valid, intercept_based=1)),
        (TypeError, dict(valid, reverse_scorecard=1)),
        (TypeError, dict(valid, rounding=1)),
        (TypeError, dict(valid, verbose=1)),
    ]
    for expected_error, kwargs in cases:
        with raises(expected_error):
            scorecard = Scorecard(**kwargs)
            scorecard.fit(X, y)
def _fit_scorecard(target_dtype, X_train, y_train):
    """Build a Scorecard for the training data and fit it.

    A LogisticRegression is used when ``target_dtype`` equals ``"binary"``;
    any other value selects a LinearRegression. All columns of ``X_train``
    are handed to the binning process. Returns the value of
    ``Scorecard.fit(X_train, y_train)``.
    """
    estimator = (LogisticRegression() if target_dtype == "binary"
                 else LinearRegression())
    binning_process = BinningProcess(list(X_train.columns))
    model = Scorecard(binning_process=binning_process, estimator=estimator)
    return model.fit(X_train, y_train)
def test_default():
    """Fit with defaults and check lookups, splits and IV for one variable.

    ``get_binned_variable`` must raise TypeError for a non-string name and
    ValueError for an unknown name. For ``"mean radius"`` the status, split
    points and information value are compared with reference values.
    """
    process = BinningProcess(variable_names)
    process.fit(X, y, check_input=True)

    with raises(TypeError):
        process.get_binned_variable(1)

    with raises(ValueError):
        process.get_binned_variable("new_variable")

    expected_splits = [11.42500019, 12.32999992, 13.09499979,
                       13.70499992, 15.04500008, 16.92500019]

    optb = process.get_binned_variable("mean radius")
    assert optb.status == "OPTIMAL"
    assert optb.splits == approx(expected_splits, rel=1e-6)

    table = optb.binning_table
    table.build()
    assert optb.binning_table.iv == approx(5.04392547, rel=1e-6)
def test_input():
    """A target whose first entry is overwritten with 4 must raise ValueError."""
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    X = pd.DataFrame(dataset.data, columns=variable_names)

    # Overwrite the first label in place with the value 4.
    y = dataset.target
    y[0] = 4

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    with raises(ValueError):
        model = Scorecard(binning_process=binning_process,
                          estimator=estimator)
        model.fit(X, y)
def test_scaling_method_params_continuous_pdo_odds():
    """``pdo_odds`` scaling with a LinearRegression must raise ValueError.

    The scorecard is built with empty ``scaling_method_params`` and fitted on
    the Boston data with the target stored in the ``"target"`` column.
    """
    boston = load_boston()
    variable_names = boston.feature_names
    df = pd.DataFrame(boston.data, columns=variable_names)
    df["target"] = boston.target

    with raises(ValueError):
        estimator = LinearRegression()
        binning_process = BinningProcess(variable_names)
        model = Scorecard(target="target",
                          binning_process=binning_process,
                          estimator=estimator,
                          scaling_method="pdo_odds",
                          scaling_method_params={})
        model.fit(df)
def test_verbose():
    """Fit a verbose scorecard with stdout redirected to a file.

    Output produced during ``fit`` goes to
    ``tests/test_scorecard_verbose.txt``.
    """
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    df = pd.DataFrame(dataset.data, columns=variable_names)
    df["target"] = dataset.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()
    scorecard = Scorecard(target="target",
                          binning_process=binning_process,
                          estimator=estimator,
                          verbose=True)

    with open("tests/test_scorecard_verbose.txt", "w") as f, redirect_stdout(f):
        scorecard.fit(df)
def test_default_transform():
    """Check transform before and after fitting.

    Calling ``transform`` on an unfitted process must raise NotFittedError.
    After fitting, column 5 of the output must match a standalone
    OptimalBinning fitted on the same column.
    """
    column = 5
    process = BinningProcess(variable_names)

    with raises(NotFittedError):
        process.transform(X)

    process.fit(X, y)
    X_transform = process.transform(X)

    x = X[:, column]
    reference = OptimalBinning()
    reference.fit(x, y)

    assert reference.transform(x) == approx(X_transform[:, column], rel=1e-6)
def test_input():
    """A target column whose first entry is set to 4 must raise ValueError."""
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    df = pd.DataFrame(dataset.data, columns=variable_names)

    # Overwrite the first label in place with the value 4 before attaching it.
    labels = dataset.target
    labels[0] = 4
    df["target"] = labels

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    with raises(ValueError):
        model = Scorecard(target="target",
                          binning_process=binning_process,
                          estimator=estimator)
        model.fit(df)
def test_estimator_not_coef():
    """Fitting a scorecard around a RandomForestClassifier must raise RuntimeError."""
    from sklearn.ensemble import RandomForestClassifier

    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    X = pd.DataFrame(dataset.data, columns=variable_names)
    y = dataset.target

    scorecard = Scorecard(binning_process=BinningProcess(variable_names),
                          estimator=RandomForestClassifier())

    with raises(RuntimeError):
        scorecard.fit(X, y)
def test_transform_some_variables():
    """Transform a subset of variables and compare each with OptimalBinning.

    A dict as variable selection must raise TypeError and unknown names must
    raise ValueError. The four selected variables correspond to columns 3-6
    of ``X``; output column ``k`` is compared with input column ``k + 3``.
    """
    process = BinningProcess(variable_names)
    process.fit(X, y)

    with raises(TypeError):
        process.transform(X, {})

    with raises(ValueError):
        process.transform(X, ["new_1", "new_2"])

    selected_variables = ['mean area', 'mean smoothness',
                          'mean compactness', 'mean concavity']
    X_transform = process.transform(X, selected_variables)
    assert X_transform.shape[1] == 4

    for out_col, src_col in enumerate(range(3, 7)):
        x = X[:, src_col]
        reference = OptimalBinning()
        reference.fit(x, y)
        expected = approx(X_transform[:, out_col], rel=1e-6)
        assert reference.transform(x) == expected
def test_default_continuous():
    """Fit a default scorecard with LinearRegression and check the score range.

    The detailed scorecard table is grouped by ``Variable``; the sums of the
    per-variable minimum and maximum ``Points`` are compared with reference
    values.
    """
    data = load_boston()
    variable_names = data.feature_names
    df = pd.DataFrame(data.data, columns=variable_names)
    df["target"] = data.target

    binning_process = BinningProcess(variable_names)
    estimator = LinearRegression()
    scorecard = Scorecard(target="target",
                          binning_process=binning_process,
                          estimator=estimator).fit(df)

    sct = scorecard.table(style="detailed")
    # Aggregate by name: recent pandas versions warn when NumPy callables
    # such as np.min / np.max are passed to agg.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {'Points': ["min", "max"]}).sum()

    assert sc_min == approx(-15.813545796848476, rel=1e-6)
    assert sc_max == approx(85.08156623609487, rel=1e-6)
def buildScoreCard(df, features, labelCol):
    """Fit a Huber-regression scorecard on ``df``, report it and plot it.

    ``features`` is passed to the binning process and ``labelCol`` names the
    target column. After fitting, the scorecard information (print level 2)
    and the summary table are printed, then the scores are scattered against
    ``df[labelCol]`` with the predictions drawn as a line.
    """
    huber = HuberRegressor(max_iter=200)
    scorecard = Scorecard(
        binning_process=BinningProcess(features),
        target=labelCol,
        estimator=huber,
        scaling_method=None,
        scaling_method_params={"min": 0, "max": 100},
        reverse_scorecard=True,
    )
    # Verbosity is switched on after construction, before fitting.
    scorecard.verbose = True
    scorecard.fit(df, check_input=False)

    scorecard.information(print_level=2)
    summary_table = scorecard.table(style="summary")
    print(summary_table)

    scores = scorecard.score(df)
    predictions = scorecard.predict(df)

    plt.scatter(scores, df[labelCol], alpha=0.01, label="Average profit")
    plt.plot(scores, predictions, label="Huber regression", linewidth=2,
             color="orange")
    plt.ylabel("Average profit value (unit=100,000)")
    plt.xlabel("Score")
    plt.legend()
    plt.show()
def test_default_pandas():
    """Fit from a DataFrame and check splits and IV for ``"mean radius"``.

    Passing the frame converted to a dict must raise TypeError.
    """
    df = pd.DataFrame(data.data, columns=data.feature_names)
    process = BinningProcess(variable_names)

    with raises(TypeError):
        process.fit(df.to_dict(), y, check_input=True)

    process.fit(df, y, check_input=True)

    expected_splits = [11.42500019, 12.32999992, 13.09499979,
                       13.70499992, 15.04500008, 16.92500019]

    optb = process.get_binned_variable("mean radius")
    assert optb.status == "OPTIMAL"
    assert optb.splits == approx(expected_splits, rel=1e-6)

    table = optb.binning_table
    table.build()
    assert optb.binning_table.iv == approx(5.04392547, rel=1e-6)
def test_scaling_method_min_max():
    """With ``min_max`` scaling the total score range must be [300, 850].

    The summary table is grouped by ``Variable``; the sums of the
    per-variable minimum and maximum ``Points`` must equal the configured
    ``min`` and ``max``.
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    df = pd.DataFrame(data.data, columns=variable_names)
    df["target"] = data.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()
    scaling_method_params = {"min": 300, "max": 850}

    scorecard = Scorecard(target="target",
                          binning_process=binning_process,
                          estimator=estimator,
                          scaling_method="min_max",
                          scaling_method_params=scaling_method_params).fit(df)

    sct = scorecard.table(style="summary")
    # Aggregate by name: recent pandas versions warn when NumPy callables
    # such as np.min / np.max are passed to agg.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {'Points': ["min", "max"]}).sum()

    assert sc_min == approx(300, rel=1e-6)
    assert sc_max == approx(850, rel=1e-6)
def test_default():
    """Fit a default scorecard with LogisticRegression and check its range.

    An unknown table style must raise ValueError. The summary table is
    grouped by ``Variable``; the sums of the per-variable minimum and maximum
    ``Points`` are compared with reference values.
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    df = pd.DataFrame(data.data, columns=variable_names)
    df["target"] = data.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()
    scorecard = Scorecard(target="target",
                          binning_process=binning_process,
                          estimator=estimator).fit(df)

    with raises(ValueError):
        # No assignment: the call is expected to raise.
        scorecard.table(style="new")

    sct = scorecard.table(style="summary")
    # Aggregate by name: recent pandas versions warn when NumPy callables
    # such as np.min / np.max are passed to agg.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {'Points': ["min", "max"]}).sum()

    assert sc_min == approx(-43.65762593147646, rel=1e-6)
    assert sc_max == approx(42.69694657427327, rel=1e-6)
def test_default_transform_pandas():
    """Check transform with DataFrame input.

    A dict input must raise TypeError, and a DataFrame passed without
    variable names must raise ValueError. With the feature names supplied,
    column 5 of the output must match a standalone OptimalBinning fitted on
    column 5 of ``X``.
    """
    column = 5
    df = pd.DataFrame(data.data, columns=data.feature_names)
    process = BinningProcess(variable_names)
    process.fit(df, y)

    with raises(TypeError):
        process.transform(df.to_dict())

    with raises(ValueError):
        process.transform(df)

    X_transform = process.transform(df, data.feature_names)

    x = X[:, column]
    reference = OptimalBinning()
    reference.fit(x, y)

    assert reference.transform(x) == approx(X_transform[:, column], rel=1e-6)
def test_scaling_method_pdo_odd():
    """Check the total score range produced by ``pdo_odds`` scaling.

    ``odds`` is set to the reciprocal of the target mean. The summary table
    is grouped by ``Variable``; the sums of the per-variable minimum and
    maximum ``Points`` are compared with reference values.
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    df = pd.DataFrame(data.data, columns=variable_names)
    df["target"] = data.target

    odds = 1 / data.target.mean()

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()
    scaling_method_params = {"pdo": 20, "odds": odds, "scorecard_points": 600}

    scorecard = Scorecard(target="target",
                          binning_process=binning_process,
                          estimator=estimator,
                          scaling_method="pdo_odds",
                          scaling_method_params=scaling_method_params).fit(df)

    sct = scorecard.table(style="summary")
    # Aggregate by name: recent pandas versions warn when NumPy callables
    # such as np.min / np.max are passed to agg.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {'Points': ["min", "max"]}).sum()

    assert sc_min == approx(-612.2266586867094, rel=1e-6)
    assert sc_max == approx(1879.4396115559216, rel=1e-6)
def test_rounding():
    """With ``rounding=True`` and ``min_max`` scaling, totals must be 201 and 851.

    The configured limits are 200.52 and 850.66. The summary table is grouped
    by ``Variable`` and the per-variable minimum and maximum ``Points`` are
    summed.
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    X = pd.DataFrame(data.data, columns=variable_names)
    y = data.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()
    scaling_method_params = {"min": 200.52, "max": 850.66}

    scorecard = Scorecard(binning_process=binning_process,
                          estimator=estimator,
                          scaling_method="min_max",
                          scaling_method_params=scaling_method_params,
                          rounding=True).fit(X, y)

    sct = scorecard.table(style="summary")
    # Aggregate by name: recent pandas versions warn when NumPy callables
    # such as np.min / np.max are passed to agg.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {'Points': ["min", "max"]}).sum()

    assert sc_min == approx(201, rel=1e-6)
    assert sc_max == approx(851, rel=1e-6)
def test_default_transform_multiclass():
    """Fit on the wine data and check the multiclass binning path.

    The binned variable for the first feature must be a
    MulticlassOptimalBinning, and column 5 of the transformed output must
    match a standalone MulticlassOptimalBinning fitted on that column.
    """
    wine = load_wine()
    variable_names = wine.feature_names
    X, y = wine.data, wine.target

    process = BinningProcess(variable_names)
    process.fit(X, y)
    X_transform = process.transform(X)

    fitted = process.get_binned_variable(variable_names[0])
    assert isinstance(fitted, MulticlassOptimalBinning)

    column = 5
    x = X[:, column]
    reference = MulticlassOptimalBinning()
    reference.fit(x, y)

    assert reference.transform(x) == approx(X_transform[:, column], rel=1e-6)
def test_rounding_pdo_odds():
    """With ``rounding=True`` and ``pdo_odds`` scaling, totals must be -612 and 1880.

    ``odds`` is set to the reciprocal of the target mean. The summary table
    is grouped by ``Variable`` and the per-variable minimum and maximum
    ``Points`` are summed.
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    X = pd.DataFrame(data.data, columns=variable_names)
    y = data.target

    odds = 1 / data.target.mean()

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()
    scaling_method_params = {"pdo": 20, "odds": odds, "scorecard_points": 600}

    scorecard = Scorecard(binning_process=binning_process,
                          estimator=estimator,
                          scaling_method="pdo_odds",
                          scaling_method_params=scaling_method_params,
                          rounding=True).fit(X, y)

    sct = scorecard.table(style="summary")
    # Aggregate by name: recent pandas versions warn when NumPy callables
    # such as np.min / np.max are passed to agg.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {'Points': ["min", "max"]}).sum()

    assert sc_min == approx(-612, rel=1e-6)
    assert sc_max == approx(1880, rel=1e-6)
def test_information():
    """Exercise ``Scorecard.information`` before and after fitting.

    Calling it on an unfitted scorecard must raise NotFittedError and a
    negative print level must raise ValueError. Output for levels 0, 1 and 2
    is redirected into ``tests/test_scorecard_information.txt``.
    """
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    X = pd.DataFrame(dataset.data, columns=variable_names)
    y = dataset.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()
    scorecard = Scorecard(binning_process=binning_process,
                          estimator=estimator)

    with raises(NotFittedError):
        scorecard.information()

    scorecard.fit(X, y)

    with raises(ValueError):
        scorecard.information(print_level=-1)

    output_path = "tests/test_scorecard_information.txt"
    with open(output_path, "w") as f, redirect_stdout(f):
        for level in (0, 1, 2):
            scorecard.information(print_level=level)
def test_default_transform_continuous():
    """Fit on the Boston data and check the continuous binning path.

    The binned variable for the first feature must be a
    ContinuousOptimalBinning, and column 5 of the output (``mean`` metric)
    must match a standalone ContinuousOptimalBinning fitted on that column.
    """
    boston = load_boston()
    variable_names = boston.feature_names
    X, y = boston.data, boston.target

    process = BinningProcess(variable_names)
    process.fit(X, y)
    X_transform = process.transform(X, metric="mean")

    fitted = process.get_binned_variable(variable_names[0])
    assert isinstance(fitted, ContinuousOptimalBinning)

    column = 5
    x = X[:, column]
    reference = ContinuousOptimalBinning()
    reference.fit(x, y)

    expected = approx(X_transform[:, column], rel=1e-6)
    assert reference.transform(x, metric="mean") == expected