예제 #1
0
def test_default_fit_transform_no_selected_variables():
    """fit_transform output has zero columns under a 0.99 quality cut-off.

    The selection criterion demands a minimum ``quality_score`` of 0.99 and
    the test expects the transformed data to compare equal to an empty
    array with one row per sample of ``X`` and no columns.
    """
    criteria = {"quality_score": {"min": 0.99}}
    process = BinningProcess(variable_names, selection_criteria=criteria)

    transformed = process.fit_transform(X, y, metric="event_rate")

    n_samples = X.shape[0]
    expected = np.empty(0).reshape((n_samples, 0))
    assert transformed == approx(expected)
예제 #2
0
def test_incorrect_target_type():
    """Fitting with a two-dimensional (2x2) target must raise ValueError."""
    names = ["var_{}".format(idx) for idx in range(2)]
    features = np.zeros((2, 2))
    target = np.array([[1, 2], [3, 1]])

    process = BinningProcess(names)
    with raises(ValueError):
        process.fit(features, target)
예제 #3
0
def test_default_fit_transform():
    """fit_transform agrees with a standalone OptimalBinning on column 5.

    Both are evaluated with ``metric="event_rate"`` and compared with a
    relative tolerance of 1e-6.
    """
    column = 5
    metric = "event_rate"

    process = BinningProcess(variable_names)
    process_output = process.fit_transform(X, y, metric=metric)

    # Reference: bin the same column on its own.
    reference = OptimalBinning()
    feature = X[:, column]
    reference.fit(feature, y)
    reference_output = reference.transform(feature, metric=metric)

    assert reference_output == approx(process_output[:, column], rel=1e-6)
예제 #4
0
def test_fit_params():
    """With max_n_bins=4 for 'mean radius' the binning has at most 4 splits.

    The fitted binning for that variable must also report status
    ``"OPTIMAL"``.
    """
    target_variable = "mean radius"
    fit_params = {target_variable: {"max_n_bins": 4}}

    process = BinningProcess(variable_names=variable_names,
                             binning_fit_params=fit_params)
    process.fit(X, y)

    binned = process.get_binned_variable(target_variable)

    assert binned.status == "OPTIMAL"
    assert len(binned.splits) <= 4
예제 #5
0
def test_categorical_variables():
    """A variable listed as categorical is reported with dtype 'categorical'.

    ``"CHAS"`` is passed through ``categorical_variables`` and the summary
    row for it is checked.

    NOTE(review): ``load_boston`` was removed in scikit-learn 1.2, so this
    test presumably needs an older scikit-learn -- confirm.
    """
    boston = load_boston()

    process = BinningProcess(boston.feature_names,
                             categorical_variables=["CHAS"])
    process.fit(boston.data, boston.target, check_input=True)

    summary = process.summary()
    chas_rows = summary[summary.name == "CHAS"]
    assert chas_rows["dtype"].values[0] == "categorical"
예제 #6
0
def test_scaling_params():
    """Scorecard.fit raises ValueError for each listed scaling parameter set.

    Every case pairs a ``scaling_method`` with the ``scaling_method_params``
    dictionary that the test expects to be rejected.
    """
    dataset = load_breast_cancer()

    variable_names = dataset.feature_names
    df = pd.DataFrame(dataset.data, columns=variable_names)
    df["target"] = dataset.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    rejected_cases = [
        ("pdo_odds", {"pdo": 20}),
        ("pdo_odds", {"pdo": 20, "odds": -2, "scorecard_points": -22}),
        ("min_max", {"min": "a", "max": 600}),
        ("min_max", {"min": 900, "max": 600}),
    ]

    # The same binning process and estimator objects are reused by every
    # case; construction and fit both happen inside the ``raises`` block.
    for method, params in rejected_cases:
        with raises(ValueError):
            scorecard = Scorecard(target="target",
                                  binning_process=binning_process,
                                  estimator=estimator, scaling_method=method,
                                  scaling_method_params=params)
            scorecard.fit(df)
예제 #7
0
def test_params():
    """Counterfactual.fit raises for an invalid scorecard or bad options."""
    # scorecard=None -> TypeError.
    with raises(TypeError):
        Counterfactual(scorecard=None).fit(X_binary)

    # A Scorecard on which fit was never called -> NotFittedError.
    with raises(NotFittedError):
        unfitted_scorecard = Scorecard(
            binning_process=BinningProcess(feature_names_binary),
            estimator=LogisticRegression())
        Counterfactual(scorecard=unfitted_scorecard).fit(X_binary)

    # Remaining cases use the module-level ``scorecard_binary`` together
    # with a single option value the test expects to be rejected.
    rejected_options = [
        (TypeError, {"special_missing": 1}),
        (ValueError, {"n_jobs": -1}),
        (TypeError, {"verbose": 1}),
    ]
    for expected_error, option in rejected_options:
        with raises(expected_error):
            Counterfactual(scorecard_binary, **option).fit(X_binary)
예제 #8
0
def test_information():
    """information() rejects print_level=-1 and runs for levels 0, 1 and 2.

    Output of the three valid calls is redirected into
    ``tests/test_binning_process_information.txt``.
    """
    dataset = load_breast_cancer()

    process = BinningProcess(dataset.feature_names)
    process.fit(dataset.data, dataset.target, check_input=True)

    with raises(ValueError):
        process.information(print_level=-1)

    # Relative path: resolved against the current working directory.
    with open("tests/test_binning_process_information.txt", "w") as out, \
            redirect_stdout(out):
        for level in (0, 1, 2):
            process.information(print_level=level)
def test_params():
    """Scorecard.fit raises for each invalid constructor argument.

    Each case lists the expected exception type next to the keyword
    arguments handed to ``Scorecard``; construction and ``fit`` both run
    inside the ``raises`` block.
    """
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    X = pd.DataFrame(dataset.data, columns=variable_names)
    y = dataset.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    # Baseline keyword arguments shared by the cases that only vary one
    # additional option.
    base = {"binning_process": binning_process, "estimator": estimator}

    rejected_cases = [
        # Swapped / wrong object types for the two main components.
        (TypeError, {"binning_process": estimator, "estimator": estimator}),
        (TypeError, {"binning_process": binning_process,
                     "estimator": binning_process}),
        # Scaling configuration.
        (ValueError, dict(base, scaling_method="new_method",
                          scaling_method_params=dict())),
        (ValueError, dict(base, scaling_method="min_max",
                          scaling_method_params=None)),
        (TypeError, dict(base, scaling_method="min_max",
                         scaling_method_params=[])),
        # Options given the integer 1.
        (TypeError, dict(base, intercept_based=1)),
        (TypeError, dict(base, reverse_scorecard=1)),
        (TypeError, dict(base, rounding=1)),
        (TypeError, dict(base, verbose=1)),
    ]

    for expected_error, kwargs in rejected_cases:
        with raises(expected_error):
            scorecard = Scorecard(**kwargs)
            scorecard.fit(X, y)
def _fit_scorecard(target_dtype, X_train, y_train):
    """Create a Scorecard over all columns of ``X_train`` and fit it.

    Parameters
    ----------
    target_dtype
        ``"binary"`` selects ``LogisticRegression``; any other value
        selects ``LinearRegression``.
    X_train
        Training data; its ``columns`` are used as the variable names.
    y_train
        Target passed to ``Scorecard.fit``.

    Returns
    -------
    The value returned by ``Scorecard.fit(X_train, y_train)``.
    """
    estimator = (LogisticRegression() if target_dtype == "binary"
                 else LinearRegression())

    column_names = list(X_train.columns)
    scorecard = Scorecard(binning_process=BinningProcess(column_names),
                          estimator=estimator)
    return scorecard.fit(X_train, y_train)
예제 #11
0
def test_default():
    """Default fit: name lookup errors plus 'mean radius' splits and IV.

    ``get_binned_variable`` must raise TypeError for the integer ``1`` and
    ValueError for ``"new_variable"``; the binning of ``"mean radius"`` is
    then compared with reference splits and a reference IV.
    """
    process = BinningProcess(variable_names)
    process.fit(X, y, check_input=True)

    lookup_errors = ((1, TypeError), ("new_variable", ValueError))
    for bad_name, expected_error in lookup_errors:
        with raises(expected_error):
            process.get_binned_variable(bad_name)

    binned = process.get_binned_variable("mean radius")
    expected_splits = [11.42500019, 12.32999992, 13.09499979,
                       13.70499992, 15.04500008, 16.92500019]

    assert binned.status == "OPTIMAL"
    assert binned.splits == approx(expected_splits, rel=1e-6)

    # The table is built explicitly before its IV is read.
    binned.binning_table.build()
    assert binned.binning_table.iv == approx(5.04392547, rel=1e-6)
def test_input():
    """Scorecard.fit raises ValueError once the target's first entry is 4."""
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    X = pd.DataFrame(dataset.data, columns=variable_names)

    # ``y`` is the dataset's own target array, so this edits it in place.
    y = dataset.target
    y[0] = 4

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    with raises(ValueError):
        Scorecard(binning_process=binning_process,
                  estimator=estimator).fit(X, y)
예제 #13
0
def test_scaling_method_params_continuous_pdo_odds():
    """'pdo_odds' scaling with LinearRegression on Boston raises ValueError.

    The scorecard is configured with ``scaling_method="pdo_odds"`` and an
    empty ``scaling_method_params`` dictionary.

    NOTE(review): ``load_boston`` was removed in scikit-learn 1.2, so this
    test presumably needs an older scikit-learn -- confirm.
    """
    boston = load_boston()
    variable_names = boston.feature_names
    df = pd.DataFrame(boston.data, columns=variable_names)
    df["target"] = boston.target

    with raises(ValueError):
        estimator = LinearRegression()
        binning_process = BinningProcess(variable_names)

        Scorecard(target="target", binning_process=binning_process,
                  estimator=estimator, scaling_method="pdo_odds",
                  scaling_method_params={}).fit(df)
예제 #14
0
def test_verbose():
    """Fit a Scorecard created with verbose=True, stdout sent to a file.

    Standard output produced during ``fit`` is redirected into
    ``tests/test_scorecard_verbose.txt``; the test makes no assertion
    beyond ``fit`` completing.
    """
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    df = pd.DataFrame(dataset.data, columns=variable_names)
    df["target"] = dataset.target

    scorecard = Scorecard(target="target",
                          binning_process=BinningProcess(variable_names),
                          estimator=LogisticRegression(), verbose=True)

    # Relative path: resolved against the current working directory.
    with open("tests/test_scorecard_verbose.txt", "w") as log_file, \
            redirect_stdout(log_file):
        scorecard.fit(df)
예제 #15
0
def test_default_transform():
    """transform needs a prior fit and matches OptimalBinning on column 5.

    Calling ``transform`` before ``fit`` must raise ``NotFittedError``.
    """
    process = BinningProcess(variable_names)

    with raises(NotFittedError):
        process.transform(X)

    process.fit(X, y)
    process_output = process.transform(X)

    # Reference: bin the same column on its own.
    column = 5
    feature = X[:, column]
    reference = OptimalBinning()
    reference.fit(feature, y)

    assert reference.transform(feature) == approx(process_output[:, column],
                                                  rel=1e-6)
예제 #16
0
def test_input():
    """Scorecard.fit raises ValueError once the target's first entry is 4.

    The target is supplied as the ``"target"`` column of the data frame.
    """
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    df = pd.DataFrame(dataset.data, columns=variable_names)

    # ``labels`` is the dataset's own target array, edited in place before
    # being attached to the frame.
    labels = dataset.target
    labels[0] = 4
    df["target"] = labels

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    with raises(ValueError):
        Scorecard(target="target", binning_process=binning_process,
                  estimator=estimator).fit(df)
def test_estimator_not_coef():
    """Scorecard.fit with a RandomForestClassifier must raise RuntimeError."""
    from sklearn.ensemble import RandomForestClassifier

    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    features = pd.DataFrame(dataset.data, columns=variable_names)
    labels = dataset.target

    scorecard = Scorecard(binning_process=BinningProcess(variable_names),
                          estimator=RandomForestClassifier())

    with raises(RuntimeError):
        scorecard.fit(features, labels)
예제 #18
0
def test_transform_some_variables():
    """transform restricted to four variables returns those four columns.

    Passing a dict as the variable selection must raise TypeError and
    passing names ``"new_1"``/``"new_2"`` must raise ValueError.  Each
    output column is then compared with a standalone ``OptimalBinning``
    fitted on columns 3 to 6 of ``X``.
    """
    process = BinningProcess(variable_names)
    process.fit(X, y)

    with raises(TypeError):
        process.transform(X, {})

    with raises(ValueError):
        process.transform(X, ["new_1", "new_2"])

    selected_variables = [
        'mean area', 'mean smoothness', 'mean compactness', 'mean concavity'
    ]

    subset_output = process.transform(X, selected_variables)
    assert subset_output.shape[1] == 4

    # Output column k is checked against column 3 + k of X.
    for out_col, src_col in enumerate(range(3, 7)):
        reference = OptimalBinning()
        feature = X[:, src_col]
        reference.fit(feature, y)

        assert reference.transform(feature) == approx(
            subset_output[:, out_col], rel=1e-6)
예제 #19
0
def test_default_continuous():
    """Check the total min/max points of a default continuous scorecard.

    A ``Scorecard`` with a ``LinearRegression`` estimator is fitted on the
    Boston data.  The per-variable minimum and maximum ``Points`` of the
    detailed table are summed over all variables and both totals are
    compared with reference values (relative tolerance 1e-6).

    NOTE(review): ``load_boston`` was removed in scikit-learn 1.2, so this
    test presumably needs an older scikit-learn -- confirm.
    """
    data = load_boston()
    variable_names = data.feature_names
    df = pd.DataFrame(data.data, columns=variable_names)
    df["target"] = data.target

    binning_process = BinningProcess(variable_names)
    estimator = LinearRegression()

    scorecard = Scorecard(target="target", binning_process=binning_process,
                          estimator=estimator).fit(df)

    sct = scorecard.table(style="detailed")

    # Aggregations are named by string: passing np.min/np.max to
    # groupby().agg() emits a FutureWarning on pandas >= 2.1.  The result
    # keeps the (min, max) order, so positional unpacking is unchanged.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {"Points": ["min", "max"]}).sum()

    assert sc_min == approx(-15.813545796848476, rel=1e-6)
    assert sc_max == approx(85.08156623609487, rel=1e-6)
예제 #20
0
def buildScoreCard(df, features, labelCol):
    """Fit a Huber-regression scorecard on ``df``, report it and plot it.

    Parameters
    ----------
    df
        Data passed to ``Scorecard.fit``, ``score`` and ``predict``; it is
        also indexed with ``labelCol`` for the scatter plot.
    features
        Variable names handed to ``BinningProcess``.
    labelCol
        Name of the target column.

    Returns
    -------
    None.  The function prints the scorecard information and summary
    table and shows a matplotlib figure.
    """
    binning_process = BinningProcess(features)
    huber = HuberRegressor(max_iter=200)

    scorecard = Scorecard(
        binning_process=binning_process,
        target=labelCol,
        estimator=huber,
        scaling_method=None,
        scaling_method_params={"min": 0, "max": 100},
        reverse_scorecard=True,
    )
    # Verbosity is switched on by attribute after construction.
    scorecard.verbose = True
    scorecard.fit(df, check_input=False)

    scorecard.information(print_level=2)
    summary_table = scorecard.table(style="summary")
    print(summary_table)

    scores = scorecard.score(df)
    predictions = scorecard.predict(df)

    # Observed targets as a faint scatter, model predictions as a line.
    plt.scatter(scores, df[labelCol], alpha=0.01, label="Average profit")
    plt.plot(scores, predictions, label="Huber regression", linewidth=2,
             color="orange")
    plt.ylabel("Average profit value (unit=100,000)")
    plt.xlabel("Score")
    plt.legend()
    plt.show()
예제 #21
0
def test_default_pandas():
    """Fitting on a DataFrame works; fitting on a plain dict raises TypeError.

    After the DataFrame fit, the binning of ``"mean radius"`` is compared
    with reference splits and a reference IV.
    """
    frame = pd.DataFrame(data.data, columns=data.feature_names)
    process = BinningProcess(variable_names)

    with raises(TypeError):
        process.fit(frame.to_dict(), y, check_input=True)

    process.fit(frame, y, check_input=True)

    binned = process.get_binned_variable("mean radius")
    expected_splits = [11.42500019, 12.32999992, 13.09499979,
                       13.70499992, 15.04500008, 16.92500019]

    assert binned.status == "OPTIMAL"
    assert binned.splits == approx(expected_splits, rel=1e-6)

    # The table is built explicitly before its IV is read.
    binned.binning_table.build()
    assert binned.binning_table.iv == approx(5.04392547, rel=1e-6)
예제 #22
0
def test_scaling_method_min_max():
    """'min_max' scaling with min=300, max=850 yields those total bounds.

    The per-variable minimum and maximum ``Points`` of the summary table
    are summed over all variables; the totals must equal 300 and 850
    (relative tolerance 1e-6).
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    df = pd.DataFrame(data.data, columns=variable_names)
    df["target"] = data.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    scaling_method_params = {"min": 300, "max": 850}

    scorecard = Scorecard(target="target", binning_process=binning_process,
                          estimator=estimator, scaling_method="min_max",
                          scaling_method_params=scaling_method_params).fit(df)

    sct = scorecard.table(style="summary")

    # Aggregations are named by string: passing np.min/np.max to
    # groupby().agg() emits a FutureWarning on pandas >= 2.1.  The result
    # keeps the (min, max) order, so positional unpacking is unchanged.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {"Points": ["min", "max"]}).sum()

    assert sc_min == approx(300, rel=1e-6)
    assert sc_max == approx(850, rel=1e-6)
예제 #23
0
def test_default():
    """Default Scorecard: bad table style raises, totals match references.

    ``table(style="new")`` must raise ValueError.  For the summary table,
    the per-variable minimum and maximum ``Points`` are summed over all
    variables and compared with reference totals (relative tolerance
    1e-6).
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    df = pd.DataFrame(data.data, columns=variable_names)
    df["target"] = data.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    scorecard = Scorecard(target="target", binning_process=binning_process,
                          estimator=estimator).fit(df)

    # Only the raising call lives in the block; its result is never used.
    with raises(ValueError):
        scorecard.table(style="new")

    sct = scorecard.table(style="summary")

    # Aggregations are named by string: passing np.min/np.max to
    # groupby().agg() emits a FutureWarning on pandas >= 2.1.  The result
    # keeps the (min, max) order, so positional unpacking is unchanged.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {"Points": ["min", "max"]}).sum()

    assert sc_min == approx(-43.65762593147646, rel=1e-6)
    assert sc_max == approx(42.69694657427327, rel=1e-6)
예제 #24
0
def test_default_transform_pandas():
    """transform on a DataFrame: input validation and agreement on column 5.

    A plain dict must raise TypeError and a DataFrame passed without a
    variable list must raise ValueError.  With ``data.feature_names`` as
    the variable list, column 5 of the output is compared with a
    standalone ``OptimalBinning``.
    """
    frame = pd.DataFrame(data.data, columns=data.feature_names)

    process = BinningProcess(variable_names)
    process.fit(frame, y)

    with raises(TypeError):
        process.transform(frame.to_dict())

    with raises(ValueError):
        process.transform(frame)

    process_output = process.transform(frame, data.feature_names)

    # Reference: bin the same column of the module-level array X.
    column = 5
    feature = X[:, column]
    reference = OptimalBinning()
    reference.fit(feature, y)

    assert reference.transform(feature) == approx(process_output[:, column],
                                                  rel=1e-6)
예제 #25
0
def test_scaling_method_pdo_odd():
    """'pdo_odds' scaling reproduces the reference total min/max points.

    The scaling parameters are ``pdo=20``, ``scorecard_points=600`` and
    ``odds`` set to the reciprocal of the target mean.  The per-variable
    minimum and maximum ``Points`` of the summary table are summed over
    all variables and compared with reference totals (relative tolerance
    1e-6).
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    df = pd.DataFrame(data.data, columns=variable_names)
    df["target"] = data.target
    odds = 1 / data.target.mean()

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    scaling_method_params = {"pdo": 20, "odds": odds, "scorecard_points": 600}

    scorecard = Scorecard(target="target", binning_process=binning_process,
                          estimator=estimator, scaling_method="pdo_odds",
                          scaling_method_params=scaling_method_params).fit(df)

    sct = scorecard.table(style="summary")

    # Aggregations are named by string: passing np.min/np.max to
    # groupby().agg() emits a FutureWarning on pandas >= 2.1.  The result
    # keeps the (min, max) order, so positional unpacking is unchanged.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {"Points": ["min", "max"]}).sum()

    assert sc_min == approx(-612.2266586867094, rel=1e-6)
    assert sc_max == approx(1879.4396115559216, rel=1e-6)
def test_rounding():
    """rounding=True with 'min_max' bounds 200.52/850.66 gives 201 and 851.

    The per-variable minimum and maximum ``Points`` of the summary table
    are summed over all variables and compared with 201 and 851 (relative
    tolerance 1e-6).
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    X = pd.DataFrame(data.data, columns=variable_names)
    y = data.target

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    scaling_method_params = {"min": 200.52, "max": 850.66}

    scorecard = Scorecard(binning_process=binning_process,
                          estimator=estimator, scaling_method="min_max",
                          scaling_method_params=scaling_method_params,
                          rounding=True).fit(X, y)

    sct = scorecard.table(style="summary")

    # Aggregations are named by string: passing np.min/np.max to
    # groupby().agg() emits a FutureWarning on pandas >= 2.1.  The result
    # keeps the (min, max) order, so positional unpacking is unchanged.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {"Points": ["min", "max"]}).sum()

    assert sc_min == approx(201, rel=1e-6)
    assert sc_max == approx(851, rel=1e-6)
예제 #27
0
def test_default_transform_multiclass():
    """On the wine data the process uses MulticlassOptimalBinning.

    The binning object of the first variable must be a
    ``MulticlassOptimalBinning`` and column 5 of the transformed data must
    agree with a standalone ``MulticlassOptimalBinning``.
    """
    wine = load_wine()
    variable_names = wine.feature_names
    features = wine.data
    labels = wine.target

    process = BinningProcess(variable_names)
    process.fit(features, labels)
    process_output = process.transform(features)

    first_binned = process.get_binned_variable(variable_names[0])
    assert isinstance(first_binned, MulticlassOptimalBinning)

    # Reference: bin the same column on its own.
    column = 5
    feature = features[:, column]
    reference = MulticlassOptimalBinning()
    reference.fit(feature, labels)
    assert reference.transform(feature) == approx(process_output[:, column],
                                                  rel=1e-6)
def test_rounding_pdo_odds():
    """rounding=True with 'pdo_odds' scaling gives totals -612 and 1880.

    The scaling parameters are ``pdo=20``, ``scorecard_points=600`` and
    ``odds`` set to the reciprocal of the target mean.  The per-variable
    minimum and maximum ``Points`` of the summary table are summed over
    all variables and compared with -612 and 1880 (relative tolerance
    1e-6).
    """
    data = load_breast_cancer()
    variable_names = data.feature_names
    X = pd.DataFrame(data.data, columns=variable_names)
    y = data.target
    odds = 1 / data.target.mean()

    binning_process = BinningProcess(variable_names)
    estimator = LogisticRegression()

    scaling_method_params = {"pdo": 20, "odds": odds, "scorecard_points": 600}

    scorecard = Scorecard(binning_process=binning_process,
                          estimator=estimator, scaling_method="pdo_odds",
                          scaling_method_params=scaling_method_params,
                          rounding=True).fit(X, y)

    sct = scorecard.table(style="summary")

    # Aggregations are named by string: passing np.min/np.max to
    # groupby().agg() emits a FutureWarning on pandas >= 2.1.  The result
    # keeps the (min, max) order, so positional unpacking is unchanged.
    sc_min, sc_max = sct.groupby("Variable").agg(
        {"Points": ["min", "max"]}).sum()

    assert sc_min == approx(-612, rel=1e-6)
    assert sc_max == approx(1880, rel=1e-6)
def test_information():
    """Scorecard.information: fit required, level validated, output saved.

    Calling ``information`` before ``fit`` must raise ``NotFittedError``
    and ``print_level=-1`` must raise ValueError.  Output for levels 0, 1
    and 2 is redirected into ``tests/test_scorecard_information.txt``.
    """
    dataset = load_breast_cancer()
    variable_names = dataset.feature_names
    features = pd.DataFrame(dataset.data, columns=variable_names)
    labels = dataset.target

    scorecard = Scorecard(binning_process=BinningProcess(variable_names),
                          estimator=LogisticRegression())

    with raises(NotFittedError):
        scorecard.information()

    scorecard.fit(features, labels)

    with raises(ValueError):
        scorecard.information(print_level=-1)

    # Relative path: resolved against the current working directory.
    with open("tests/test_scorecard_information.txt", "w") as out, \
            redirect_stdout(out):
        for level in (0, 1, 2):
            scorecard.information(print_level=level)
예제 #30
0
def test_default_transform_continuous():
    """On the Boston data the process uses ContinuousOptimalBinning.

    The binning object of the first variable must be a
    ``ContinuousOptimalBinning`` and column 5 of the data transformed with
    ``metric="mean"`` must agree with a standalone
    ``ContinuousOptimalBinning``.

    NOTE(review): ``load_boston`` was removed in scikit-learn 1.2, so this
    test presumably needs an older scikit-learn -- confirm.
    """
    boston = load_boston()
    variable_names = boston.feature_names
    features = boston.data
    target = boston.target

    process = BinningProcess(variable_names)
    process.fit(features, target)
    process_output = process.transform(features, metric="mean")

    first_binned = process.get_binned_variable(variable_names[0])
    assert isinstance(first_binned, ContinuousOptimalBinning)

    # Reference: bin the same column on its own.
    column = 5
    feature = features[:, column]
    reference = ContinuousOptimalBinning()
    reference.fit(feature, target)
    reference_output = reference.transform(feature, metric="mean")
    assert reference_output == approx(process_output[:, column], rel=1e-6)