def test_alpha_param2():
    """Check that ``alpha=0.0`` on a DataFrame matches simply dropping the columns.

    The transformed output is compared element-wise (``np.isclose``) against
    the frame with the "b" and "lstat" columns removed.
    """
    feature_names = [
        'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
        'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat'
    ]
    features, _ = load_boston(return_X_y=True)
    frame = pd.DataFrame(features, columns=feature_names)

    transformed = InformationFilter(columns=["b", "lstat"],
                                    alpha=0.0).fit_transform(frame)
    expected = frame.drop(columns=["b", "lstat"]).values

    assert np.isclose(transformed, expected).all()
예제 #2
0
def test_pipeline_gridsearch():
    """Grid-search the filter's ``columns`` inside a pipeline.

    Four candidate column selections are searched, so ``cv_results_`` is
    expected to contain exactly four rows.
    """
    features, target = load_boston(return_X_y=True)

    steps = [
        ("info", InformationFilter(columns=[11, 12])),
        ("model", LinearRegression()),
    ]
    candidates = {"info__columns": [[], [11], [12], [11, 12]]}
    search = GridSearchCV(estimator=Pipeline(steps),
                          param_grid=candidates,
                          cv=2)

    fitted = search.fit(features, target)
    results = pd.DataFrame(fitted.cv_results_)
    assert results.shape[0] == 4
def test_output_orthogonal_general_cols():
    """Filter on each column name in turn and check the dot products.

    For every column, each transformed feature is multiplied element-wise
    with the original column and summed; the result must be below 1E-5.
    """
    features, _ = load_boston(return_X_y=True)
    names = [
        'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
        'ptratio', 'b', 'lstat'
    ]
    frame = pd.DataFrame(features, columns=names)

    for name in names:
        filtered = InformationFilter(columns=name).fit_transform(frame)
        # NOTE(review): the comparison is signed, so a large negative dot
        # product would also pass -- confirm whether abs() was intended.
        dot_products = [(feature * frame[name]).sum()
                        for feature in filtered.T]
        assert all(dot < 1E-5 for dot in dot_products)
def test_output_orthogonal_pandas():
    """Filter "b" and "lstat" from a DataFrame and check the dot products.

    Each transformed feature is multiplied element-wise with the original
    "b" and "lstat" columns and summed; every sum must be below 1E-5.
    """
    feature_names = [
        'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
        'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat'
    ]
    features, _ = load_boston(return_X_y=True)
    frame = pd.DataFrame(features, columns=feature_names)

    filtered = InformationFilter(columns=["b", "lstat"]).fit_transform(frame)

    # NOTE(review): the comparisons are signed, so a large negative dot
    # product would also pass -- confirm whether abs() was intended.
    dots_b = [(feature * frame["b"]).sum() for feature in filtered.T]
    assert all(dot < 1E-5 for dot in dots_b)
    dots_lstat = [(feature * frame["lstat"]).sum() for feature in filtered.T]
    assert all(dot < 1E-5 for dot in dots_lstat)
예제 #5
0
def test_alpha_param2():
    """Check that ``alpha=0.0`` on a DataFrame matches simply dropping the columns.

    The filter output is compared element-wise (``np.isclose``) with the
    frame after removing the "b" and "lstat" columns.
    """
    raw, _ = load_boston(return_X_y=True)
    header = [
        "crim", "zn", "indus", "chas", "nox", "rm", "age",
        "dis", "rad", "tax", "ptratio", "b", "lstat",
    ]
    boston = pd.DataFrame(raw, columns=header)

    info_filter = InformationFilter(columns=["b", "lstat"], alpha=0.0)
    without_sensitive = boston.drop(columns=["b", "lstat"]).values
    filtered = info_filter.fit_transform(boston)

    assert np.isclose(filtered, without_sensitive).all()
예제 #6
0
def test_output_orthogonal_general_cols():
    """Filter on each column name in turn and check the dot products.

    For every column, the element-wise product of each transformed feature
    with the original column is summed and must be below 1e-5.
    """
    raw, _ = load_boston(return_X_y=True)
    header = [
        "crim", "zn", "indus", "chas", "nox", "rm", "age",
        "dis", "rad", "tax", "ptratio", "b", "lstat",
    ]
    boston = pd.DataFrame(raw, columns=header)

    for sensitive in header:
        reference = boston[sensitive]
        filtered = InformationFilter(columns=sensitive).fit_transform(boston)
        # NOTE(review): the comparison is signed, so a large negative dot
        # product would also pass -- confirm whether abs() was intended.
        within_tolerance = [(row * reference).sum() < 1e-5
                            for row in filtered.T]
        assert all(within_tolerance)
예제 #7
0
def test_alpha_param1():
    """Check that ``alpha=0.0`` on an array matches deleting the columns.

    The filter output is compared element-wise (``np.isclose``) with the
    input after deleting columns 11 and 12.
    """
    features, _ = load_boston(return_X_y=True)

    info_filter = InformationFilter(columns=[11, 12], alpha=0.0)
    expected = np.delete(features, [11, 12], axis=1)
    transformed = info_filter.fit_transform(features)

    assert np.isclose(transformed, expected).all()
예제 #8
0
def test_output_orthogonal():
    """Filter columns 11 and 12 from an array and check the dot products.

    Each transformed feature is multiplied element-wise with the original
    columns 11 and 12 and summed; every sum must be below 1e-5.
    """
    features, _ = load_boston(return_X_y=True)
    filtered = InformationFilter(columns=[11, 12]).fit_transform(features)

    # NOTE(review): the comparisons are signed, so a large negative dot
    # product would also pass -- confirm whether abs() was intended.
    dots_11 = [(row * features[:, 11]).sum() for row in filtered.T]
    assert all(dot < 1e-5 for dot in dots_11)
    dots_12 = [(row * features[:, 12]).sum() for row in filtered.T]
    assert all(dot < 1e-5 for dot in dots_12)
예제 #9
0
def test_v_columns_orthogonal():
    """Check the v-vectors built for columns 11 and 12.

    The row-wise product of the returned values, summed over all rows, must
    be approximately zero (absolute tolerance 1e-5).
    """
    features, target = load_boston(return_X_y=True)

    fitted = InformationFilter(columns=[11, 12]).fit(features, target)
    v_vectors = fitted._make_v_vectors(features, [11, 12])

    # presumably one column per requested id, making this the dot product
    # of the two v-vectors -- verify against _make_v_vectors
    row_products = v_vectors.prod(axis=1)
    assert row_products.sum() == pytest.approx(0, abs=1e-5)
예제 #10
0
def test_estimator_checks(test_fn):
    """Call the supplied check with the filter's class name and an instance.

    ``test_fn`` is invoked as ``test_fn(name, estimator)`` with an
    ``InformationFilter`` configured for column 0.
    """
    estimator_name = InformationFilter.__name__
    estimator = InformationFilter(columns=[0])
    test_fn(estimator_name, estimator)
예제 #11
0
def _fold_group_scores(y_true, soft_pred, hard_pred):
    '''
    Score one group of validation rows from a single fold.

    Parameters
    ----------
    y_true : Array
        True labels of the group.
    soft_pred : Array
        Predicted probability of the positive class for the group.
    hard_pred : Array
        Predicted labels for the group.

    Returns
    -------
    scores : dict
        Metric values keyed by the prefixes used in the output columns:
        "auc", "acc", "prec", "rec", "fb" and "pr".

    '''
    # Positive guess rate: mean of the hard predictions
    # (assumes predictions are coded 0/1 -- TODO confirm).
    positive_rate = sum(hard_pred) / len(hard_pred)
    return {
        "auc": metrics.roc_auc_score(y_true, soft_pred),
        "acc": metrics.accuracy_score(y_true, hard_pred),
        "prec": metrics.precision_score(y_true, hard_pred),
        "rec": metrics.recall_score(y_true, hard_pred),
        # beta < 1 weights precision more heavily than recall
        "fb": metrics.fbeta_score(y_true, hard_pred, beta=1 / 3),
        "pr": positive_rate,
    }


def fairness_cv(estimator, X, y, name="model_x", fairness=True):
    '''
    Runs cross validation on model given feature set and target variable.
    Output dataframe with classification metrics split by race.

    Uses a shuffled, stratified 5-fold split with a fixed random state.
    The estimator is refit in place on every fold.

    Parameters
    ----------
    estimator : Classification model object
        Save an instance of any classification model and input. Must
        provide fit, predict and predict_proba.
    X : Array or DataFrame
        Feature set. Race column must be final column; rows where it
        equals 0 are reported as White, all other rows as Black.
    y : Array or DataFrame
        Target variable.
    name : string, optional
        Name of model used in DataFrame. The default is "model_x".
    fairness : Boolean, optional
        True = Apply fairness transformation. The default is True.

    Returns
    -------
    output_df : DataFrame
        One row per fold containing classification metrics split by total,
        Black, and White, plus the "p_percent" positive guess rate ratio.

    '''
    # Prepare data for cross validation
    X, y = np.array(X), np.array(y)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=34)

    column_list = [
        "names", "auc_all", "acc_all", "prec_all", "rec_all", "fb_all",
        "pr_all", "auc_white", "acc_white", "prec_white", "rec_white",
        "fb_white", "pr_white", "auc_black", "acc_black", "prec_black",
        "rec_black", "fb_black", "pr_black", "p_percent"
    ]

    # One record per fold, so the number of rows (and of name entries)
    # always follows the number of splits actually run.
    fold_rows = []

    for train_ind, val_ind in kf.split(X, y):
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind]

        # Standard Scale (fit on the training fold only)
        Scaler = StandardScaler()
        X_std_scale = Scaler.fit_transform(X_train)
        X_val_std_scale = Scaler.transform(X_val)

        if fairness:
            # Transform vectors to be orthogonal to race_black = 1
            FairFilter = InformationFilter(
                len(X_std_scale.T) - 1)  #scales on last column (race_black)
            X_scaled = FairFilter.fit_transform(X_std_scale)
            X_test_scaled = FairFilter.transform(X_val_std_scale)
        else:
            X_scaled = X_std_scale
            X_test_scaled = X_val_std_scale

        estimator.fit(X_scaled, y_train)
        soft_pred = estimator.predict_proba(X_test_scaled)[:, 1]
        hard_pred = estimator.predict(X_test_scaled)

        # Group membership is read from the unscaled race column
        mask_white = X_val[:, -1] == 0
        groups = (
            ("all", slice(None)),
            ("white", mask_white),
            ("black", ~mask_white),
        )

        row = {"names": name}
        for group, selector in groups:
            scores = _fold_group_scores(y_val[selector], soft_pred[selector],
                                        hard_pred[selector])
            for metric, value in scores.items():
                row[metric + "_" + group] = value

        # Positive Guess Rate Ratio: smaller of the two rate ratios
        pr_b, pr_w = row["pr_black"], row["pr_white"]
        row["p_percent"] = min(pr_b / pr_w, pr_w / pr_b)

        fold_rows.append(row)

    output_df = pd.DataFrame(fold_rows, columns=column_list)

    # Guarantee numeric dtypes for every metric column
    column_num = [n for n in output_df.columns if n not in ['names']]
    for column in column_num:
        output_df[column] = pd.to_numeric(output_df[column])

    return output_df
예제 #12
0
def fairness_train_test(estimator, X, y, fairness=True, name="model"):
    '''
    Runs train/test on model given feature set and target variable.
    Output dataframe with classification metrics split by race.

    Holds out a stratified 20% test split with a fixed random state and
    fits the estimator in place on the remaining 80%.

    Parameters
    ----------
    estimator : Classification model object
        Save an instance of any classification model and input. Must
        provide fit, predict and predict_proba.
    X : Array or DataFrame
        Feature set. Race column must be final column. For a DataFrame
        with a "race_black" column that column identifies the groups;
        otherwise the final column is used. Rows where it equals 0 are
        reported as White, all other rows as Black.
    y : Array or DataFrame
        Target variable.
    fairness : Boolean, optional
        True = Apply fairness transformation. The default is True.
    name : string, optional
        Name of model used in DataFrame. The default is "model".

    Returns
    -------
    output_df : DataFrame
        Returns a single-row DataFrame containing classification metrics
        split by total, Black, and White, plus the "p_percent" positive
        guess rate ratio.

    '''
    # Prepare data for final testing
    # Selecting random state to be consistent with initial split for
    # cross validation testing, and for replicability
    X, X_test, y, y_test = train_test_split(X,
                                            y,
                                            test_size=0.2,
                                            random_state=34,
                                            stratify=y)

    # Standard Scale (fit on the training split only)
    Scaler = StandardScaler()
    X_std_scale = Scaler.fit_transform(X)
    X_test_std_scale = Scaler.transform(X_test)

    if fairness:
        # Transform vectors to be orthogonal to race_black = 1
        FairFilter = InformationFilter(len(X_std_scale.T) -
                                       1)  #scales on last column (race_black)
        X_scaled = FairFilter.fit_transform(X_std_scale)
        X_test_scaled = FairFilter.transform(X_test_std_scale)
    else:
        X_scaled = X_std_scale
        X_test_scaled = X_test_std_scale

    estimator.fit(X_scaled, y)
    soft_pred = estimator.predict_proba(X_test_scaled)[:, 1]
    hard_pred = estimator.predict(X_test_scaled)
    pr_all = sum(hard_pred) / len(hard_pred)

    # Read group membership from the unscaled test features. Array input
    # has no column labels, so fall back to the final column, which is
    # where the race column is documented to live.
    if isinstance(X_test, pd.DataFrame) and "race_black" in X_test.columns:
        race_values = X_test["race_black"].values
    else:
        race_values = np.asarray(X_test)[:, -1]
    # Plain boolean array so it indexes arrays and pandas objects alike
    mask_white = race_values == 0

    y_test_w = y_test[mask_white]
    soft_pred_w = soft_pred[mask_white]
    hard_pred_w = hard_pred[mask_white]
    pr_white = sum(hard_pred_w) / len(hard_pred_w)

    y_test_b = y_test[~mask_white]
    soft_pred_b = soft_pred[~mask_white]
    hard_pred_b = hard_pred[~mask_white]
    pr_black = sum(hard_pred_b) / len(hard_pred_b)

    # AUC
    auc_all = metrics.roc_auc_score(y_test, soft_pred)
    auc_white = metrics.roc_auc_score(y_test_w, soft_pred_w)
    auc_black = metrics.roc_auc_score(y_test_b, soft_pred_b)

    # Accuracy
    acc_all = metrics.accuracy_score(y_test, hard_pred)
    acc_white = metrics.accuracy_score(y_test_w, hard_pred_w)
    acc_black = metrics.accuracy_score(y_test_b, hard_pred_b)

    # Precision
    prec_all = metrics.precision_score(y_test, hard_pred)
    prec_white = metrics.precision_score(y_test_w, hard_pred_w)
    prec_black = metrics.precision_score(y_test_b, hard_pred_b)

    # Recall
    rec_all = metrics.recall_score(y_test, hard_pred)
    rec_white = metrics.recall_score(y_test_w, hard_pred_w)
    rec_black = metrics.recall_score(y_test_b, hard_pred_b)

    # Fbeta (beta < 1 weights precision more heavily than recall)
    fb_all = metrics.fbeta_score(y_test, hard_pred, beta=1 / 3)
    fb_white = metrics.fbeta_score(y_test_w, hard_pred_w, beta=1 / 3)
    fb_black = metrics.fbeta_score(y_test_b, hard_pred_b, beta=1 / 3)

    # Positive Guess Rate Ratio: smaller of the two rate ratios
    p_percent = min(pr_black / pr_white, pr_white / pr_black)

    out_list = [[
        name, auc_all, acc_all, prec_all, rec_all, fb_all, pr_all, auc_white,
        acc_white, prec_white, rec_white, fb_white, pr_white, auc_black,
        acc_black, prec_black, rec_black, fb_black, pr_black, p_percent
    ]]

    column_list = [
        "name", "auc_all", "acc_all", "prec_all", "rec_all", "fb_all",
        "pr_all", "auc_white", "acc_white", "prec_white", "rec_white",
        "fb_white", "pr_white", "auc_black", "acc_black", "prec_black",
        "rec_black", "fb_black", "pr_black", "p_percent"
    ]

    output_df = pd.DataFrame(out_list, columns=column_list)

    # Guarantee numeric dtypes for every metric column
    column_num = [n for n in output_df.columns if n not in ['name']]
    for column in column_num:
        output_df[column] = pd.to_numeric(output_df[column])

    return output_df
예제 #13
0
# Read in X_train and y_train
X = pd.read_pickle("pickles/X_train.p")
y = pd.read_pickle("pickles/y_train.p")

# Define fbeta scorer, give 1/3 weight to recall vs precision
fb_scorer = make_scorer(fbeta_score, beta=1 / 3)

# Find best parameters for Logistic Regression.
# The grid is split per solver because lbfgs only supports the l2 penalty;
# crossing it with l1 would produce candidates that fail to fit and are
# scored as NaN.
params = [
    {
        'model__C': [.0001, .001, .01, .1, 1],
        'model__penalty': ['l2'],
        'model__solver': ['lbfgs']
    },
    {
        'model__C': [.0001, .001, .01, .1, 1],
        'model__penalty': ['l1', 'l2'],
        'model__solver': ['liblinear']
    },
]

# NOTE(review): InformationFilter(4) filters on column index 4 --
# presumably the race column of X; verify against the pickled data.
pipeline = Pipeline([('std_scale', StandardScaler()),
                     ('info_scale', InformationFilter(4)),
                     ('model', LogisticRegression())])

grid = GridSearchCV(pipeline,
                    cv=5,
                    n_jobs=-1,
                    param_grid=params,
                    scoring=fb_scorer)
grid.fit(X, y)
# Bare expressions: only displayed when run interactively (e.g. notebook)
grid.best_score_
grid.best_params_

# Find best parameters for KNN
params = {
    'model__n_neighbors': np.linspace(10, 500, 50, dtype=int),
    'model__weights': ['uniform', 'distance']