import numpy as np
import pandas as pd
import pytest
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklego.preprocessing import InformationFilter


def test_alpha_param2():
    X, y = load_boston(return_X_y=True)
    df = pd.DataFrame(
        X,
        columns=[
            "crim", "zn", "indus", "chas", "nox", "rm", "age",
            "dis", "rad", "tax", "ptratio", "b", "lstat",
        ],
    )
    ifilter = InformationFilter(columns=["b", "lstat"], alpha=0.0)
    X_removed = df.drop(columns=["b", "lstat"]).values
    assert np.isclose(ifilter.fit_transform(df), X_removed).all()
def test_pipeline_gridsearch():
    X, y = load_boston(return_X_y=True)
    pipe = Pipeline([
        ("info", InformationFilter(columns=[11, 12])),
        ("model", LinearRegression()),
    ])
    mod = GridSearchCV(
        estimator=pipe,
        param_grid={"info__columns": [[], [11], [12], [11, 12]]},
        cv=2,
    )
    assert pd.DataFrame(mod.fit(X, y).cv_results_).shape[0] == 4
def test_output_orthogonal_general_cols():
    X, y = load_boston(return_X_y=True)
    cols = [
        "crim", "zn", "indus", "chas", "nox", "rm", "age",
        "dis", "rad", "tax", "ptratio", "b", "lstat",
    ]
    df = pd.DataFrame(X, columns=cols)
    for col in cols:
        X_fair = InformationFilter(columns=col).fit_transform(df)
        assert all((c * df[col]).sum() < 1e-5 for c in X_fair.T)
def test_output_orthogonal_pandas():
    X, y = load_boston(return_X_y=True)
    df = pd.DataFrame(
        X,
        columns=[
            "crim", "zn", "indus", "chas", "nox", "rm", "age",
            "dis", "rad", "tax", "ptratio", "b", "lstat",
        ],
    )
    X_fair = InformationFilter(columns=["b", "lstat"]).fit_transform(df)
    assert all((c * df["b"]).sum() < 1e-5 for c in X_fair.T)
    assert all((c * df["lstat"]).sum() < 1e-5 for c in X_fair.T)
def test_alpha_param1():
    X, y = load_boston(return_X_y=True)
    ifilter = InformationFilter(columns=[11, 12], alpha=0.0)
    X_removed = np.delete(X, [11, 12], axis=1)
    assert np.isclose(ifilter.fit_transform(X), X_removed).all()
def test_output_orthogonal():
    X, y = load_boston(return_X_y=True)
    X_fair = InformationFilter(columns=[11, 12]).fit_transform(X)
    assert all((c * X[:, 11]).sum() < 1e-5 for c in X_fair.T)
    assert all((c * X[:, 12]).sum() < 1e-5 for c in X_fair.T)
def test_v_columns_orthogonal():
    X, y = load_boston(return_X_y=True)
    ifilter = InformationFilter(columns=[11, 12]).fit(X, y)
    v_values = ifilter._make_v_vectors(X, [11, 12])
    assert v_values.prod(axis=1).sum() == pytest.approx(0, abs=1e-5)
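# The orthogonality tests above verify the filter's core behavior: every
# output column is (numerically) orthogonal to the filtered columns. Below is
# a minimal numpy sketch of that idea for a single sensitive column;
# project_out() is a hypothetical helper for illustration, not the library's
# implementation. InformationFilter additionally handles multiple columns and
# an alpha parameter that, per test_alpha_param1/2, reduces to plain column
# removal at alpha=0.0.

def project_out(X, sensitive_idx):
    """Remove the component of every feature that lies along the sensitive column."""
    v = X[:, sensitive_idx]
    v = v / np.linalg.norm(v)      # unit vector along the sensitive column
    return X - np.outer(v, v @ X)  # subtract each column's projection onto v


def test_project_out_sketch():
    rng = np.random.default_rng(0)
    X = rng.normal(size=(100, 5))
    X_fair = project_out(X, sensitive_idx=4)
    # Every output column is orthogonal to the original sensitive column.
    assert np.abs(X_fair.T @ X[:, 4]).max() < 1e-10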
def test_estimator_checks(test_fn):
    test_fn(InformationFilter.__name__, InformationFilter(columns=[0]))
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler

from sklego.preprocessing import InformationFilter


def fairness_cv(estimator, X, y, name="model_x", fairness=True):
    '''
    Runs cross validation on a model given a feature set and target variable.
    Outputs a DataFrame with classification metrics split by race.

    Parameters
    ----------
    estimator : classification model object
        An instance of any classification model.
    X : array or DataFrame
        Feature set. The race column must be the final column.
    y : array or DataFrame
        Target variable.
    name : string, optional
        Name of the model used in the DataFrame. The default is "model_x".
    fairness : bool, optional
        If True, apply the fairness transformation. The default is True.

    Returns
    -------
    output_df : DataFrame
        DataFrame containing classification metrics split by total, Black,
        and White.
    '''
    # Prepare data for cross validation
    X, y = np.array(X), np.array(y)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=34)

    # Initiate lists to store results
    auc_all, acc_all, prec_all, rec_all, fb_all, pr_all = [], [], [], [], [], []
    auc_white, acc_white, prec_white, rec_white, fb_white, pr_white = [], [], [], [], [], []
    auc_black, acc_black, prec_black, rec_black, fb_black, pr_black = [], [], [], [], [], []
    p_percent = []

    for train_ind, val_ind in kf.split(X, y):
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind]

        # Standard scale
        scaler = StandardScaler()
        X_std_scale = scaler.fit_transform(X_train)
        X_val_std_scale = scaler.transform(X_val)

        if fairness:
            # Transform vectors to be orthogonal to race_black = 1;
            # the filter acts on the last column (race_black)
            fair_filter = InformationFilter(len(X_std_scale.T) - 1)
            X_scaled = fair_filter.fit_transform(X_std_scale)
            X_test_scaled = fair_filter.transform(X_val_std_scale)
        else:
            X_scaled = X_std_scale
            X_test_scaled = X_val_std_scale

        estimator.fit(X_scaled, y_train)
        soft_pred = estimator.predict_proba(X_test_scaled)[:, 1]
        hard_pred = estimator.predict(X_test_scaled)
        pr = sum(hard_pred) / len(hard_pred)

        mask_white = X_val[:, -1] == 0
        y_val_w = y_val[mask_white]
        soft_pred_w = soft_pred[mask_white]
        hard_pred_w = hard_pred[mask_white]
        pr_w = sum(hard_pred_w) / len(hard_pred_w)

        y_val_b = y_val[~mask_white]
        soft_pred_b = soft_pred[~mask_white]
        hard_pred_b = hard_pred[~mask_white]
        pr_b = sum(hard_pred_b) / len(hard_pred_b)

        # AUC
        auc_all.append(metrics.roc_auc_score(y_val, soft_pred))
        auc_white.append(metrics.roc_auc_score(y_val_w, soft_pred_w))
        auc_black.append(metrics.roc_auc_score(y_val_b, soft_pred_b))

        # Accuracy
        acc_all.append(metrics.accuracy_score(y_val, hard_pred))
        acc_white.append(metrics.accuracy_score(y_val_w, hard_pred_w))
        acc_black.append(metrics.accuracy_score(y_val_b, hard_pred_b))

        # Precision
        prec_all.append(metrics.precision_score(y_val, hard_pred))
        prec_white.append(metrics.precision_score(y_val_w, hard_pred_w))
        prec_black.append(metrics.precision_score(y_val_b, hard_pred_b))

        # Recall
        rec_all.append(metrics.recall_score(y_val, hard_pred))
        rec_white.append(metrics.recall_score(y_val_w, hard_pred_w))
        rec_black.append(metrics.recall_score(y_val_b, hard_pred_b))

        # Fbeta
        fb_all.append(metrics.fbeta_score(y_val, hard_pred, beta=1 / 3))
        fb_white.append(metrics.fbeta_score(y_val_w, hard_pred_w, beta=1 / 3))
        fb_black.append(metrics.fbeta_score(y_val_b, hard_pred_b, beta=1 / 3))

        # Positive guess rate
        pr_all.append(pr)
        pr_white.append(pr_w)
        pr_black.append(pr_b)

        # Positive guess rate ratio
        p_percent_ratio = min(pr_b / pr_w, pr_w / pr_b)
        p_percent.append(p_percent_ratio)

    # Name model to use in DataFrame output
    names = [name] * 5
    out_list = [
        names, auc_all, acc_all, prec_all, rec_all, fb_all, pr_all,
        auc_white, acc_white, prec_white, rec_white, fb_white, pr_white,
        auc_black, acc_black, prec_black, rec_black, fb_black, pr_black,
        p_percent,
    ]
    column_list = [
        "names", "auc_all", "acc_all", "prec_all", "rec_all", "fb_all",
        "pr_all", "auc_white", "acc_white", "prec_white", "rec_white",
        "fb_white", "pr_white", "auc_black", "acc_black", "prec_black",
        "rec_black", "fb_black", "pr_black", "p_percent",
    ]
    output_df = pd.DataFrame(np.array(out_list).T, columns=column_list)
    column_num = [n for n in output_df.columns if n not in ["names"]]
    for column in column_num:
        output_df[column] = pd.to_numeric(output_df[column])
    return output_df
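# Usage sketch for fairness_cv (assumption: synthetic data stands in for the
# project's real feature set, with the last column playing the role of the
# race_black indicator; names like X_demo/y_demo are placeholders). Each row
# of the returned frame is one CV fold, so the fold mean summarizes a run.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=500, n_features=6, random_state=34)
X_demo[:, -1] = (X_demo[:, -1] > 0).astype(float)  # binary stand-in for race_black
results = fairness_cv(LogisticRegression(max_iter=1000), X_demo, y_demo,
                      name="logreg_demo", fairness=True)
print(results.drop(columns="names").mean())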
def fairness_train_test(estimator, X, y, fairness=True, name="model"):
    '''
    Runs a train/test split on a model given a feature set and target
    variable. Outputs a DataFrame with classification metrics split by race.

    Parameters
    ----------
    estimator : classification model object
        An instance of any classification model.
    X : DataFrame
        Feature set. Must contain a "race_black" column as the final column.
    y : array or DataFrame
        Target variable.
    fairness : bool, optional
        If True, apply the fairness transformation. The default is True.
    name : string, optional
        Name of the model used in the DataFrame. The default is "model".

    Returns
    -------
    output_df : DataFrame
        DataFrame containing classification metrics split by total, Black,
        and White.
    '''
    # Prepare data for final testing. The random state matches the initial
    # split used for cross validation testing, for replicability.
    X, X_test, y, y_test = train_test_split(
        X, y, test_size=0.2, random_state=34, stratify=y
    )

    # Standard scale
    scaler = StandardScaler()
    X_std_scale = scaler.fit_transform(X)
    X_test_std_scale = scaler.transform(X_test)

    if fairness:
        # Transform vectors to be orthogonal to race_black = 1;
        # the filter acts on the last column (race_black)
        fair_filter = InformationFilter(len(X_std_scale.T) - 1)
        X_scaled = fair_filter.fit_transform(X_std_scale)
        X_test_scaled = fair_filter.transform(X_test_std_scale)
    else:
        X_scaled = X_std_scale
        X_test_scaled = X_test_std_scale

    estimator.fit(X_scaled, y)
    soft_pred = estimator.predict_proba(X_test_scaled)[:, 1]
    hard_pred = estimator.predict(X_test_scaled)
    pr_all = sum(hard_pred) / len(hard_pred)

    mask_white = X_test["race_black"] == 0
    y_test_w = y_test[mask_white]
    soft_pred_w = soft_pred[mask_white]
    hard_pred_w = hard_pred[mask_white]
    pr_white = sum(hard_pred_w) / len(hard_pred_w)

    y_test_b = y_test[~mask_white]
    soft_pred_b = soft_pred[~mask_white]
    hard_pred_b = hard_pred[~mask_white]
    pr_black = sum(hard_pred_b) / len(hard_pred_b)

    # AUC
    auc_all = metrics.roc_auc_score(y_test, soft_pred)
    auc_white = metrics.roc_auc_score(y_test_w, soft_pred_w)
    auc_black = metrics.roc_auc_score(y_test_b, soft_pred_b)

    # Accuracy
    acc_all = metrics.accuracy_score(y_test, hard_pred)
    acc_white = metrics.accuracy_score(y_test_w, hard_pred_w)
    acc_black = metrics.accuracy_score(y_test_b, hard_pred_b)

    # Precision
    prec_all = metrics.precision_score(y_test, hard_pred)
    prec_white = metrics.precision_score(y_test_w, hard_pred_w)
    prec_black = metrics.precision_score(y_test_b, hard_pred_b)

    # Recall
    rec_all = metrics.recall_score(y_test, hard_pred)
    rec_white = metrics.recall_score(y_test_w, hard_pred_w)
    rec_black = metrics.recall_score(y_test_b, hard_pred_b)

    # Fbeta
    fb_all = metrics.fbeta_score(y_test, hard_pred, beta=1 / 3)
    fb_white = metrics.fbeta_score(y_test_w, hard_pred_w, beta=1 / 3)
    fb_black = metrics.fbeta_score(y_test_b, hard_pred_b, beta=1 / 3)

    # Positive guess rate ratio
    p_percent = min(pr_black / pr_white, pr_white / pr_black)

    out_list = [[
        name, auc_all, acc_all, prec_all, rec_all, fb_all, pr_all,
        auc_white, acc_white, prec_white, rec_white, fb_white, pr_white,
        auc_black, acc_black, prec_black, rec_black, fb_black, pr_black,
        p_percent,
    ]]
    column_list = [
        "name", "auc_all", "acc_all", "prec_all", "rec_all", "fb_all",
        "pr_all", "auc_white", "acc_white", "prec_white", "rec_white",
        "fb_white", "pr_white", "auc_black", "acc_black", "prec_black",
        "rec_black", "fb_black", "pr_black", "p_percent",
    ]
    output_df = pd.DataFrame(out_list, columns=column_list)
    column_num = [n for n in output_df.columns if n not in ["name"]]
    for column in column_num:
        output_df[column] = pd.to_numeric(output_df[column])
    return output_df
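# Usage sketch for fairness_train_test (assumption: a synthetic DataFrame
# whose final column is named "race_black" stands in for the real data; all
# names below are placeholders). Running the holdout once with and once
# without the filter puts the fairness/performance trade-off side by side.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_arr, y_demo = make_classification(n_samples=500, n_features=6, random_state=34)
X_df = pd.DataFrame(X_arr, columns=[f"f{i}" for i in range(5)] + ["race_black"])
X_df["race_black"] = (X_df["race_black"] > 0).astype(int)  # binary stand-in
fair = fairness_train_test(LogisticRegression(max_iter=1000), X_df, y_demo,
                           fairness=True, name="logreg_fair")
base = fairness_train_test(LogisticRegression(max_iter=1000), X_df, y_demo,
                           fairness=False, name="logreg_base")
print(pd.concat([fair, base])[["name", "fb_all", "pr_white", "pr_black", "p_percent"]])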
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklego.preprocessing import InformationFilter

# Read in X_train and y_train
X = pd.read_pickle("pickles/X_train.p")
y = pd.read_pickle("pickles/y_train.p")

# Define fbeta scorer; give 1/3 weight to recall vs precision
fb_scorer = make_scorer(fbeta_score, beta=1 / 3)

# Find best parameters for Logistic Regression
params = {
    "model__C": [0.0001, 0.001, 0.01, 0.1, 1],
    "model__penalty": ["l1", "l2"],
    "model__solver": ["lbfgs", "liblinear"],
}
pipeline = Pipeline([
    ("std_scale", StandardScaler()),
    ("info_scale", InformationFilter(4)),
    ("model", LogisticRegression()),
])
grid = GridSearchCV(pipeline, cv=5, n_jobs=-1, param_grid=params,
                    scoring=fb_scorer)
grid.fit(X, y)
grid.best_score_   # inspect in a notebook/REPL
grid.best_params_

# Find best parameters for KNN
params = {
    "model__n_neighbors": np.linspace(10, 500, 50, dtype=int),
    "model__weights": ["uniform", "distance"],
}