def train_test_classifier():
    """Train the disparate-mistreatment classifier and report test stats.

    Takes no arguments: the training/test data (``x_train``, ``y_train``,
    ``x_control_train``, ``x_test``, ``y_test``, ``x_control_test``) and the
    settings (``loss_function``, ``EPS``, ``cons_params``,
    ``sensitive_attrs``) are read from the surrounding scope.

    Returns:
        A tuple ``(w, test_score, s_attr_to_fp_fn_test)`` where ``w`` is the
        value returned by ``fdm.train_model_disp_mist`` and the other two
        entries are the test-side results of ``fdm.get_clf_stats``.
    """
    weights = fdm.train_model_disp_mist(
        x_train, y_train, x_control_train, loss_function, EPS, cons_params
    )

    clf_stats = fdm.get_clf_stats(
        weights,
        x_train, y_train, x_control_train,
        x_test, y_test, x_control_test,
        sensitive_attrs,
    )
    # get_clf_stats yields six values, in this order:
    # (train_score, test_score, cov_all_train, cov_all_test,
    #  s_attr_to_fp_fn_train, s_attr_to_fp_fn_test)
    _, test_accuracy, _, _, _, fp_fn_test = clf_stats

    # Accuracy and FPR/FNR are reported for the test set because those are
    # the quantities needed for plotting.
    return weights, test_accuracy, fp_fn_test
def train_test_classifier():
    """Train the disparate-mistreatment classifier and report its stats.

    Takes no arguments: the training/test data (``x_train``, ``y_train``,
    ``x_control_train``, ``x_test``, ``y_test``, ``x_control_test``) and the
    settings (``loss_function``, ``EPS``, ``cons_params``,
    ``sensitive_attrs``) are read from the surrounding scope.

    Returns:
        A tuple ``(w, test_score, s_attr_to_fp_fn_test, cov_all_train)``
        where ``w`` is the value returned by ``fdm.train_model_disp_mist``
        and the remaining entries come from ``fdm.get_clf_stats``.
    """
    weights = fdm.train_model_disp_mist(
        x_train, y_train, x_control_train, loss_function, EPS, cons_params
    )

    clf_stats = fdm.get_clf_stats(
        weights,
        x_train, y_train, x_control_train,
        x_test, y_test, x_control_test,
        sensitive_attrs,
    )
    # get_clf_stats yields six values, in this order:
    # (train_score, test_score, cov_all_train, cov_all_test,
    #  s_attr_to_fp_fn_train, s_attr_to_fp_fn_test)
    _, test_accuracy, train_covariance, _, _, fp_fn_test = clf_stats

    # Accuracy and FPR/FNR are reported for the test set because they are
    # needed for plotting; the covariance is reported for the training set
    # because it is needed for setting the thresholds.
    return weights, test_accuracy, fp_fn_test, train_covariance
def _truncate_zafar_tables(test_results_schema, test_results_table):
    """Empty the zafar model tables and the given test-results table."""
    print("TRUNCATING........")
    truncate_queries = (
        """ truncate table triage_metadata.zafar_models; """,
        """ truncate table triage_metadata.zafar_model_groups; """,
        # Table identifiers cannot be bound as query parameters, so the
        # schema/table names are interpolated; they must come from trusted
        # configuration, never from user input.
        """ truncate table %s.%s """ % (str(test_results_schema),
                                        str(test_results_table)),
    )
    for query in truncate_queries:
        conn.execute(query)


def _select_l1_feature_columns(df_train):
    """Return the set of column names kept by an L1 logistic regression.

    Fits ``ScaledLogisticRegression(penalty="l1", C=0.1)`` on every column
    except the entity id, the as-of date and the label, and keeps the
    columns whose coefficient is non-zero. ``'intercept'`` is always kept.
    """
    excluded = {'entity_id', 'as_of_date', label_col}
    candidate_cols = [c for c in df_train.columns if c not in excluded]

    dsapp_lr = ScaledLogisticRegression(penalty="l1", C=0.1)
    dsapp_lr.fit(df_train[candidate_cols].values, df_train[label_col].values)

    keep_cols = {
        col
        for col, coef in zip(candidate_cols, dsapp_lr.coef_[0])
        if coef != 0
    }
    keep_cols.add('intercept')
    return keep_cols


def _fit_logreg_dccp(x, y, train_config):
    """Minimise the mean logistic loss with cvxpy/DCCP; return flat weights."""
    max_iters = train_config['max_iters']
    max_iters_dccp = train_config['max_iters_dccp']

    num_points, num_features = x.shape
    w = cvxpy.Variable(num_features)
    # Seed before drawing the starting point so runs are reproducible.
    np.random.seed(train_config['random_seed'])
    w.value = np.random.rand(num_features)

    # No constraints are added here: the problem is plain logistic loss.
    constraints = []
    # NOTE(review): newer cvxpy releases prefer `x @ w` for matrix
    # multiplication -- confirm against the pinned cvxpy version.
    loss = cvxpy.sum(cvxpy.logistic(cvxpy.multiply(-y, x * w))) / num_points
    prob = cvxpy.Problem(cvxpy.Minimize(loss), constraints)

    tau = float(train_config['tau'])
    mu = float(train_config['mu'])
    EPS = float(train_config['EPS'])

    prob.solve(method='dccp', tau=tau, mu=mu, tau_max=1e10,
               solver=cvxpy.ECOS, verbose=True,
               feastol=EPS, abstol=EPS, reltol=EPS,
               feastol_inacc=EPS, abstol_inacc=EPS, reltol_inacc=EPS,
               max_iters=max_iters, max_iter=max_iters_dccp)

    return np.array(w.value).flatten()


def run_for_all_train_test_matrices(file_path, train_config,
                                    train_test_matrices, entity_to_attrib,
                                    out_path, test_results_table,
                                    test_results_schema, k=500):
    """Train and evaluate a model for every train/test matrix pair.

    Truncates ``triage_metadata.zafar_models``,
    ``triage_metadata.zafar_model_groups`` and the given results table,
    records one model group (id 9999) with ``train_config`` as its
    hyperparameters, then processes each matrix pair whose
    ``<test_matrix_uuid>_Scores`` file does not yet exist in ``out_path``:
    load both matrices, select features with an L1 logistic regression, fit
    the logistic-loss problem, save predictions, compute precision at the
    top ``k`` and per-group recall, and save the output info.

    Args:
        file_path: Forwarded to ``load_matrix``.
        train_config: Mapping with keys ``'max_iters'``,
            ``'max_iters_dccp'``, ``'random_seed'``, ``'tau'``, ``'mu'`` and
            ``'EPS'``; also stored as the model group's hyperparameters.
        train_test_matrices: Sequence whose items hold the train uuid at
            ``item[0][0]`` and the test uuid at ``item[0][1]``.
        entity_to_attrib: Forwarded to ``load_matrix``.
        out_path: Directory used for the ``_Scores`` existence check and
            passed to the save helpers.
        test_results_table: Name of the results table to truncate/write.
        test_results_schema: Schema of the results table.
        k: Number of top-scored rows labelled positive for precision and
            recall (default 500, the previously hard-coded value).

    Returns:
        None.
    """
    _truncate_zafar_tables(test_results_schema, test_results_table)

    model_group_id = 9999
    df_model_group_insert = pd.DataFrame({
        'model_group_id': [model_group_id],
        'model_type': 'zafar_model_group',
        'hyperparameters': [Json(train_config)],
    })
    df_model_group_insert.to_sql('zafar_model_groups', conn,
                                 schema='triage_metadata', index=False,
                                 if_exists='append')

    for pair_idx, matrix_pair in enumerate(train_test_matrices):
        print("Doing for " + str(pair_idx))
        print("--" * 20)
        train_matrix_uuid = matrix_pair[0][0]
        test_matrix_uuid = matrix_pair[0][1]

        # Skip pairs whose scores file already exists.
        out_file = os.path.join(out_path, test_matrix_uuid + "_Scores")
        if os.path.exists(out_file):
            continue

        df_train = load_matrix(file_path, train_matrix_uuid,
                               entity_to_attrib, demo_col, label_col)
        df_train['as_of_date'] = pd.to_datetime(df_train['as_of_date'])

        df_test = load_matrix(file_path, test_matrix_uuid,
                              entity_to_attrib, demo_col, label_col)
        df_test['as_of_date'] = pd.to_datetime(df_test['as_of_date'])
        test_as_of_dates = df_test['as_of_date'].values
        test_entity_ids = df_test['entity_id'].values

        # Adding intercept
        df_train['intercept'] = 1
        df_test['intercept'] = 1

        # Keep only the features the L1 regression considers relevant.
        keep_cols = _select_l1_feature_columns(df_train)

        x_train = df_train[
            [c for c in df_train.columns if c in keep_cols]].values
        y_train = df_train[label_col].values
        x_control_train = {demo_col: df_train[demo_col].values}

        x_test = df_test[
            [c for c in df_test.columns if c in keep_cols]].values
        y_test = df_test[label_col].values
        x_control_test = {demo_col: df_test[demo_col].values}

        ret_w = _fit_logreg_dccp(x_train, y_train, train_config)

        sensitive_attrs = list(x_control_train.keys())

        print("INPUT TRAIN:" + str(pd.value_counts(y_train)))
        print("INPUT TEST:" + str(pd.value_counts(y_test)))

        (train_score, test_score, cov_all_train, cov_all_test,
         s_attr_to_fp_fn_train, s_attr_to_fp_fn_test) = fdm.get_clf_stats(
            ret_w, x_train, y_train, x_control_train,
            x_test, y_test, x_control_test, list(sensitive_attrs))

        first_attr = sensitive_attrs[0]
        distances_boundary_test = fdm.get_distance_boundary(
            ret_w, x_test, x_control_test[first_attr])

        print("SAVING PREDICTIONS")
        save_predictions(distances_boundary_test, y_test, x_control_test,
                         sensitive_attrs, test_as_of_dates, test_entity_ids,
                         train_matrix_uuid, test_matrix_uuid, train_config,
                         test_results_table, test_results_schema, out_path)

        all_class_labels_assigned_test = label_top_k(
            distances_boundary_test, k)

        prec_k = calc_prec(all_class_labels_assigned_test, y_test)
        print('prec@%s_abs: %.5f' % (k, prec_k))

        # Recompute the test FPR/FNR from the top-k labels; this replaces
        # the value returned by get_clf_stats above.
        s_attr_to_fp_fn_test = fdm.get_fpr_fnr_sensitive_features(
            y_test, all_class_labels_assigned_test, x_control_test,
            sensitive_attrs, False)

        for attr_name, stats_by_value in s_attr_to_fp_fn_test.items():
            print("S_Attr=" + str(attr_name))
            for attr_value, stats in stats_by_value.items():
                print("S_VAL=" + str(attr_value))
                stats['recall'] = 1.000 - stats['fnr']

        # NOTE(review): the attribute name 'median_income' and the group
        # values 0/1 are hard-coded; this raises KeyError if demo_col is a
        # different attribute -- confirm this is intended.
        recall_under = s_attr_to_fp_fn_test['median_income'][0]['recall']
        recall_over = s_attr_to_fp_fn_test['median_income'][1]['recall']

        print('recall under55k: %.6f' % recall_under)
        print('recall over55k: %.6f' % recall_over)

        save_output_info(out_path, test_matrix_uuid, train_score, test_score,
                         cov_all_train, cov_all_test, s_attr_to_fp_fn_train,
                         s_attr_to_fp_fn_test, recall_under, recall_over)

    return None