def train_cls(X, y, P, train_rate=0.7, c=0.0):
    """
    Train a linear classifier and print its performance.

    Parameters:
        X: input features.
        y: labels.
        P: the protected attribute.
        train_rate: the ratio of data used for training.
        c: the inverse of the regularization strength.
    """
    lin_model = nn.Sequential(nn.Linear(len(X[0]), 1), nn.Sigmoid())
    lin_model.cuda()
    optimizer = optim.SGD(lin_model.parameters(), lr=0.01, weight_decay=c)
    train_len = int(train_rate * len(X))
    X = torch.tensor(X).float().cuda()
    y = torch.tensor(y).float().cuda()
    X_train = X[:train_len]
    y_train = y[:train_len]
    X_test = X[train_len:]
    y_test = y[train_len:]
    for i in range(1000):
        optimizer.zero_grad()
        y_score = lin_model(X_train)
        loss = cross_entropy(y_train, y_score)
        loss.backward()
        optimizer.step()
    y_train_score = lin_model(X_train).cpu().data.numpy()
    y_test_score = lin_model(X_test).cpu().data.numpy()

    P = np.array(P)
    P_train = P[:train_len]
    P_test = P[train_len:]

    def get_score_ratio(scores, P_):
        # Average score of each protected group; the ratio of the larger to
        # the smaller average is a simple statistical-parity measure.
        scores_pos = sum(scores[P_ == 1]) / sum(P_ == 1)
        scores_neg = sum(scores[P_ == 0]) / sum(P_ == 0)
        print(scores_pos, scores_neg)
        return 1.0 * max(scores_pos, scores_neg) / min(scores_pos, scores_neg)

    print('train fair ratio: ' + str(get_score_ratio(y_train_score, P_train)))
    print('test fair ratio: ' + str(get_score_ratio(y_test_score, P_test)))
    print('train performance: ')
    print(evaluate_performance_sim(y_train.cpu().data.numpy(), y_train_score))
    print('test performance: ')
    print(evaluate_performance_sim(y_test.cpu().data.numpy(), y_test_score))
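# train_cls above calls cross_entropy(y_train, y_score), which is not a torch
# built-in under that argument order. A minimal sketch of what such a helper
# could look like, assuming targets come first and scores are post-sigmoid
# probabilities; this is an assumption, not the repo's actual definition, so
# skip it if the helper is already defined or imported elsewhere.
def cross_entropy(y_true, y_score, eps=1e-7):
    # Binary cross-entropy on probabilities, clamped for numerical stability.
    y_score = y_score.clamp(eps, 1 - eps).view(-1)
    y_true = y_true.view(-1)
    return -(y_true * y_score.log() + (1 - y_true) * (1 - y_score).log()).mean()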
def get_model_preds(X_train, y_train, P_train, X_test, y_test, P_test, model_name):
    # Logistic-regression variant; C and y_hats are module-level globals.
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    y_hats[model_name] = y_test_scores
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    return lin_model, y_test_scores, performance
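# sigmoid is used above but not defined in this file. A minimal numpy
# implementation, assuming scores arrive as a 1-D float array; skip this if
# the repo already defines or imports one (e.g. scipy.special.expit).
def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))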
def get_rf_model_preds(X_train, y_train, P_train, X_test, y_test, P_test, model_name):
    # Random-forest variant. Renamed from get_model_preds so it does not
    # shadow the logistic-regression version above. Random forests expose no
    # coef_/intercept_, so scores come from predict_proba instead.
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)
    y_test_scores = rf_model.predict_proba(X_test)[:, 1]
    y_hats[model_name] = y_test_scores
    print('random forest evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    return rf_model, y_test_scores, performance
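# Hypothetical side-by-side use of the two wrappers above. The data splits
# come from the caller; C and y_hats must exist at module level. This driver
# is illustrative, not part of the original pipeline.
def compare_lr_and_rf(X_train, y_train, P_train, X_test, y_test, P_test):
    _, lr_scores, lr_perf = get_model_preds(
        X_train, y_train, P_train, X_test, y_test, P_test, 'lr')
    _, rf_scores, rf_perf = get_rf_model_preds(
        X_train, y_train, P_train, X_test, y_test, P_test, 'rf')
    return lr_perf, rf_perf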
def calc_perf(y_test, y_test_scores, P_test, U, U_0, U_1, U_np,
              lin_model, X_test, model_name):
    # emd_method, k_nbrs and X_ori_np are module-level globals here.
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                                       based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    print('calculating equal odds...')
    performance.append(equal_odds(X_test, y_test, P_test, lin_model))
    make_cal_plot(X_test, y_test, P_test, lin_model, model_name)
    return performance
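# stat_diff above is defined elsewhere in the repo. A plausible sketch of a
# statistical-parity gap, assuming it compares the mean predicted score of
# the two protected groups; the signature mirrors the calls above, but the
# exact definition is an assumption, so this is named _sketch to avoid
# shadowing the real helper.
def stat_diff_sketch(X, P, model):
    scores = model.predict_proba(X)[:, 1]
    P = np.asarray(P)
    return abs(scores[P == 1].mean() - scores[P == 0].mean())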
def test_in_one(n_dim, batch_size, n_iter, C, alpha,
                compute_emd=True, k_nbrs=3, emd_method=emd_samples):
    global X, P, y
    # Convert once; the three training runs below share these tensors.
    X = torch.tensor(X).float()
    P = torch.tensor(P).long()

    # AE: plain autoencoder on the full features.
    model_ae = FairRep(len(X[0]), n_dim)
    # model_ae.cuda()
    train_rep(model_ae, 0.01, X, P, n_iter, 10, batch_size,
              alpha=0, C_reg=0, compute_emd=compute_emd, adv=False, verbose=True)

    # AE_P: autoencoder with the protected attribute (last column) removed.
    model_ae_P = FairRep(len(X[0]) - 1, n_dim - 1)
    # model_ae_P.cuda()
    train_rep(model_ae_P, 0.01, X[:, :-1], P, n_iter, 10, batch_size,
              alpha=0, C_reg=0, compute_emd=compute_emd, adv=False, verbose=True)

    # NFR: adversarially trained fair representation.
    model_nfr = FairRep(len(X[0]), n_dim)
    # model_nfr.cuda()
    train_rep(model_nfr, 0.01, X, P, n_iter, 10, batch_size,
              alpha=alpha, C_reg=0, compute_emd=compute_emd, adv=True, verbose=True)

    results = {}
    print('begin testing.')
    X_ori_np = X.data.cpu().numpy()

    # Original: logistic regression on the raw features.
    data_train, data_test = split_data_np(
        (X.data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on the original...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    # X_n and X_u are the module-level group splits of X.
    performance.append(emd_method(X_n, X_u))
    print('calculating consistency...')
    performance.append(
        get_consistency(X.data.cpu().numpy(), lin_model, n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X.data.cpu().numpy(), P, lin_model))
    results['Original'] = performance

    # Original-P: raw features with the protected attribute removed.
    data_train, data_test = split_data_np(
        (X[:, :-1].data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on the original-P')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(X_n[:, :-1], X_u[:, :-1]))
    print('calculating consistency...')
    performance.append(
        get_consistency(X[:, :-1].data.cpu().numpy(), lin_model, n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X[:, :-1].data.cpu().numpy(), P, lin_model))
    results['Original-P'] = performance

    # AE: logistic regression on the plain autoencoder representation.
    U_0 = model_ae.encoder(X[P == 0]).data
    U_1 = model_ae.encoder(X[P == 1]).data
    U = model_ae.encoder(X).data
    print('ae emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on AE...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs, based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['AE'] = performance

    # AE-P: the same evaluation on the autoencoder trained without P.
    U_0 = model_ae_P.encoder(X[:, :-1][P == 0]).data
    U_1 = model_ae_P.encoder(X[:, :-1][P == 1]).data
    U = model_ae_P.encoder(X[:, :-1]).data
    print('ae-p emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on AE-P...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs, based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['AE_P'] = performance

    # NFR: the same evaluation on the adversarially trained representation.
    U_0 = model_nfr.encoder(X[P == 0]).data
    U_1 = model_nfr.encoder(X[P == 1]).data
    U = model_nfr.encoder(X).data
    print('nfr emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on NFR...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs, based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['NFR'] = performance
    return results
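# Hypothetical driver for test_in_one. The hyperparameter values below are
# illustrative assumptions, not settings taken from the repo; X, P and y
# must already be populated at module level before the call.
def run_test_in_one_example():
    results = test_in_one(n_dim=10, batch_size=1000, n_iter=2000, C=0.1, alpha=10)
    for name, perf in results.items():
        print(name, perf)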
def main():
    with open('data/adult.data.processed', 'r') as f:
        data = np.array([[float(x) for x in line.split()] for line in f.readlines()])
    P_col = 7
    P = data[:, P_col]
    y = data[:, -1]
    X = data[:, :-1]
    X = normalize(X, 20)
    print('number of unique classes in the protected attribute is {0}'.format(len(set(P))))

    model = FairRepMulti(len(X[0]), 10, len(set(P)))
    model.encoder = nn.Sequential(nn.Linear(len(X[0]), 10), nn.ReLU(), nn.Linear(10, 10))
    model.decoder = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 13))
    lr = 0.01
    optim_encoder = optim.Adam(model.encoder.parameters(), lr=lr)
    optim_decoder = optim.Adam(model.decoder.parameters(), lr=lr)
    optim_critic = [optim.Adam(c.parameters(), lr=lr) for c in model.critic]

    num_epoch = 200
    batch_size = 1000
    X_groups = []
    P_uni = sorted(list(set(P)))
    for i, p in enumerate(P_uni):
        X_groups.append(X[P == p])
    X_groups_lens = list(map(len, X_groups))
    # Upsample any group that is too small to fill at least five batches.
    min_len_required = 5 * batch_size
    for i, x in enumerate(X_groups_lens):
        if x < min_len_required:
            X_groups[i] = X_groups[i][np.random.choice(len(X_groups[i]), min_len_required)]
    print('length of each group:')
    print(list(map(len, X_groups)))
    X_groups_lens = list(map(len, X_groups))
    X_size = sum(X_groups_lens)
    num_iter = int(X_size / batch_size) * num_epoch

    use_cuda = True
    if use_cuda:
        model.cuda()
    cur_batch_stop = np.zeros(len(P_uni)).astype(int)
    alpha = 1000
    print_interval = 200
    print('number of total iterations: ' + str(num_iter))
    wdists_catch = np.zeros(len(P_uni))
    for i_iter in range(num_iter):
        optim_decoder.zero_grad()
        optim_encoder.zero_grad()
        for op in optim_critic:
            op.zero_grad()
        # Cycle through the protected groups, switching every 10 iterations.
        i = int(i_iter / 10) % len(P_uni)
        x_g = X_groups[i]
        right_stop = min(len(x_g), cur_batch_stop[i] + batch_size)
        x_batch = x_g[cur_batch_stop[i]: right_stop]
        cur_batch_stop[i] = right_stop % len(x_g)
        # Sample an equally sized batch from all the other groups.
        x_rest_idx = np.random.choice(np.arange(len(X))[P != P_uni[i]], len(x_batch))
        x_rest = X[x_rest_idx]
        x_batch = torch.tensor(x_batch).float()
        x_rest = torch.tensor(x_rest).float()
        if use_cuda:
            x_batch = x_batch.cuda()
            x_rest = x_rest.cuda()
        # Train the critic of this group (10 steps, with weight clipping).
        for _ in range(10):
            optim_critic[i].zero_grad()
            wdist_neg = -model.wdist(x_batch, x_rest, i)
            wdist_neg.backward(retain_graph=True)
            optim_critic[i].step()
            for pa in model.critic[i].parameters():
                pa.data.clamp_(-0.01, 0.01)
        mse, wdists = model.forward(x_batch, x_rest, i)
        wdists_catch[i] = wdists.item()
        # alpha (defined above) replaces the original hard-coded 1000.
        loss = mse + alpha * wdists
        loss.backward(retain_graph=True)
        optim_encoder.step()
        optim_decoder.step()
        if i_iter % print_interval == print_interval - 1:
            print('[{0}/{1}] mse: {2} wdists: [{3}]'.format(
                i_iter, num_iter, mse.item(),
                ' '.join([str(w) for w in wdists_catch])))

    X_torch = torch.tensor(X).float()
    if use_cuda:
        X_torch = X_torch.cuda()
    U = model.encoder(X_torch)
    U = U.data.cpu().numpy()
    del X_torch

    print("let's see original one-vs-all emds.")
    for p in P_uni:
        x_p = X[P == p]
        x_rest = X[P != p]
        print(cal_emd_resamp(x_p, x_rest, 100, 10))
    print("let's see afterward one-vs-all emds.")
    for p in P_uni:
        x_p = U[P == p]
        x_rest = U[P != p]
        print(cal_emd_resamp(x_p, x_rest, 100, 10))

    print("let's see now the classification performance and statistical parity.")
    print("all is performed on the whole training set.")
    lin_cls_ori = linear_model.LogisticRegression(C=0.1)
    lin_cls_adv = linear_model.LogisticRegression(C=0.1)
    train_test_split = int(0.7 * len(X))
    X_train = X[:train_test_split]
    U_train = U[:train_test_split]
    y_train = y[:train_test_split]
    P_train = P[:train_test_split]
    X_test = X[train_test_split:]
    U_test = U[train_test_split:]
    y_test = y[train_test_split:]
    P_test = P[train_test_split:]
    lin_cls_ori.fit(X_train, y_train)
    lin_cls_adv.fit(U_train, y_train)
    y_pred_ori = lin_cls_ori.predict_proba(X_test)[:, 1]
    y_pred_adv = lin_cls_adv.predict_proba(U_test)[:, 1]
    print("original performance (ks, recall, precision, f1):")
    print(evaluate_performance_sim(y_test, y_pred_ori))
    print("fair rep performance (ks, recall, precision, f1):")
    print(evaluate_performance_sim(y_test, y_pred_adv))
    print("P's: " + str(P_uni))
    avg_score_ori = []
    avg_score_adv = []
    for p in P_uni:
        avg_score_ori.append(1.0 * y_pred_ori[P_test == p].sum() / (P_test == p).sum())
        avg_score_adv.append(1.0 * y_pred_adv[P_test == p].sum() / (P_test == p).sum())
    print("original avg scores:")
    print(avg_score_ori)
    print("fair rep avg scores:")
    print(avg_score_adv)
    print("original parity: " + str(max(avg_score_ori) / min(avg_score_ori)))
    print("fair rep parity: " + str(max(avg_score_adv) / min(avg_score_adv)))
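# Standard entry-point guard; the original fragment ends at main() without one.
if __name__ == '__main__':
    main()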
# Script-level evaluation fragment: target, ytest_sensitive(_pred) and
# ytest_nonsensitive(_pred) are expected to exist in the surrounding scope.
ytest_sensitive_pred = list(ytest_sensitive_pred.flatten())
ytest_nonsensitive_pred = list(ytest_nonsensitive_pred.flatten())
pred = ytest_sensitive_pred + ytest_nonsensitive_pred

# Build the protected-attribute vector: 1 for the sensitive group, 0 otherwise.
P_sensitive = list(np.ones(len(ytest_sensitive)))
P_nonsensitive = list(np.zeros(len(ytest_nonsensitive)))
P = P_sensitive + P_nonsensitive

KS, recall, precision, f1, parity_dev, parity_sub, event_rate = evaluate_performance_sim(
    np.array(target), np.array(pred), np.array(P), more_eva=1)
result = [KS, recall, precision, f1, parity_dev, parity_sub]
# print(result)
print('threshold: ' + str(np.round(event_rate, 4)))


from pyemd import emd_samples


def cal_emd_resamp(A, B, n_samp, times):
    # Estimate the EMD between A and B by averaging emd_samples over
    # `times` random subsamples of size n_samp drawn from each set.
    emds = []
    for t in range(times):
        idx_a = np.random.choice(len(A), n_samp)
        idx_b = np.random.choice(len(B), n_samp)
        emds.append(emd_samples(A[idx_a], B[idx_b]))
    return np.mean(emds)
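# Illustrative sanity check for cal_emd_resamp on synthetic data (not part of
# the original pipeline): two Gaussian clouds with offset means should give a
# clearly positive resampled EMD.
def _emd_resamp_demo():
    a = np.random.randn(500, 5)
    b = np.random.randn(500, 5) + 0.5
    print(cal_emd_resamp(a, b, n_samp=100, times=10))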