Example #1
import numpy as np
import torch
from torch import nn, optim


# cross_entropy and evaluate_performance_sim are helpers defined elsewhere in the project.
def train_cls(X, y, P, train_rate=0.7, c=0.0):
    """
    Train a classifier.
    The performance of the classifier is evaluated and printed.

    Parameters:
        X: input features.
        y: label.
        P: the protected attribute.
        train_rate: the ratio of the training data.
        c: L2 regularization strength (passed to the optimizer as weight_decay).
    """
    lin_model = nn.Sequential(nn.Linear(len(X[0]), 1), nn.Sigmoid())
    lin_model.cuda()
    optimizer = optim.SGD(lin_model.parameters(), lr=0.01, weight_decay=c)
    train_len = int(train_rate * len(X))

    X = torch.tensor(X).float().cuda()
    y = torch.tensor(y).float().cuda()
    X_train = X[:train_len]
    y_train = y[:train_len]
    X_test = X[train_len:]
    y_test = y[train_len:]

    for i in range(1000):
        optimizer.zero_grad()
        y_score = lin_model(X_train)
        loss = cross_entropy(y_train, y_score)
        # Optional weight clipping (disabled). Note this model has no `critic` submodule;
        # if enabled, the clamp would apply to lin_model.parameters():
        # if c > 0:
        #     for p in lin_model.parameters():
        #         p.data.clamp_(-c, c)

        loss.backward()
        optimizer.step()

    y_train_score = lin_model(X_train).cpu().data.numpy()
    y_test_score = lin_model(X_test).cpu().data.numpy()

    P = np.array(P)
    P_train = P[:train_len]
    P_test = P[train_len:]

    def get_score_ratio(scores, P_):
        scores = scores.flatten()
        scores_pos = scores[P_ == 1].mean()
        scores_neg = scores[P_ == 0].mean()
        print(scores_pos, scores_neg)
        return max(scores_pos, scores_neg) / min(scores_pos, scores_neg)

    print('train fair ratio: ' + str(get_score_ratio(y_train_score, P_train)))
    print('test fair ratio: ' + str(get_score_ratio(y_test_score, P_test)))
    print('train performance: ')
    print(evaluate_performance_sim(y_train.cpu().data.numpy(), y_train_score))
    print('test performance: ')
    print(evaluate_performance_sim(y_test.cpu().data.numpy(), y_test_score))
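
A minimal usage sketch for train_cls, assuming a CUDA-capable GPU (the function moves all tensors to CUDA) and that the project helpers cross_entropy and evaluate_performance_sim are available; the synthetic data and names below are purely illustrative.

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(1000, 5)).astype(np.float32)
P_demo = rng.integers(0, 2, size=1000)
# A label that loosely follows the first feature, so the classifier has signal to fit.
y_demo = (X_demo[:, 0] > 0).astype(np.float32)
train_cls(X_demo.tolist(), y_demo.tolist(), P_demo.tolist(), train_rate=0.7, c=0.0)
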
from sklearn.linear_model import LogisticRegression


def get_model_preds(X_train, y_train, P_train, X_test, y_test, P_test, model_name):
    # Assumes module-level C (inverse regularization strength), a y_hats dict, and the
    # sigmoid and evaluate_performance_sim project helpers.
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid((X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    y_hats[model_name] = y_test_scores

    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    return lin_model, y_test_scores, performance
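
A hedged usage sketch for the logistic-regression get_model_preds: C and y_hats are module-level names the function reads, and sigmoid and evaluate_performance_sim are project helpers assumed to be importable. The splits and the 'lr_baseline' name below are synthetic and illustrative only.

C = 0.1      # inverse regularization strength read inside get_model_preds
y_hats = {}  # collects per-model test scores

rng = np.random.default_rng(0)
X_all = rng.normal(size=(400, 6))
y_all = (X_all[:, 0] > 0).astype(int)
P_all = rng.integers(0, 2, size=400)
X_train, X_test = X_all[:280], X_all[280:]
y_train, y_test = y_all[:280], y_all[280:]
P_train, P_test = P_all[:280], P_all[280:]

model, scores, perf = get_model_preds(X_train, y_train, P_train,
                                      X_test, y_test, P_test, 'lr_baseline')
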
from sklearn.ensemble import RandomForestClassifier


def get_model_preds(X_train, y_train, P_train, X_test, y_test, P_test,
                    model_name):
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)

    # A random forest exposes no coef_/intercept_, so take scores from predict_proba instead.
    y_test_scores = rf_model.predict_proba(X_test)[:, 1]
    y_hats[model_name] = y_test_scores

    print('random forest evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    return rf_model, y_test_scores, performance
def calc_perf(y_test, y_test_scores, P_test, U, U_0, U_1, U_np, lin_model, X_test, model_name):
    # Assumes module-level emd_method, k_nbrs, and X_ori_np, plus the project evaluation helpers.
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(get_consistency(U_np, lin_model, n_neighbors=k_nbrs, based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    print('calculating equal odds...')
    performance.append(equal_odds(X_test, y_test, P_test, lin_model))
    make_cal_plot(X_test, y_test, P_test, lin_model, model_name)
    return performance
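
stat_diff, equal_odds, get_consistency, and make_cal_plot above are project helpers. For orientation, the self-contained sketch below computes one common form of the statistical parity difference (the absolute gap in mean score between groups); it is an illustration, not the project's implementation.

import numpy as np

def stat_parity_diff_demo(scores, P):
    """Absolute gap in mean score between the P == 1 and P == 0 groups."""
    scores = np.asarray(scores, dtype=float)
    P = np.asarray(P)
    return abs(scores[P == 1].mean() - scores[P == 0].mean())

rng = np.random.default_rng(0)
print(stat_parity_diff_demo(rng.random(200), rng.integers(0, 2, 200)))
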
Example #5
def test_in_one(n_dim,
                batch_size,
                n_iter,
                C,
                alpha,
                compute_emd=True,
                k_nbrs=3,
                emd_method=emd_samples):
    # Relies on module-level X, P, y (features, protected attribute, labels) and on
    # FairRep, train_rep, split_data_np, sigmoid, sklearn's LogisticRegression, and the
    # evaluation helpers being imported/defined elsewhere in the project.
    global X, P, y
    # AE.
    model_ae = FairRep(len(X[0]), n_dim)
    #model_ae.cuda()
    X = torch.tensor(X).float()
    P = torch.tensor(P).long()
    train_rep(model_ae,
              0.01,
              X,
              P,
              n_iter,
              10,
              batch_size,
              alpha=0,
              C_reg=0,
              compute_emd=compute_emd,
              adv=False,
              verbose=True)
    # AE_P.
    model_ae_P = FairRep(len(X[0]) - 1, n_dim - 1)
    #model_ae_P.cuda()
    # X and P are already tensors from the block above; no re-conversion is needed.
    train_rep(model_ae_P,
              0.01,
              X[:, :-1],
              P,
              n_iter,
              10,
              batch_size,
              alpha=0,
              C_reg=0,
              compute_emd=compute_emd,
              adv=False,
              verbose=True)
    # NFR.
    model_nfr = FairRep(len(X[0]), n_dim)
    #model_nfr.cuda()
    # X and P are already tensors; no re-conversion is needed.
    train_rep(model_nfr,
              0.01,
              X,
              P,
              n_iter,
              10,
              batch_size,
              alpha=alpha,
              C_reg=0,
              compute_emd=compute_emd,
              adv=True,
              verbose=True)
    results = {}

    print('begin testing.')
    X_ori_np = X.data.cpu().numpy()
    # Original.
    data_train, data_test = split_data_np(
        (X.data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on the original...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    # X_n and X_u are assumed to be module-level splits of X by the protected attribute.
    performance.append(emd_method(X_n, X_u))
    print('calculating consistency...')
    performance.append(
        get_consistency(X.data.cpu().numpy(), lin_model, n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X.data.cpu().numpy(), P, lin_model))
    results['Original'] = performance
    # Original-P.
    data_train, data_test = split_data_np(
        (X[:, :-1].data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on the original-P...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(X_n[:, :-1], X_u[:, :-1]))
    print('calculating consistency...')
    performance.append(
        get_consistency(X[:, :-1].data.cpu().numpy(),
                        lin_model,
                        n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X[:, :-1].data.cpu().numpy(), P, lin_model))
    results['Original-P'] = performance
    U_0 = model_ae.encoder(X[P == 0]).data
    U_1 = model_ae.encoder(X[P == 1]).data
    U = model_ae.encoder(X).data
    print('ae emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test

    print('logistic regression on AE...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                        based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['AE'] = performance

    U_0 = model_ae_P.encoder(X[:, :-1][P == 0]).data
    U_1 = model_ae_P.encoder(X[:, :-1][P == 1]).data
    U = model_ae_P.encoder(X[:, :-1]).data
    print('ae-p emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test

    print('logistic regression on AE-P...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                        based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['AE_P'] = performance

    U_0 = model_nfr.encoder(X[P == 0]).data
    U_1 = model_nfr.encoder(X[P == 1]).data
    U = model_nfr.encoder(X).data
    print('nfr emd afterwards: ' + str(emd_method(U_0, U_1)))

    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on NFR...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                        based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['NFR'] = performance

    return results
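
A hedged usage sketch for test_in_one: it reads the module-level X, P, and y arrays (features, protected attribute, labels) and relies on FairRep, train_rep, split_data_np, and the evaluation helpers from the project; the hyperparameter values below are illustrative only.

results = test_in_one(n_dim=10, batch_size=1000, n_iter=2000, C=0.1, alpha=10,
                      compute_emd=True, k_nbrs=3)
for name, perf in results.items():
    print(name, perf)
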
def main():
    # Standard deps (numpy as np, torch, torch.nn as nn, torch.optim as optim,
    # sklearn's linear_model) and the project helpers (FairRepMulti, normalize,
    # cal_emd_resamp, evaluate_performance_sim) are assumed to be imported at module level.
    with open('data/adult.data.processed', 'r') as f:
        data = np.array([[float(x) for x in y.split()] for y in f.readlines()])

    P_col = 7
    P = data[:, P_col]
    y = data[:, -1]
    X = data[:, :-1]

    X = normalize(X, 20)

    print('number of unique classes in the protected attribute is {0}'.format(len(set(P))))

    model = FairRepMulti(len(X[0]), 10, len(set(P)))
    model.encoder = nn.Sequential(nn.Linear(len(X[0]),10),
                                  nn.ReLU(),
                                  nn.Linear(10,10))

    # The decoder maps the 10-d representation back to the input feature dimension.
    model.decoder = nn.Sequential(nn.Linear(10, 10),
                                  nn.ReLU(),
                                  nn.Linear(10, len(X[0])))

    lr = 0.01
    optim_encoder = optim.Adam(model.encoder.parameters(), lr=lr)
    optim_decoder = optim.Adam(model.decoder.parameters(), lr=lr)
    optim_critic = []
    for i, t in enumerate(model.critic):
        optim_critic.append(optim.Adam(model.critic[i].parameters(), lr=lr))

    num_epoch = 200
    batch_size = 1000

    X_groups = []
    P_uni = sorted(list(set(P)))
    for i, p in enumerate(P_uni):
        X_groups.append(X[P==p])

    X_groups_lens = list(map(len, X_groups))
    min_len_required = 5*batch_size
    for i, x in enumerate(X_groups_lens):
        if x < min_len_required:
            X_groups[i] = X_groups[i][
                np.random.choice(len(X_groups[i]), min_len_required)
            ]

    print('length of each group:')
    print(list(map(len, X_groups)))


    X_groups_lens = list(map(len, X_groups))
    X_size = sum(X_groups_lens)
    num_iter = int(X_size / batch_size) * num_epoch

    use_cuda = True
    if use_cuda:
        model.cuda()

    cur_batch_stop = np.zeros(len(P_uni)).astype(int)
    alpha = 1000

    print_interval = 200
    print('number of total iterations: ' + str(num_iter))
    wdists_catch = np.zeros(len(P_uni))

    for i_iter in range(num_iter):
        optim_decoder.zero_grad()
        optim_encoder.zero_grad()
        for op in optim_critic:
            op.zero_grad()

        i = int(i_iter/10) % len(P_uni)
        x_g = X_groups[i]

        right_stop = min(len(x_g), cur_batch_stop[i] + batch_size)

        x_batch = x_g[cur_batch_stop[i]: right_stop]
        cur_batch_stop[i] = right_stop % len(x_g)

        x_rest_idx = np.random.choice(
            np.arange(len(X))[P != P_uni[i]],
            len(x_batch))
        x_rest = X[x_rest_idx]

        x_batch = torch.tensor(x_batch).float()
        x_rest = torch.tensor(x_rest).float()
        if use_cuda:
            x_batch = x_batch.cuda()
            x_rest = x_rest.cuda()

        for _ in range(10):
            optim_critic[i].zero_grad()
            wdist_neg = -model.wdist(x_batch, x_rest, i)
            wdist_neg.backward(retain_graph=True)
            optim_critic[i].step()

            for pa in model.critic[i].parameters():
                pa.data.clamp_(-0.01, 0.01)

        mse, wdists = model.forward(x_batch, x_rest, i)
        wdists_catch[i] = wdists
        loss = mse + alpha * wdists
        loss.backward(retain_graph=True)
        optim_encoder.step()
        optim_decoder.step()

        if i_iter % print_interval == print_interval-1:
            print('[{0}/{1}] mse: {2} wdists: [{3}]'.format(
                i_iter, num_iter, mse.item(),
                ' '.join([str(w.item()) for w in wdists_catch])
            ))

    X_torch = torch.tensor(X).float()
    if use_cuda:
        X_torch = X_torch.cuda()
    U = model.encoder(X_torch)
    U = U.data.cpu().numpy()
    del X_torch

    print("let's see origin one-vs-all emds.")
    for p in P_uni:
        x_p = X[P==p]
        x_rest = X[P!=p]
        print(cal_emd_resamp(x_p, x_rest, 100, 10))

    print("let's see afterward one-vs-all emds.")
    for p in P_uni:
        x_p = U[P==p]
        x_rest = U[P!=p]
        print(cal_emd_resamp(x_p, x_rest, 100, 10))

    print("let's see now the classification performance and statistical pairty.")
    print("all is performed on the whole training set.")

    lin_cls_ori = linear_model.LogisticRegression(C=0.1)
    lin_cls_adv = linear_model.LogisticRegression(C=0.1)

    train_test_split = int(0.7 * len(X))
    X_train = X[:train_test_split]
    U_train = U[:train_test_split]
    y_train = y[:train_test_split]
    P_train = P[:train_test_split]

    X_test = X[train_test_split:]
    U_test = U[train_test_split:]
    y_test = y[train_test_split:]
    P_test = P[train_test_split:]

    lin_cls_ori.fit(X_train, y_train)
    lin_cls_adv.fit(U_train, y_train)

    y_pred_ori = lin_cls_ori.predict_proba(X_test)[:,1]
    y_pred_adv = lin_cls_adv.predict_proba(U_test)[:,1]

    print("original performance (ks, recall, precision, f1):")
    print(evaluate_performance_sim(y_test, y_pred_ori))
    print("fair rep performance (ks, recall, precision, f1):")
    print(evaluate_performance_sim(y_test, y_pred_adv))

    print("P's: " + str(P_uni))
    avg_score_ori = []
    avg_score_adv = []
    for p in P_uni:
        avg_score_ori.append(1.0*y_pred_ori[P_test==p].sum()/(P_test==p).sum())
        avg_score_adv.append(1.0*y_pred_adv[P_test==p].sum()/(P_test==p).sum())
    print("original avg scores:")
    print(avg_score_ori)
    print("fair rep avg scores:")
    print(avg_score_adv)

    print("original parity: " + str(max(avg_score_ori)/min(avg_score_ori)))
    print("fair rep parity: " + str(max(avg_score_adv)/min(avg_score_adv)))
Example #7
# Assumes numpy as np, the evaluate_performance_sim helper, and the per-group test arrays
# (ytest_sensitive / ytest_nonsensitive, their predictions, and `target`) computed earlier.
ytest_sensitive_pred = ytest_sensitive_pred.flatten()
ytest_sensitive_pred = list(ytest_sensitive_pred)

ytest_nonsensitive_pred = ytest_nonsensitive_pred.flatten()
ytest_nonsensitive_pred = list(ytest_nonsensitive_pred)

pred = ytest_sensitive_pred + ytest_nonsensitive_pred
# len(pred)

P_sensitive = list(np.ones(len(ytest_sensitive)))
P_nonsensitive = list(np.zeros(len(ytest_nonsensitive)))
P = P_sensitive + P_nonsensitive
# len(P)

KS, recall, precision, f1, parity_dev, parity_sub, event_rate = evaluate_performance_sim(
    np.array(target), np.array(pred), np.array(P), more_eva=1)

result = [KS, recall, precision, f1, parity_dev, parity_sub]
print('threshold: ' + str(np.round(event_rate, 4)))
import numpy as np
from pyemd import emd_samples


def cal_emd_resamp(A, B, n_samp, times):
    """Estimate the EMD between samples A and B, averaged over random sub-samples."""
    emds = []
    for t in range(times):
        idx_a = np.random.choice(len(A), n_samp)
        idx_b = np.random.choice(len(B), n_samp)
        emds.append(emd_samples(A[idx_a], B[idx_b]))
    return np.mean(emds)
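
A minimal usage sketch of cal_emd_resamp on two synthetic 1-D samples; it needs only numpy and pyemd, and the sample sizes below are illustrative.

rng = np.random.default_rng(0)
A = rng.normal(0.0, 1.0, size=2000)
B = rng.normal(0.5, 1.0, size=2000)
# Averaged EMD over 10 random sub-samples of 200 points each.
print(cal_emd_resamp(A, B, n_samp=200, times=10))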