def decision_tree(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Train a Decision Tree classifier on x_train and predict on x_test.

    x_train, x_test: DataFrames of shape num_samples x num_features.
    y_train: training labels, used for cross-validation, fitting and
        threshold selection.
    y_test: not used by this function.
    compute_threshold: when exactly True, test labels are obtained by
        comparing the second predict_proba column against the threshold
        returned by get_best_thresh on the training data; otherwise
        model.predict is used.

    Returns (predTest, metricsCV, model).
    '''
    from sklearn.tree import DecisionTreeClassifier

    # Class imbalance is handled by class_weight='balanced'; no explicit
    # per-class weight dictionary is needed.
    model = DecisionTreeClassifier(class_weight='balanced',
                                   criterion='entropy',
                                   max_depth=15,
                                   min_samples_leaf=5)

    # Cross-validation metrics are gathered before the final fit on all of
    # the training data.
    metricsCV = cross_val_analysis(classifier=model, x=x_train, y=y_train, plot=False)
    model.fit(x_train, y_train)

    if compute_threshold is True:
        probTest = model.predict_proba(x_test)
        probTrain = model.predict_proba(x_train)

        # The threshold is chosen on training data only, then applied to test.
        bestThresh = get_best_thresh(y_train, probTrain)
        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode, defs.negCode)
    else:
        predTest = model.predict(x_test)

    return predTest, metricsCV, model
def svm(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Train an RBF-kernel SVM classifier on x_train and predict on both the
    test and the training set.

    x_train, x_test: DataFrames of shape data x features.
    y_train: training labels, used for fitting and threshold selection.
    y_test: not used by this function.
    compute_threshold: when exactly True, labels are obtained by comparing
        the second predict_proba column against the threshold returned by
        get_best_thresh; otherwise model.predict is used.

    Returns (predTest, predTrain, model).
    '''
    from sklearn.svm import SVC

    use_threshold = compute_threshold is True

    # SVC only exposes predict_proba when it was constructed with
    # probability=True, so enable it exactly when the threshold branch needs
    # it. The plain-predict branch keeps the cheaper probability=False fit.
    # TODO: Experiment with class weights
    model = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                decision_function_shape='ovr', degree=3, gamma='auto',
                kernel='rbf', max_iter=-1, probability=use_threshold,
                random_state=None, shrinking=True, tol=0.001, verbose=False)

    model.fit(x_train, y_train)

    if use_threshold:
        probTest = model.predict_proba(x_test)
        probTrain = model.predict_proba(x_train)

        # Threshold is selected on the training probabilities and applied to
        # both sets.
        bestThresh = get_best_thresh(y_train, probTrain)
        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode, defs.negCode)
        predTrain = np.where(probTrain[:, 1] >= bestThresh, defs.posCode, defs.negCode)
    else:
        predTest = model.predict(x_test)
        predTrain = model.predict(x_train)

    return predTest, predTrain, model
def gaussian_naive_bayes(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Fit a Gaussian Naive Bayes classifier on x_train and label x_test.

    x_train, x_test: DataFrames of shape (data x features).
    y_train: training labels.
    y_test: only passed to plot_roc_curve in the threshold branch.
    compute_threshold: when exactly True, threshold the second
        predict_proba column with the value from get_best_thresh and plot a
        ROC curve; otherwise use the classifier's own predict.

    Returns (predTest, metricsCV, model).
    '''
    from sklearn.naive_bayes import GaussianNB

    classifier = GaussianNB(priors=None)
    cv_metrics = cross_val_analysis(classifier=classifier, x=x_train, y=y_train, plot=False)
    classifier.fit(x_train, y_train)

    # Guard clause: plain prediction when no threshold search was requested.
    if compute_threshold is not True:
        return classifier.predict(x_test), cv_metrics, classifier

    test_proba = classifier.predict_proba(x_test)
    train_proba = classifier.predict_proba(x_train)

    # Threshold comes from the training probabilities only.
    threshold = get_best_thresh(y_train, train_proba)
    is_positive = test_proba[:, 1] >= threshold
    test_labels = np.where(is_positive, defs.posCode, defs.negCode)

    plot_roc_curve(y_test, test_proba, modelName="Naive Bayes")
    return test_labels, cv_metrics, classifier
def quadratic_discriminant_analysis(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Fit a Quadratic Discriminant Analysis (QDA) classifier on x_train and
    label x_test.

    x_train, x_test: DataFrames of shape data x features.
    y_train: training labels.
    y_test: not used by this function.
    compute_threshold: when exactly True, threshold the second
        predict_proba column with the value from get_best_thresh; otherwise
        use the classifier's own predict.

    Returns (predTest, metricsCV, model).
    '''
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

    classifier = QuadraticDiscriminantAnalysis()
    cv_metrics = cross_val_analysis(classifier=classifier, x=x_train, y=y_train, plot=False)
    classifier.fit(x_train, y_train)

    # Guard clause: plain prediction when no threshold search was requested.
    if compute_threshold is not True:
        return classifier.predict(x_test), cv_metrics, classifier

    test_proba = classifier.predict_proba(x_test)
    train_proba = classifier.predict_proba(x_train)

    # Threshold comes from the training probabilities only.
    threshold = get_best_thresh(y_train, train_proba)
    test_labels = np.where(test_proba[:, 1] >= threshold, defs.posCode, defs.negCode)
    return test_labels, cv_metrics, classifier
def nearest_neighbours(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Fit a 3-nearest-neighbours classifier on x_train and label x_test.

    x_train, x_test: DataFrames of shape data x features.
    y_train: training labels.
    y_test: not used by this function.
    compute_threshold: when exactly True, threshold the second
        predict_proba column with the value from get_best_thresh; otherwise
        use the classifier's own predict.

    Returns (predTest, metricsCV, model).
    '''
    from sklearn.neighbors import KNeighborsClassifier

    # TODO: Experiment with 'weights' parameter
    knn_options = dict(n_neighbors=3,
                       algorithm='ball_tree',
                       weights='uniform',
                       p=2,
                       metric='minkowski',
                       n_jobs=-1)
    classifier = KNeighborsClassifier(**knn_options)

    cv_metrics = cross_val_analysis(classifier=classifier, x=x_train, y=y_train, plot=False)
    classifier.fit(x_train, y_train)  # TODO: Add class weights

    # Guard clause: plain prediction when no threshold search was requested.
    if compute_threshold is not True:
        return classifier.predict(x_test), cv_metrics, classifier

    test_proba = classifier.predict_proba(x_test)
    train_proba = classifier.predict_proba(x_train)

    # Threshold comes from the training probabilities only.
    threshold = get_best_thresh(y_train, train_proba)
    test_labels = np.where(test_proba[:, 1] >= threshold, defs.posCode, defs.negCode)
    return test_labels, cv_metrics, classifier
def log_reg(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Fit an L2-regularised Logistic Regression classifier on x_train and
    label x_test.

    x_train, x_test: DataFrames of shape data x features.
    y_train: training labels.
    y_test: only passed to plot_roc_curve in the threshold branch.
    compute_threshold: when exactly True, threshold the second
        predict_proba column with the value from get_best_thresh and plot a
        ROC curve; otherwise use the classifier's own predict.

    Returns (predTest, metricsCV, model). Cross-validation is not run here,
    so metricsCV is always the placeholder 0.
    '''
    from sklearn.linear_model import LogisticRegression

    modelName = "Logistic Regression"
    classifier = LogisticRegression(penalty='l2', C=1.0, solver='liblinear',
                                    n_jobs=1, max_iter=100)

    # Placeholder kept so the return shape matches the other trainers.
    cv_metrics = 0

    classifier.fit(x_train, y_train)  # TODO: Add class weights

    # Guard clause: plain prediction when no threshold search was requested.
    if compute_threshold is not True:
        return classifier.predict(x_test), cv_metrics, classifier

    test_proba = classifier.predict_proba(x_test)
    train_proba = classifier.predict_proba(x_train)

    # Threshold comes from the training probabilities only.
    threshold = get_best_thresh(y_train, train_proba)
    test_labels = np.where(test_proba[:, 1] >= threshold, defs.posCode, defs.negCode)

    plot_roc_curve(y_test, test_proba, modelName=modelName)
    return test_labels, cv_metrics, classifier
def ada_boost(x_train, y_train, x_test, y_test, compute_threshold=False):
    '''
    Train an AdaBoost ensemble of Logistic Regression base estimators on
    x_train and predict on x_test.

    x_train, x_test: DataFrames of shape num_samples x num_features.
    y_train: training labels, used for cross-validation, fitting and
        threshold selection.
    y_test: only passed to plot_roc_curve in the threshold branch.
    compute_threshold: when exactly True, test labels are obtained by
        comparing the second predict_proba column against the threshold
        returned by get_best_thresh on the training data, and a ROC curve is
        plotted; otherwise model.predict is used.

    Returns (predTest, metricsCV, model).
    '''
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.linear_model import LogisticRegression

    estimator = LogisticRegression(penalty='l2', C=1.0, solver='liblinear',
                                   n_jobs=1, max_iter=100)

    # NOTE(review): recent scikit-learn releases renamed `base_estimator` to
    # `estimator`; confirm against the pinned scikit-learn version before
    # upgrading.
    model = AdaBoostClassifier(base_estimator=estimator, n_estimators=60,
                               learning_rate=1.0)

    metricsCV = cross_val_analysis(classifier=model, x=x_train, y=y_train, plot=False)
    model.fit(x_train, y_train)

    if compute_threshold is True:
        probTest = model.predict_proba(x_test)
        # Training probabilities are required by get_best_thresh below.
        probTrain = model.predict_proba(x_train)

        bestThresh = get_best_thresh(y_train, probTrain)
        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode, defs.negCode)

        plot_roc_curve(y_test, probTest, modelName="AdaBoost")
    else:
        predTest = model.predict(x_test)

    return predTest, metricsCV, model
def evaluate(lr, beta1, beta2, alpha, T0, verbose=False):
    '''
    Fine-tune a freshly loaded copy of the model with the given optimiser
    hyperparameters and score it.

    lr, beta1, beta2: Adam learning rate and beta coefficients.
    alpha: Adam weight decay.
    T0: restart period handed (as int) to CosineAnnealingWarmRestarts.
    verbose: when truthy, print train/validation loss every 10th epoch.

    Reads data, config, model_state_dict and device from the enclosing scope.
    Returns (get_valid_objective result, trained model, best threshold).
    '''
    net = load_model(data.num_features, config.get('hyperparameters', {}))
    net.load_state_dict(model_state_dict)
    net.to(device)

    loss_fn = torch.nn.BCELoss()
    adam = optim.Adam(net.parameters(),
                      lr=lr,
                      betas=(beta1, beta2),
                      weight_decay=alpha)
    lr_schedule = optim.lr_scheduler.CosineAnnealingWarmRestarts(adam, int(T0))

    for epoch in range(201):
        net.train()
        # Fresh random mini-batches of (at most) 64 rows every epoch.
        shuffled = torch.randperm(data.X_valid.size(0))
        batch_idxs = torch.split(shuffled, 64)
        train_loss = 0
        for batch in batch_idxs:
            X = data.X_valid_gpu[batch, :]
            y = data.y_valid_gpu[batch]

            adam.zero_grad()
            batch_loss = loss_fn(net(X)[:, 0], y)
            batch_loss.backward()
            train_loss += batch_loss.item()
            adam.step()
            lr_schedule.step(X.size(0))

        if epoch % 10 == 0 and verbose:
            net.eval()
            with torch.no_grad():
                held_out_x = data.X_valid_valid.to(device)
                held_out_y = data.y_valid_valid.to(device)
                valid_loss = loss_fn(net(held_out_x)[:, 0], held_out_y)
            print(
                f'=======> Epoch: {epoch} Train loss: {train_loss / len(batch_idxs)} '
                f'Valid loss: {valid_loss}')

    # Final scoring pass with the fine-tuned weights.
    net.eval()
    with torch.no_grad():
        scores = net(data.X_valid_gpu)[:, 0].reshape(-1).cpu().numpy()

    candidate_threshs = np.linspace(0, 1, 1001)
    best_thresh, _ = get_best_thresh(scores,
                                     candidate_threshs,
                                     data,
                                     config,
                                     valid=False,
                                     margin=config['fairBO']['margin'])
    predictions = scores > best_thresh
    objective_value = get_valid_objective(predictions, data, config, valid=False)
    return objective_value, net, best_thresh
def objective(new_param, return_thresh=False):
    '''
    Write new_param into the selected parameter entries, score the model and
    return the negated best objective.

    new_param: values assigned to param.data[indices].
    return_thresh: when truthy, also return the threshold that achieved the
        best objective.

    Reads param, indices, base_model, data, config, index and total_params
    from the enclosing scope. Returns -best_obj as a float, or the tuple
    (-best_obj, best_thresh) when return_thresh is truthy.
    '''
    param.data[indices] = torch.tensor(new_param)

    base_model.eval()
    with torch.no_grad():
        # .cpu() is required before .numpy() whenever the model output lives
        # on an accelerator; it is a no-op for tensors already on the CPU.
        scores = base_model(data.X_valid_gpu)[:, 0].reshape(-1).cpu().numpy()

    best_thresh, best_obj = get_best_thresh(
        scores,
        np.linspace(0, 1, 501),
        data,
        config,
        valid=False,
        margin=config['layerwiseOpt']['margin'])
    print(f'Evaluating param number {index} of {total_params}')

    # The objective is negated before being returned.
    if return_thresh:
        return -float(best_obj), float(best_thresh)
    return -float(best_obj)
def svm(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Fit an RBF-kernel SVM (with probability estimates enabled) on x_train
    and label x_test.

    x_train, x_test: DataFrames of shape data x features.
    y_train: training labels.
    y_test: only passed to plot_roc_curve in the threshold branch.
    compute_threshold: when exactly True, threshold the second
        predict_proba column with the value from get_best_thresh and plot a
        ROC curve; otherwise use the classifier's own predict.

    Returns (predTest, metricsCV, model).
    '''
    from sklearn.svm import SVC

    svc_options = dict(C=1.0,
                       cache_size=200,
                       class_weight=None,
                       coef0=0.0,
                       decision_function_shape='ovr',
                       degree=3,
                       gamma='auto',
                       kernel='rbf',
                       max_iter=-1,
                       probability=True,
                       random_state=None,
                       shrinking=True,
                       tol=0.001,
                       verbose=False)
    classifier = SVC(**svc_options)

    cv_metrics = cross_val_analysis(classifier=classifier, x=x_train, y=y_train, plot=False)
    classifier.fit(x_train, y_train)

    # Guard clause: plain prediction when no threshold search was requested.
    if compute_threshold is not True:
        return classifier.predict(x_test), cv_metrics, classifier

    test_proba = classifier.predict_proba(x_test)
    train_proba = classifier.predict_proba(x_train)

    # Threshold comes from the training probabilities only.
    threshold = get_best_thresh(y_train, train_proba)
    test_labels = np.where(test_proba[:, 1] >= threshold, defs.posCode, defs.negCode)

    plot_roc_curve(y_test, test_proba, modelName="SVM")
    return test_labels, cv_metrics, classifier
def linear_svm(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Train linear SVM classifier on x_train and predict on x_test.

    x_train, x_test: DataFrames of shape data x features.
    y_train: training labels, used for cross-validation, fitting and
        threshold selection.
    y_test: only passed to plot_roc_curve in the threshold branch.
    compute_threshold: when exactly True, test labels are obtained by
        thresholding a positive-class score with the value returned by
        get_best_thresh on the training scores, and a ROC curve is plotted;
        otherwise model.predict is used.

    LinearSVC provides no predict_proba, so the threshold branch builds a
    two-column score array from decision_function instead: column 1 is the
    logistic sigmoid of the signed margin and column 0 is its complement.
    These are monotone scores in (0, 1), not calibrated probabilities. The
    threshold branch assumes a binary problem (1-D decision_function output).

    Returns (predTest, metricsCV, model).
    '''
    from scipy.special import expit
    from sklearn.svm import LinearSVC

    model = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
                      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
                      multi_class='ovr', penalty='l2', random_state=0,
                      tol=0.0001, verbose=0)

    metricsCV = cross_val_analysis(classifier=model, x=x_train, y=y_train, plot=False)
    model.fit(x_train, y_train)

    def margin_scores(x):
        # Same (n_samples, 2) layout the other trainers get from
        # predict_proba, so downstream `[:, 1]` indexing keeps working.
        positive = expit(model.decision_function(x))
        return np.column_stack((1.0 - positive, positive))

    if compute_threshold is True:
        probTest = margin_scores(x_test)
        probTrain = margin_scores(x_train)

        bestThresh = get_best_thresh(y_train, probTrain)
        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode, defs.negCode)

        plot_roc_curve(y_test, probTest, modelName="Linear SVM")
    else:
        predTest = model.predict(x_test)

    return predTest, metricsCV, model
def random_forest(x_train, y_train, x_test, y_test, compute_threshold=True):
    '''
    Train an ensemble of Decision Trees on x_train and predict on x_test.

    x_train, x_test: DataFrames of shape num_samples x num_features.
    y_train: training labels, used for cross-validation, fitting and
        threshold selection.
    y_test: only passed to plot_roc_curve in the threshold branch.
    compute_threshold: when exactly True, test labels are obtained by
        comparing the second predict_proba column against the threshold
        returned by get_best_thresh on the training data, and a ROC curve is
        plotted; otherwise model.predict is used.

    Returns (predTest, metricsCV, model).
    '''
    from sklearn.ensemble import RandomForestClassifier

    # Class imbalance is handled by class_weight='balanced'; no explicit
    # per-class weight dictionary is needed.
    model = RandomForestClassifier(n_estimators=50,
                                   class_weight='balanced',
                                   criterion='entropy',
                                   max_depth=15,
                                   min_samples_leaf=5,
                                   n_jobs=-1)

    metricsCV = cross_val_analysis(classifier=model, x=x_train, y=y_train, plot=False)
    model.fit(x_train, y_train)

    if compute_threshold is True:
        probTest = model.predict_proba(x_test)
        probTrain = model.predict_proba(x_train)

        # The threshold is chosen on training data only, then applied to test.
        bestThresh = get_best_thresh(y_train, probTrain)
        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode, defs.negCode)

        plot_roc_curve(y_test, probTest, modelName="Random Forest")
    else:
        predTest = model.predict(x_test)

    return predTest, metricsCV, model
def random_debiasing(model_state_dict, data, config, device, verbose=True):
    '''
    Search for a debiased model by randomly perturbing the given weights.

    Each trial reloads model_state_dict, multiplies every parameter by
    (1 + stddev * standard-normal noise), scores data.X_valid_gpu, and asks
    get_best_thresh for the best threshold/objective. The trial with the
    highest objective is kept and finally evaluated on the validation and
    test sets.

    model_state_dict: state dict loaded into the model before every trial.
    data: dataset object providing num_features, X_valid_gpu and X_test_gpu.
    config: configuration mapping; reads config['random'] keys 'num_trials',
        'stddev' and 'margin', plus optional 'hyperparameters'.
    device: device the model is moved to.
    verbose: when truthy, log progress every 10th trial.

    Returns (results_valid, results_test).
    '''
    logger.info('Generating Random Debiased models.')
    rand_model = load_model(data.num_features, config.get('hyperparameters', {}))
    rand_model.to(device)
    rand_result = {
        'objective': -math.inf,
        'model': rand_model.state_dict(),
        'thresh': -1
    }
    # Bound up front so the progress log below cannot hit an unbound local
    # when no trial has yet beaten the initial -inf objective.
    best_test_result = None

    for iteration in range(config['random']['num_trials']):
        # Every trial starts again from the unperturbed weights.
        rand_model.load_state_dict(model_state_dict)
        for param in rand_model.parameters():
            param.data = param.data * (
                torch.randn_like(param) * config['random']['stddev'] + 1)

        rand_model.eval()
        with torch.no_grad():
            scores = rand_model(data.X_valid_gpu)[:, 0].reshape(-1).cpu().numpy()

        threshs = np.linspace(0, 1, 501)
        best_rand_thresh, best_obj = get_best_thresh(
            scores,
            threshs,
            data,
            config,
            valid=False,
            margin=config['random']['margin'])

        if best_obj > rand_result['objective']:
            # Deep copy: the live state dict is overwritten by the next trial.
            rand_result = {
                'objective': best_obj,
                'model': copy.deepcopy(rand_model.state_dict()),
                'thresh': best_rand_thresh
            }
            rand_model.eval()
            with torch.no_grad():
                y_pred = (rand_model(data.X_test_gpu)[:, 0] >
                          best_rand_thresh).reshape(-1).cpu().numpy()
            best_test_result = get_test_objective(y_pred, data, config)['objective']

        if iteration % 10 == 0 and verbose:
            logger.info(
                f'{iteration} / {config["random"]["num_trials"]} trials have been sampled.'
            )
            logger.info(f'Best result so far = {rand_result["objective"]}')
            logger.info(f'Best test result so = {best_test_result}')

    logger.info('Evaluating best random debiased model.')
    rand_model.load_state_dict(rand_result['model'])

    rand_model.eval()
    with torch.no_grad():
        y_pred = (rand_model(data.X_valid_gpu)[:, 0] >
                  rand_result['thresh']).reshape(-1).cpu().numpy()
    results_valid = get_valid_objective(y_pred, data, config)
    logger.info(f'Results: {results_valid}')

    rand_model.eval()
    with torch.no_grad():
        y_pred = (rand_model(data.X_test_gpu)[:, 0] >
                  rand_result['thresh']).reshape(-1).cpu().numpy()
    results_test = get_test_objective(y_pred, data, config)

    return results_valid, results_test
def mlp(x_train, y_train, x_test, y_test, hidden_neurons=10,
        hidden_activation='tanh', output_activation='tanh',
        lossFunction='mean_squared_error', optmizer='Adam',
        metrics=('mae', 'mape', 'acc', 'categorical_accuracy'),
        patience=30, train_verbose=2, n_epochs=500, compute_threshold=True):
    '''
    Neural Networks classifier: one hidden Dense layer followed by a single
    output unit, trained with early stopping on val_loss.

    x_train, x_test: DataFrames of shape data x features.
    y_train: training targets.
    y_test: targets used as Keras validation data and for the ROC plot.
    hidden_neurons, hidden_activation: size and activation of the hidden
        layer.
    output_activation: activation of the single output unit.
    lossFunction, optmizer, metrics: forwarded to model.compile.
    patience, train_verbose, n_epochs: early-stopping patience, Keras
        verbosity and maximum number of epochs.
    compute_threshold: when exactly True, test labels are obtained by
        thresholding the network output with the value returned by
        get_best_thresh on the training outputs, and a ROC curve is plotted;
        otherwise the raw model.predict output is returned.

    Returns (predTest, metricsCV, model).
    '''
    from keras.models import Sequential
    from keras.layers import Dense
    import keras.callbacks as callbacks

    model = Sequential()
    model.add(
        Dense(hidden_neurons,
              input_dim=x_train.shape[1],
              activation=hidden_activation))
    model.add(Dense(1, activation=output_activation))
    # The default is an immutable tuple; hand Keras its own list copy.
    model.compile(loss=lossFunction,
                  optimizer=optmizer,
                  metrics=None if metrics is None else list(metrics))

    earlyStopping = callbacks.EarlyStopping(monitor='val_loss',
                                            patience=patience,
                                            verbose=train_verbose,
                                            mode='auto')

    # NOTE(review): the test split doubles as the early-stopping validation
    # set here -- confirm this is intended.
    model.fit(x_train,
              y_train,
              epochs=n_epochs,
              callbacks=[earlyStopping],
              verbose=train_verbose,
              validation_data=(x_test, y_test))

    # NOTE(review): cross_val_analysis is handed a Keras model rather than a
    # scikit-learn estimator, and the model is then fit once more with Keras
    # defaults -- both look like leftovers from the sklearn trainers; verify.
    metricsCV = cross_val_analysis(classifier=model, x=x_train, y=y_train, plot=False)
    model.fit(x_train, y_train)

    def output_scores(x):
        # The network has a single output unit, so there is no second
        # probability column to index. Build the (n_samples, 2) layout the
        # threshold code indexes with [:, 1]: column 1 is the raw network
        # output, column 0 its complement.
        positive = np.asarray(model.predict(x), dtype=float).reshape(-1)
        return np.column_stack((1.0 - positive, positive))

    if compute_threshold is True:
        probTest = output_scores(x_test)
        probTrain = output_scores(x_train)

        bestThresh = get_best_thresh(y_train, probTrain)
        predTest = np.where(probTest[:, 1] >= bestThresh, defs.posCode, defs.negCode)

        plot_roc_curve(y_test, probTest, modelName="MLP")
    else:
        predTest = model.predict(x_test)

    return predTest, metricsCV, model
def adversarial_debiasing(model_state_dict, data, config, device):
    '''
    Fine-tune a model against a bias-predicting critic, then evaluate it.

    Each epoch alternates two phases on random validation mini-batches:
      1. critic phase: the actor is frozen and the critic is trained with
         MSE loss to predict the value compute_bias returns for the batch;
      2. actor phase: the critic is frozen and the actor is trained on BCE
         loss scaled by a factor (never below 1) that grows with the
         critic's predicted bias.
    Afterwards the best threshold is chosen with get_best_thresh and the
    thresholded predictions are scored on the validation and test sets.

    model_state_dict: state dict loaded into the actor before training.
    data: dataset object; this function reads num_features, X_valid,
        X_valid_gpu, y_valid, y_valid_gpu, p_valid and X_test_gpu.
    config: configuration mapping; reads config['adversarial'],
        config['objective']['epsilon'], config['metric'] and the optional
        config['hyperparameters'].
    device: device the actor and critic are moved to.

    Returns (results_valid, results_test).
    '''
    logger.info('Training Adversarial model.')
    actor = load_model(data.num_features, config.get('hyperparameters', {}))
    actor.load_state_dict(model_state_dict)
    actor.to(device)
    # Falls back to 32 only when config has no 'hyperparameters' entry at all.
    hid = config['hyperparameters'][
        'hid'] if 'hyperparameters' in config else 32
    # The critic's first argument is hid * batch_size.
    # NOTE(review): presumably it consumes a whole batch of truncated actor
    # activations at once -- confirm against Critic.
    critic = Critic(hid * config['adversarial']['batch_size'],
                    num_deep=config['adversarial']['num_deep'],
                    hid=hid)
    critic.to(device)
    critic_optimizer = optim.Adam(critic.parameters())
    critic_loss_fn = torch.nn.MSELoss()

    actor_optimizer = optim.Adam(actor.parameters(),
                                 lr=config['adversarial']['lr'])
    actor_loss_fn = torch.nn.BCELoss()

    for epoch in range(config['adversarial']['epochs']):

        # ---- Critic phase: only the critic's parameters receive gradients.
        for param in critic.parameters():
            param.requires_grad = True
        for param in actor.parameters():
            param.requires_grad = False
        actor.eval()
        critic.train()
        for step in range(config['adversarial']['critic_steps']):
            critic_optimizer.zero_grad()
            # Sample a batch of validation rows with replacement.
            indices = torch.randint(0, data.X_valid.size(0),
                                    (config['adversarial']['batch_size'], ))
            cX_valid = data.X_valid_gpu[indices]
            cy_valid = data.y_valid[indices]
            cp_valid = data.p_valid[indices]
            with torch.no_grad():
                scores = actor(cX_valid)[:, 0].reshape(-1).cpu().numpy()

            # Regression target for the critic: the bias of this batch.
            bias = compute_bias(scores, cy_valid.numpy(), cp_valid,
                                config['metric'])

            res = critic(actor.trunc_forward(cX_valid))
            loss = critic_loss_fn(torch.tensor([bias], device=device), res[0])
            loss.backward()
            train_loss = loss.item()
            critic_optimizer.step()
            if (epoch % 10 == 0) and (step % 100 == 0):
                logger.info(
                    f'=======> Critic Epoch: {(epoch, step)} loss: {train_loss}'
                )

        # ---- Actor phase: only the actor's parameters receive gradients.
        for param in critic.parameters():
            param.requires_grad = False
        for param in actor.parameters():
            param.requires_grad = True
        actor.train()
        critic.eval()
        for step in range(config['adversarial']['actor_steps']):
            actor_optimizer.zero_grad()
            indices = torch.randint(0, data.X_valid.size(0),
                                    (config['adversarial']['batch_size'], ))
            cy_valid = data.y_valid_gpu[indices]
            cX_valid = data.X_valid_gpu[indices]

            pred_bias = critic(actor.trunc_forward(cX_valid))
            bceloss = actor_loss_fn(actor(cX_valid)[:, 0], cy_valid)

            # The BCE loss is multiplied by
            #   max(1, lambda * (|pred_bias| - epsilon + margin) + 1),
            # so the multiplier is never smaller than 1.
            objloss = max(
                1, config['adversarial']['lambda'] *
                (abs(pred_bias[0][0]) - config['objective']['epsilon'] +
                 config['adversarial']['margin']) + 1) * bceloss

            objloss.backward()
            train_loss = objloss.item()
            actor_optimizer.step()
            if (epoch % 10 == 0) and (step % 100 == 0):
                logger.info(
                    f'=======> Actor Epoch: {(epoch, step)} loss: {train_loss}'
                )

        # Periodic progress report: best objective over 1001 thresholds.
        if epoch % 10 == 0:
            with torch.no_grad():
                scores = actor(data.X_valid_gpu)[:, 0].reshape(-1, 1).cpu().numpy()
                _, best_adv_obj = get_best_thresh(
                    scores,
                    np.linspace(0, 1, 1001),
                    data,
                    config,
                    valid=False,
                    margin=config['adversarial']['margin'])
                logger.info(f'Objective: {best_adv_obj}')

    logger.info('Finding optimal threshold for Adversarial model.')
    with torch.no_grad():
        scores = actor(data.X_valid_gpu)[:, 0].reshape(-1, 1).cpu().numpy()

    best_adv_thresh, _ = get_best_thresh(
        scores,
        np.linspace(0, 1, 1001),
        data,
        config,
        valid=False,
        margin=config['adversarial']['margin'])

    logger.info('Evaluating Adversarial model on best threshold.')
    with torch.no_grad():
        labels = (actor(data.X_valid_gpu)[:, 0] > best_adv_thresh).reshape(
            -1, 1).cpu().numpy()
    results_valid = get_valid_objective(labels, data, config)
    logger.info(f'Results: {results_valid}')

    with torch.no_grad():
        labels = (actor(data.X_test_gpu)[:, 0] > best_adv_thresh).reshape(
            -1, 1).cpu().numpy()
    results_test = get_test_objective(labels, data, config)

    return results_valid, results_test
def mitigating_debiasing(model_state_dict, data, config, device):
    '''
    Fine-tune a model against a critic that predicts data.p_valid_gpu from
    the actor's truncated activations, then evaluate it.

    Each epoch alternates two phases on random validation mini-batches:
      1. critic phase: the actor is frozen and the critic is trained with
         BCE loss against p_valid_gpu;
      2. actor phase: the critic is frozen and, per actor parameter, the
         prediction-loss gradient is combined with the adversary-loss
         gradient (see the inline comments) before the optimiser step.
    Afterwards the best threshold is chosen with get_best_thresh and the
    thresholded predictions are scored on the validation and test sets.

    model_state_dict: state dict loaded into the actor before training.
    data: dataset object; this function reads num_features, X_valid,
        X_valid_gpu, y_valid_gpu, p_valid_gpu and X_test_gpu.
    config: configuration mapping; reads config['mitigating'] and the
        optional config['hyperparameters'].
    device: device the actor and critic are moved to.

    Returns (results_valid, results_test).
    '''
    logger.info('Training Mitigating model.')
    actor = load_model(data.num_features, config.get('hyperparameters', {}))
    actor.load_state_dict(model_state_dict)
    actor.to(device)
    # The critic input width is hard-coded to 32.
    # NOTE(review): presumably this must match the width of
    # actor.trunc_forward's output -- confirm when 'hid' is configurable.
    # NOTE(review): nn.Softmax() is built without an explicit dim -- confirm
    # the implicit axis is the intended one.
    critic = nn.Sequential(nn.Linear(32, 32), nn.Dropout(0.2), nn.ReLU(),
                           nn.Linear(32, 32), nn.Dropout(0.2), nn.ReLU(),
                           nn.Linear(32, 32), nn.Dropout(0.2), nn.ReLU(),
                           nn.Linear(32, 2), nn.Softmax())
    critic.to(device)
    critic_optimizer = optim.Adam(critic.parameters())
    critic_loss_fn = torch.nn.BCELoss()

    actor_optimizer = optim.Adam(actor.parameters(),
                                 lr=config['mitigating']['lr'])
    actor_loss_fn = torch.nn.BCELoss()

    for epoch in range(config['mitigating']['epochs']):

        # ---- Critic phase: only the critic's parameters receive gradients.
        for param in critic.parameters():
            param.requires_grad = True
        for param in actor.parameters():
            param.requires_grad = False
        actor.eval()
        critic.train()
        for step in range(config['mitigating']['critic_steps']):
            critic_optimizer.zero_grad()
            # Sample a batch of validation rows with replacement.
            indices = torch.randint(0, data.X_valid.size(0),
                                    (config['mitigating']['batch_size'], ))
            cy_valid = data.y_valid_gpu[indices]
            cX_valid = data.X_valid_gpu[indices]
            cp_valid = data.p_valid_gpu[indices]
            # NOTE(review): `scores` is computed here but not read anywhere
            # in this loop.
            with torch.no_grad():
                scores = actor(cX_valid)[:, 0].reshape(-1).cpu().numpy()

            res = critic(actor.trunc_forward(cX_valid))
            # Column 0 of the critic output is trained against p_valid_gpu.
            loss = critic_loss_fn(res[:, 0], cp_valid.type(torch.float32))
            loss.backward()
            train_loss = loss.item()
            critic_optimizer.step()
            if (epoch % 5 == 0) and (step % 100 == 0):
                logger.info(
                    f'=======> Critic Epoch: {(epoch, step)} loss: {train_loss}'
                )

        # ---- Actor phase: only the actor's parameters receive gradients.
        for param in critic.parameters():
            param.requires_grad = False
        for param in actor.parameters():
            param.requires_grad = True
        actor.train()
        critic.eval()
        for step in range(config['mitigating']['actor_steps']):
            actor_optimizer.zero_grad()
            indices = torch.randint(0, data.X_valid.size(0),
                                    (config['mitigating']['batch_size'], ))
            cy_valid = data.y_valid_gpu[indices]
            cX_valid = data.X_valid_gpu[indices]
            cp_valid = data.p_valid_gpu[indices]

            cx_predict = actor(cX_valid)
            loss_pred = actor_loss_fn(cx_predict[:, 0], cy_valid)

            cp_predict = critic(actor.trunc_forward(cX_valid))
            loss_adv = critic_loss_fn(cp_predict[:, 0],
                                      cp_valid.type(torch.float32))

            for param in actor.parameters():
                # lp: gradient of the prediction loss w.r.t. this parameter;
                # la: gradient of the adversary loss w.r.t. this parameter.
                # Parameters for which autograd raises RuntimeError are
                # skipped.
                try:
                    lp = torch.autograd.grad(loss_pred, param,
                                             retain_graph=True)[0]
                    la = torch.autograd.grad(loss_adv, param,
                                             retain_graph=True)[0]
                except RuntimeError:
                    continue
                shape = la.shape
                lp = lp.flatten()
                la = la.flatten()
                # Component of lp along la, scaled by <lp, la>.
                # NOTE(review): la is not normalised here -- confirm that an
                # unnormalised projection is intended.
                lp_proj = (lp.T @ la) * la
                # Final update direction: lp minus that component, minus
                # alpha times the adversary gradient.
                grad = lp - lp_proj - config['mitigating']['alpha'] * la
                grad = grad.reshape(shape)
                # Hand the combined gradient to autograd for this parameter
                # so the optimiser step below uses it.
                param.backward(grad)

            actor_optimizer.step()
            if (epoch % 5 == 0) and (step % 100 == 0):
                logger.info(f'=======> Actor Epoch: {(epoch, step)}')

        # Periodic progress report: best objective over 1001 thresholds.
        if epoch % 5 == 0:
            with torch.no_grad():
                scores = actor(data.X_valid_gpu)[:, 0].reshape(-1, 1).cpu().numpy()
                _, best_mit_obj = get_best_thresh(
                    scores,
                    np.linspace(0, 1, 1001),
                    data,
                    config,
                    valid=False,
                    margin=config['mitigating']['margin'])
                logger.info(f'Objective: {best_mit_obj}')

    logger.info('Finding optimal threshold for Mitigating model.')
    with torch.no_grad():
        scores = actor(data.X_valid_gpu)[:, 0].reshape(-1, 1).cpu().numpy()

    best_mit_thresh, _ = get_best_thresh(scores,
                                         np.linspace(0, 1, 1001),
                                         data,
                                         config,
                                         valid=False,
                                         margin=config['mitigating']['margin'])

    logger.info('Evaluating Mitigating model on best threshold.')
    with torch.no_grad():
        labels = (actor(data.X_valid_gpu)[:, 0] > best_mit_thresh).reshape(
            -1, 1).cpu().numpy()
    results_valid = get_valid_objective(labels, data, config)
    logger.info(f'Results: {results_valid}')

    with torch.no_grad():
        labels = (actor(data.X_test_gpu)[:, 0] > best_mit_thresh).reshape(
            -1, 1).cpu().numpy()
    results_test = get_test_objective(labels, data, config)

    return results_valid, results_test