def model(x_train, y_train, x_control_train, x_test, y_test, x_control_test, SV):
    x_train = ut.add_intercept(x_train)
    x_test = ut.add_intercept(x_test)

    apply_fairness_constraints = 1
    apply_accuracy_constraint = 0
    sensitive_attrs = [SV]
    sensitive_attrs_to_cov_thresh = {SV: 0}
    sep_constraint = 0
    loss_function = lf._logistic_loss
    gamma = 0

    def train_test_classifier():
        w = ut.train_model(x_train, y_train, x_control_train, loss_function,
                           apply_fairness_constraints, apply_accuracy_constraint,
                           sep_constraint, sensitive_attrs,
                           sensitive_attrs_to_cov_thresh, gamma)
        train_score, test_score, correct_answers_train, correct_answers_test = ut.check_accuracy(
            w, x_train, y_train, x_test, y_test, None, None)
        distances_boundary_test = np.dot(x_test, w)
        distances_boundary_train = np.dot(x_train, w)
        prob_test = [sigmoid(x) for x in distances_boundary_test]
        prob_train = [sigmoid(x) for x in distances_boundary_train]
        all_class_labels_assigned_test = np.sign(distances_boundary_test)
        correlation_dict_test = ut.get_correlations(None, None, all_class_labels_assigned_test,
                                                    x_control_test, sensitive_attrs)
        cov_dict_test = ut.print_covariance_sensitive_attrs(None, x_test, distances_boundary_test,
                                                            x_control_test, sensitive_attrs)
        p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test],
                                                    [cov_dict_test], sensitive_attrs[0])
        # return w, p_rule, test_score
        return prob_train, prob_test

    return train_test_classifier()
def train(self, X, y, x_sensitive, fairness_constraint):
    self.x_sensitive = {"s1": x_sensitive}
    self.X = ut.add_intercept(X)
    self.y = y

    if fairness_constraint == -1.0:
        self.w = ut.train_model(self.X, self.y, self.x_sensitive, lf._logistic_loss,
                                0, 0, 0, ["s1"], {"s1": 0}, None)
    else:
        self.w = ut.train_model(self.X, self.y, self.x_sensitive, lf._logistic_loss,
                                1, 0, 0, ["s1"], {"s1": fairness_constraint}, None)

    train_score, test_score, correct_answers_train, correct_answers_test = ut.check_accuracy(
        self.w, self.X, self.y, self.X, self.y, None, None)
    distances_boundary_test = (np.dot(self.X, self.w)).tolist()
    all_class_labels_assigned_test = np.sign(distances_boundary_test)
    correlation_dict_test = ut.get_correlations(None, None, all_class_labels_assigned_test,
                                                self.x_sensitive, ["s1"])
    correlation_dict = ut.get_avg_correlation_dict([correlation_dict_test])
    non_prot_pos = correlation_dict["s1"][1][1]
    prot_pos = correlation_dict["s1"][0][1]
    p_rule = (prot_pos / non_prot_pos) * 100.0
    return self.w, p_rule, 100.0 * test_score
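# The train() method above reports the "p% rule": the ratio of positive-prediction
# rates between the protected and non-protected groups. A minimal, self-contained
# sketch of that statistic in plain NumPy follows; the helper name, predictions and
# group labels here are illustrative only and are not taken from the code above.
import numpy as np

def p_percent_rule(y_pred, z):
    """Positive rate of the protected group (z == 0) divided by the positive
    rate of the non-protected group (z == 1), expressed as a percentage."""
    pos_rate_prot = np.mean(y_pred[z == 0] == 1)
    pos_rate_non_prot = np.mean(y_pred[z == 1] == 1)
    return 100.0 * pos_rate_prot / pos_rate_non_prot

# illustrative usage with made-up predictions and group labels
y_pred_example = np.array([1, -1, 1, 1, -1, 1])
z_example = np.array([0, 0, 0, 1, 1, 1])
print(p_percent_rule(y_pred_example, z_example))  # both groups at 2/3 -> 100.0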
def main(train_file, model_path, setting, value):
    x_train, y_train, x_control_train = load_json(train_file)

    # X = ut.add_intercept(X) # add intercept to X before applying the linear classifier
    x_train = ut.add_intercept(x_train)

    # x_train, y_train, x_control_train, x_test, y_test, x_control_test = ut.split_into_train_test(X, y, x_control, 0.7)
    # print >> sys.stderr, "First row:"
    # print >> sys.stderr, x_train[0,:], y_train[0], x_control_train

    if setting == 'gamma':
        mode = {"accuracy": 1, "gamma": float(value)}
    elif setting == 'c':
        mode = {"fairness": 1}
    elif setting == 'baseline':
        mode = {}
    else:
        raise Exception("Don't know how to handle setting %s" % setting)

    thresh = {}
    if setting == 'c':
        thresh = dict((k, float(value)) for (k, v) in x_control_train.items())
    # print("Covariance threshold: %s" % thresh)

    # print("Will train classifier on %s %s-d points" % x_train.shape, file=sys.stderr)
    # print("Sensitive attribute: %s" % (x_control_train.keys(),), file=sys.stderr)
    sensitive_attrs = list(x_control_train.keys())

    w = train_classifier(x_train, y_train, x_control_train, sensitive_attrs, mode, thresh)
    # print("Model trained successfully.", file=sys.stderr)

    np.save(model_path, w)
def fit_agl(self,
            x: Union[np.ndarray, torch.Tensor],
            y: Union[np.ndarray, torch.Tensor],
            lam: Union[float, int],
            max_iters: int = 1000,
            smooth: Union[float, int] = 0,
            weights: List[Union[int, float]] = None):
    """fits the adaptive group lasso"""
    if self.beta is None and weights is None:
        print("Initial beta estimation is not available, please run fit or fit_gic first.")
        return None
    if weights is None:
        weights = self.compute_weights(self.beta)
    x = remove_intercept(x)
    x = numpy_to_torch(x)
    y = numpy_to_torch(y)
    x = self.normalize(x)
    x_basis = self.basis_expansion_(x, self.df, self.degree)
    group_size = [self.df] * len(weights)
    x_basis, group_size = add_intercept(x_basis, group_size)
    beta_agl = self.solve(x_basis, y, lam, group_size, max_iters, weights, smooth=smooth)
    self.beta_agl = beta_agl
    self.beta = beta_agl
    return self
def test_synthetic_data():
    """ Generate the synthetic data """
    X, y, x_control = generate_synthetic_data(plot_data=False)
    ut.compute_p_rule(x_control["s1"], y)  # compute the p-rule in the original data

    """ Classify the data without any constraints """
    apply_fairness_constraints = 0
    apply_accuracy_constraint = 0
    sep_constraint = 0
    loss_function = lf._logistic_loss
    X = ut.add_intercept(X)  # add intercept to X before applying the linear classifier
    test_acc_arr, train_acc_arr, correlation_dict_test_arr, correlation_dict_train_arr, cov_dict_test_arr, cov_dict_train_arr = ut.compute_cross_validation_error(
        X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints,
        apply_accuracy_constraint, sep_constraint, ['s1'],
        [{} for i in range(0, NUM_FOLDS)])
    print
    print "== Unconstrained (original) classifier =="
    ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1")

    """ Now classify such that we achieve perfect fairness """
    apply_fairness_constraints = 1
    cov_factor = 0
    test_acc_arr, train_acc_arr, correlation_dict_test_arr, correlation_dict_train_arr, cov_dict_test_arr, cov_dict_train_arr = ut.compute_cross_validation_error(
        X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints,
        apply_accuracy_constraint, sep_constraint, ['s1'],
        [{'s1': cov_factor} for i in range(0, NUM_FOLDS)])
    print
    print "== Constrained (fair) classifier =="
    ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1")

    """ Now plot a tradeoff between the fairness and accuracy """
    ut.plot_cov_thresh_vs_acc_pos_ratio(X, y, x_control, NUM_FOLDS, loss_function,
                                        apply_fairness_constraints, apply_accuracy_constraint,
                                        sep_constraint, ['s1'])
def test_synthetic_data(): """ Generate the synthetic data """ print(sys.path) X, y, x_control = generate_synthetic_data(plot_data=False) ut.compute_p_rule(x_control["s1"], y) # compute the p-rule in the original data """ Classify the data without any constraints """ apply_fairness_constraints = 0 apply_accuracy_constraint = 0 sep_constraint = 0 loss_function = lf._logistic_loss X = ut.add_intercept(X) # add intercept to X before applying the linear classifier test_acc_arr, train_acc_arr, correlation_dict_test_arr, correlation_dict_train_arr, cov_dict_test_arr, cov_dict_train_arr = ut.compute_cross_validation_error( X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, ['s1'], [{} for i in range(0, NUM_FOLDS)]) print print "== Unconstrained (original) classifier ==" ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1") """ Now classify such that we achieve perfect fairness """ apply_fairness_constraints = 1 cov_factor = 0 test_acc_arr, train_acc_arr, correlation_dict_test_arr, correlation_dict_train_arr, cov_dict_test_arr, cov_dict_train_arr = ut.compute_cross_validation_error( X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, ['s1'], [{'s1': cov_factor} for i in range(0, NUM_FOLDS)]) print print "== Constrained (fair) classifier ==" ut.print_classifier_fairness_stats(test_acc_arr, correlation_dict_test_arr, cov_dict_test_arr, "s1") """ Now plot a tradeoff between the fairness and accuracy """ ut.plot_cov_thresh_vs_acc_pos_ratio(X, y, x_control, NUM_FOLDS, loss_function, apply_fairness_constraints, apply_accuracy_constraint, sep_constraint, ['s1'])
def main(train_file, test_file, output_file, setting):
    x_train, y_train, x_control_train = load_json(train_file)
    x_test, y_test, x_control_test = load_json(test_file)

    # X = ut.add_intercept(X) # add intercept to X before applying the linear classifier
    x_train = ut.add_intercept(x_train)
    x_test = ut.add_intercept(x_test)

    # x_train, y_train, x_control_train, x_test, y_test, x_control_test = ut.split_into_train_test(X, y, x_control, 0.7)
    # print >> sys.stderr, "First row:"
    # print >> sys.stderr, x_train[0,:], y_train[0], x_control_train

    sensitive_attrs = list(x_control_train.keys())
    sensitive_attr = str(sensitive_attrs[0])

    tau = 5.0
    mu = 1.2
    sensitive_attrs_to_cov_thresh = {sensitive_attr: {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}, 2: {0: 0, 1: 0}}}
    cons_params = {"tau": tau,
                   "mu": mu,
                   "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}

    if setting == 'fnr':
        cons_type = 2
        cons_params["cons_type"] = cons_type
    elif setting == 'fprfnr':
        cons_type = 4
        cons_params["cons_type"] = cons_type
    elif setting == 'baseline':
        cons_params = None
    else:
        raise Exception("Don't know how to handle setting %s" % setting)

    # print("Will train classifier on %s %s-d points" % x_train.shape, file=sys.stderr)
    # print("Sensitive attribute: %s" % (x_control_train.keys(),), file=sys.stderr)
    sensitive_attrs = list(x_control_train.keys())

    w = train_classifier(x_train, y_train, x_control_train, cons_params)
    # print("Model trained successfully.", file=sys.stderr)

    predictions = predict(w, x_test).tolist()
    output_file = open(output_file, "w")
    json.dump(predictions, output_file)
    output_file.close()
def main(train_file, model_path, mode, tau="5.0", mu="1.2", eps="0.0001"):
    """
    Args:
        cons_type: 0 for all misclassifications
                   1 for FPR
                   2 for FNR
                   4 for both FPR and FNR
        tau: DCCP parameter, controls how much weight to put on the constraints;
             if the constraints are not satisfied, then increase tau -- default is the DCCP val 0.005
        mu: DCCP parameter, controls the multiplicative factor by which tau increases
            in each DCCP iteration -- default is the DCCP val 1.2
        eps: stopping criterion for the convex solver; check the CVXPY documentation for details.
             default for CVXPY is 1e-6
    """
    x_train, y_train, x_control_train = load_json(train_file)

    # X = ut.add_intercept(X) # add intercept to X before applying the linear classifier
    x_train = ut.add_intercept(x_train)

    # x_train, y_train, x_control_train, x_test, y_test, x_control_test = ut.split_into_train_test(X, y, x_control, 0.7)
    # print >> sys.stderr, "First row:"
    # print >> sys.stderr, x_train[0,:], y_train[0], x_control_train

    sensitive_attrs = list(x_control_train.keys())
    sensitive_attr = str(sensitive_attrs[0])

    sensitive_attrs_to_cov_thresh = {sensitive_attr: {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}, 2: {0: 0, 1: 0}}}
    cons_params = {"tau": float(tau),
                   "mu": float(mu),
                   "sensitive_attrs_to_cov_thresh": sensitive_attrs_to_cov_thresh}

    if mode == 'fpr':
        cons_type = 1
        cons_params["cons_type"] = cons_type
    elif mode == 'fnr':
        cons_type = 2
        cons_params["cons_type"] = cons_type
    elif mode == 'fprfnr':
        cons_type = 4
        cons_params["cons_type"] = cons_type
    elif mode == 'baseline':
        cons_params = None
    else:
        raise Exception("Don't know how to handle setting %s" % mode)

    # print("Will train classifier on %s %s-d points" % x_train.shape, file=sys.stderr)
    # print("Sensitive attribute: %s" % (x_control_train.keys(),), file=sys.stderr)
    sensitive_attrs = list(x_control_train.keys())

    w = train_classifier(x_train, y_train, x_control_train, cons_params, float(eps))
    # print("Model trained successfully.", file=sys.stderr)

    np.save(model_path, w)
def main(test_file, model_path, output_file):
    x_test, y_test, x_control_test = load_json(test_file)

    # X = ut.add_intercept(X) # add intercept to X before applying the linear classifier
    x_test = ut.add_intercept(x_test)

    w = np.load(model_path)

    predictions = predict(w, x_test).tolist()
    output_file = open(output_file, "w")
    json.dump(predictions, output_file)
    output_file.close()
def predict(self, x):
    """Make predictions on data x

    Args:
        x: Inputs of shape (m, n)

    Returns:
        Outputs of shape (m,)
    """
    sigmoid = lambda z: 1 / (1 + np.exp(-z))
    x = utils.add_intercept(x)
    probs = sigmoid(np.einsum("ij,j->i", x, self.theta))
    preds = (probs >= 0.5).astype(int)  # use builtin int; np.int is deprecated in recent NumPy
    return preds
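# For context, a minimal self-contained sketch of the same prediction rule in plain
# NumPy: prepend an intercept column, apply the logistic function to the linear
# score, and threshold at 0.5. The helper and the parameter values below are
# illustrative assumptions, not taken from the repository.
import numpy as np

def add_intercept_np(x):
    # assumed behaviour of the add_intercept helper: prepend a column of ones
    return np.hstack([np.ones((x.shape[0], 1)), x])

theta_example = np.array([0.1, 2.0, -1.0])        # hypothetical parameters: bias, w1, w2
x_example = np.array([[0.5, 0.2], [-1.0, 1.5]])   # two 2-d examples
probs_example = 1 / (1 + np.exp(-add_intercept_np(x_example) @ theta_example))
preds_example = (probs_example >= 0.5).astype(int)
print(probs_example, preds_example)               # approx [0.71, 0.03] -> [1, 0]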
def fit_2(self,
          x: Union[np.ndarray, torch.Tensor],
          y: Union[np.ndarray, torch.Tensor],
          num_lams: int,
          max_iters: int = 1000,
          an: Union[int, float] = None,
          smooth: Union[float, int] = 0):
    """fit the group lasso followed by the adaptive group lasso; saves time on basis expansion"""
    x = numpy_to_torch(x)
    y = numpy_to_torch(y)
    x = remove_intercept(x)
    x = self.normalize(x)
    x_basis = self.basis_expansion_(x, self.df, self.degree)
    group_size = [self.df] * x.shape[1]
    x_basis, group_size = add_intercept(x_basis, group_size)

    result = self.fit_path(x_basis, y, group_size, num_lams, max_iters, smooth=smooth)
    beta_gl = result[min(list(result.keys()))]
    weights = self.compute_weights(beta_gl)
    result = self.fit_path(x_basis, y, group_size, num_lams, max_iters, smooth=smooth, weights=weights)

    best_gic = np.inf
    best_lam = 0
    best_beta = None
    if an is None:
        an = np.log(x.shape[1]) / x.shape[0]
    for lam in result.keys():
        beta_full = result[lam]
        gic = self.compute_gic(x_basis, y, beta_full, an, group_size)
        print(f"lam:{lam}, gic:{gic}")
        if gic < best_gic:
            best_lam = lam
            best_beta = beta_full
            best_gic = gic

    self.beta_agl_gic = best_beta
    self.beta = best_beta
    num_nz, nz = compute_nonzeros(best_beta, group_size)
    print(f"The best lam {best_lam} and the best gic {best_gic}. Finally selected {num_nz - 1} nonzeros: {nz}")
    return self
def predict(self, x: Union[np.ndarray, torch.Tensor]):
    """predicts x"""
    x = numpy_to_torch(x)
    x = remove_intercept(x)
    x = self.normalize_test(x)
    x_basis = self.basis_expansion_(x, self.df, self.degree)
    x_basis = add_intercept(x_basis)
    eta = torch.matmul(x_basis, self.beta)
    if self.data_class == 'regression':
        return eta
    elif self.data_class == 'classification':
        return torch.where(sigmoid(eta) > 0.5, torch.ones(len(eta)), torch.zeros(len(eta)))
    elif self.data_class == 'gamma':
        return torch.exp(-eta)
    else:
        return torch.round(torch.exp(eta))
def fit_gic(self,
            x: Union[np.ndarray, torch.Tensor],
            y: Union[np.ndarray, torch.Tensor],
            num_lams: int,
            max_iters: int = 1000,
            an: Union[int, float] = None,
            smooth: Union[int, float] = 0):
    """fits the group lasso with gic"""
    x = numpy_to_torch(x)
    y = numpy_to_torch(y)
    x = remove_intercept(x)
    x = self.normalize(x)
    x_basis = self.basis_expansion_(x, self.df, self.degree)
    group_size = [self.df] * x.shape[1]
    x_basis, group_size = add_intercept(x_basis, group_size)

    result = self.fit_path(x_basis, y, group_size, num_lams, max_iters, smooth=smooth)
    best_gic = np.inf
    if an is None:
        an = self.df * np.log(np.log(x.shape[0])) * np.log(x.shape[1]) / x.shape[0]
    for lam in result.keys():
        gic = self.compute_gic(x_basis, y, result[lam], an, group_size)
        # print(f"lam:{lam}, gic:{gic}")
        if gic < best_gic:
            best_lam = lam
            best_beta = result[lam]
            best_gic = gic

    self.beta_gic = best_beta
    self.beta = best_beta
    print(f"The best lam {best_lam} and the best gic {best_gic}.")
    return self
def load_compas_data():
    FEATURES_CLASSIFICATION = ["age_cat", "race", "sex", "priors_count", "c_charge_degree"]  # features to be used for classification
    CONT_VARIABLES = ["priors_count"]  # continuous features, handled separately from categorical features; categorical features will be one-hot encoded
    CLASS_FEATURE = "two_year_recid"  # the decision variable
    SENSITIVE_ATTRS = ["race", "sex"]

    COMPAS_INPUT_FILE = "compas-scores-two-years.csv"
    check_data_file(COMPAS_INPUT_FILE)

    # load the data and get some stats
    df = pd.read_csv(COMPAS_INPUT_FILE)

    # convert to np array
    data = df.to_dict('list')
    for k in data.keys():
        data[k] = np.array(data[k])

    """ Filtering the data """
    # These filters are the same as ProPublica's (refer to https://github.com/propublica/compas-analysis)
    # If the charge date of a defendant's COMPAS-scored crime was not within 30 days of the arrest,
    # we assume that, for data quality reasons, we do not have the right offense.
    idx = np.logical_and(data["days_b_screening_arrest"] <= 30, data["days_b_screening_arrest"] >= -30)

    # We coded the recidivist flag -- is_recid -- to be -1 if we could not find a COMPAS case at all.
    idx = np.logical_and(idx, data["is_recid"] != -1)

    # In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O', which do not
    # result in jail time -- are removed (only two of them).
    idx = np.logical_and(idx, data["c_charge_degree"] != "O")  # F: felony, M: misdemeanor

    # We filtered the underlying data from Broward county to include only those rows representing people
    # who had either recidivated in two years, or had at least two years outside of a correctional facility.
    idx = np.logical_and(idx, data["score_text"] != "NA")

    # we will only consider blacks and whites for this analysis
    idx = np.logical_and(idx, np.logical_or(data["race"] == "African-American", data["race"] == "Caucasian"))

    # select the examples that satisfy this criteria
    for k in data.keys():
        data[k] = data[k][idx]

    """ Feature normalization and one hot encoding """
    # convert class label 0 to -1
    y = data[CLASS_FEATURE]
    y[y == 0] = -1

    print "\nNumber of people recidivating within two years"
    print pd.Series(y).value_counts()
    print "\n"

    X = np.array([]).reshape(len(y), 0)  # empty array with num rows same as num examples; features will be hstacked onto it
    x_control = defaultdict(list)

    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        vals = data[attr]
        if attr in CONT_VARIABLES:
            vals = [float(v) for v in vals]
            vals = preprocessing.scale(vals)  # 0 mean and 1 variance
            vals = np.reshape(vals, (len(y), -1))  # convert from 1-d arr to a 2-d arr with one col
        else:  # for binary categorical variables, the label binarizer uses just one var instead of two
            lb = preprocessing.LabelBinarizer()
            lb.fit(vals)
            vals = lb.transform(vals)

        # add to sensitive features dict
        if attr in SENSITIVE_ATTRS:
            x_control[attr] = vals

        # add to learnable features
        X = np.hstack((X, vals))

        if attr in CONT_VARIABLES:  # continuous feature, just append the name
            feature_names.append(attr)
        else:  # categorical features
            if vals.shape[1] == 1:  # binary features that passed through lib binarizer
                feature_names.append(attr)
            else:
                for k in lb.classes_:  # non-binary categorical features, need to add the names for each cat
                    feature_names.append(attr + "_" + str(k))

    # convert the sensitive feature to 1-d array
    x_control = dict(x_control)
    for k in x_control.keys():
        assert (x_control[k].shape[1] == 1)  # make sure that the sensitive feature is binary after one hot encoding
        x_control[k] = np.array(x_control[k]).flatten()

    # sys.exit(1)

    """ Permute the data randomly """
    perm = range(0, X.shape[0])
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    for k in x_control.keys():
        x_control[k] = x_control[k][perm]

    X = ut.add_intercept(X)

    feature_names = ["intercept"] + feature_names
    assert (len(feature_names) == X.shape[1])
    print "Features we will be using for classification are:", feature_names, "\n"

    return X, y, x_control
print X_train.shape, y_train.shape, X_test.shape, y_test.shape
# print X_train[0:3,0:3]

# standardize data
X_train, scaler = standardize(X_train)
X_test, _ = standardize(X_test, scaler)

# one dimension at a time
y_train = y_train[:, 0]
y_test = y_test[:, 0]

print X_train.shape, y_train.shape, X_test.shape, y_test.shape
tst_song = len(song_id_tst)

# add column of ones to the data to account for the bias:
X_train = add_intercept(X_train)
print X_train.shape
# print X_train[0:10]

data = dict(x=X_train, y=y_train)

with Model() as model:
    # specify the glm and pass in the data. The resulting linear model, its likelihood
    # and all its parameters are automatically added to our model.
    glm.glm('y ~ x', data)
    start = find_MAP()
    step = NUTS(scaling=start)  # instantiate the MCMC sampling algorithm
    trace = sample(100, step, progressbar=False)  # draw posterior samples using NUTS sampling
def load_bank_marketing_data():
    FEATURES_CLASSIFICATION = ["age", "job", "marital", "education", "default", "housing", "loan",
                               "contact", "month", "day_of_week", "poutcome"]  # features to be used for classification
    CONT_VARIABLES = []  # continuous features, handled separately from categorical features; categorical features will be one-hot encoded
    CLASS_FEATURE = "y"  # the decision variable
    SENSITIVE_ATTRS = ["age"]

    INPUT_FILE = "bank-additional-full.csv"

    # load the data and get some stats
    df = pd.read_csv(INPUT_FILE, sep=";")
    df = df.dropna(subset=FEATURES_CLASSIFICATION)  # dropping missing vals

    # convert to np array
    data = df.to_dict('list')
    for k in data.keys():
        data[k] = np.array(data[k])

    """ Filtering the data """
    # the downloaded data are already pre-processed

    """ Feature normalization and one hot encoding """
    y = data[CLASS_FEATURE]
    y[y == "yes"] = 1
    y[y == "no"] = -1
    y = y.astype('int32')

    # convert 'age' to a binary sensitive attribute: privileged is `age >= 25`, unprivileged is `age < 25`
    for i in range(len(data["age"])):
        if int(data["age"][i]) >= 25:
            data["age"][i] = "privileged"
        elif int(data["age"][i]) < 25:
            data["age"][i] = "unprivileged"

    print("\nNumber of clients subscribed to a term deposit")
    print(pd.Series(y).value_counts())
    print("\n")

    X = np.array([]).reshape(len(y), 0)  # empty array with num rows same as num examples; features will be hstacked onto it
    x_control = defaultdict(list)

    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        vals = data[attr]
        if attr in CONT_VARIABLES:
            vals = [float(v) for v in vals]
            vals = preprocessing.scale(vals)  # 0 mean and 1 variance
            vals = np.reshape(vals, (len(y), -1))  # convert from 1-d arr to a 2-d arr with one col
        else:  # for binary categorical variables, the label binarizer uses just one var instead of two
            lb = preprocessing.LabelBinarizer()
            lb.fit(vals)
            vals = lb.transform(vals)

        # add to sensitive features dict
        if attr in SENSITIVE_ATTRS:
            x_control[attr] = vals

        # add to learnable features
        X = np.hstack((X, vals))

        if attr in CONT_VARIABLES:  # continuous feature, just append the name
            feature_names.append(attr)
        else:  # categorical features
            if vals.shape[1] == 1:  # binary features that passed through lib binarizer
                feature_names.append(attr)
            else:
                for k in lb.classes_:  # non-binary categorical features, need to add the names for each cat
                    feature_names.append(attr + "_" + str(k))

    # convert the sensitive feature to 1-d array
    x_control = dict(x_control)
    for k in x_control.keys():
        assert (x_control[k].shape[1] == 1)  # make sure that the sensitive feature is binary after one hot encoding
        x_control[k] = np.array(x_control[k]).flatten()

    # sys.exit(1)

    """ Permute the data randomly """
    perm = list(range(0, X.shape[0]))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    for k in x_control.keys():
        x_control[k] = x_control[k][perm]

    X = ut.add_intercept(X)

    feature_names = ["intercept"] + feature_names
    assert (len(feature_names) == X.shape[1])
    print("Features we will be using for classification are: " + str(feature_names) + "\n")

    return X, y, x_control
# results = model.fit()
# print(results.summary())
algo = pyhdfe.create(get_np_columns(df, ['idcode', 'year'], False), degrees_method='pairwise')
residualized = algo.residualize(get_np_columns(df, ['ln_wage', 'hours_log'], False))
print(algo.degrees)

import pdb; pdb.set_trace()

# model = sm.OLS(residualized[:,0], np.ones((residualized.shape[0], 1)))
model = sm.OLS(residualized[:, 0], add_intercept(residualized[:, 1]))
# print(add_intercept(residualized[:,1])[:10])

ids = get_np_columns(df, ['idcode', 'year'], False)
all_group_indices = []
for i in range(ids.shape[1]):
    col = ids[:, i]
    # col_n[0] = sorted unique values
    # col_n[1] = unique_indices
    # col_n[2] = unique_inverse
    unique_values, standardized_ids = np.unique(col, return_inverse=True)
    # Okay, I have a column with unique values in indices, call it a.
    # I want a list of lists b such that the first list contains
def test_synthetic_data():
    """ Generate the synthetic data """
    X, y, x_control = generate_synthetic_data(plot_data=True)  # set plot_data to False to skip the data plot
    ut.compute_p_rule(x_control["s1"], y)  # compute the p-rule in the original data

    """ Split the data into train and test """
    X = ut.add_intercept(X)  # add intercept to X before applying the linear classifier
    train_fold_size = 0.7
    x_train, y_train, x_control_train, x_test, y_test, x_control_test = ut.split_into_train_test(X, y, x_control, train_fold_size)

    apply_fairness_constraints = None
    apply_accuracy_constraint = None
    sep_constraint = None

    loss_function = lf._logistic_loss
    sensitive_attrs = ["s1"]
    sensitive_attrs_to_cov_thresh = {}
    gamma = None

    def train_test_classifier():
        w = ut.train_model(x_train, y_train, x_control_train, loss_function, apply_fairness_constraints,
                           apply_accuracy_constraint, sep_constraint, sensitive_attrs,
                           sensitive_attrs_to_cov_thresh, gamma)
        train_score, test_score, correct_answers_train, correct_answers_test = ut.check_accuracy(w, x_train, y_train, x_test, y_test, None, None)
        distances_boundary_test = (np.dot(x_test, w)).tolist()
        all_class_labels_assigned_test = np.sign(distances_boundary_test)
        correlation_dict_test = ut.get_correlations(None, None, all_class_labels_assigned_test, x_control_test, sensitive_attrs)
        cov_dict_test = ut.print_covariance_sensitive_attrs(None, x_test, distances_boundary_test, x_control_test, sensitive_attrs)
        p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test], [cov_dict_test], sensitive_attrs[0])
        return w, p_rule, test_score

    def plot_boundaries(w1, w2, p1, p2, acc1, acc2, fname):
        num_to_draw = 200  # we will only draw a small number of points to avoid clutter
        x_draw = X[:num_to_draw]
        y_draw = y[:num_to_draw]
        x_control_draw = x_control["s1"][:num_to_draw]

        X_s_0 = x_draw[x_control_draw == 0.0]
        X_s_1 = x_draw[x_control_draw == 1.0]
        y_s_0 = y_draw[x_control_draw == 0.0]
        y_s_1 = y_draw[x_control_draw == 1.0]
        plt.scatter(X_s_0[y_s_0 == 1.0][:, 1], X_s_0[y_s_0 == 1.0][:, 2], color='green', marker='x', s=30, linewidth=1.5)
        plt.scatter(X_s_0[y_s_0 == -1.0][:, 1], X_s_0[y_s_0 == -1.0][:, 2], color='red', marker='x', s=30, linewidth=1.5)
        plt.scatter(X_s_1[y_s_1 == 1.0][:, 1], X_s_1[y_s_1 == 1.0][:, 2], color='green', marker='o', facecolors='none', s=30)
        plt.scatter(X_s_1[y_s_1 == -1.0][:, 1], X_s_1[y_s_1 == -1.0][:, 2], color='red', marker='o', facecolors='none', s=30)

        x1, x2 = max(x_draw[:, 1]), min(x_draw[:, 1])
        y1, y2 = ut.get_line_coordinates(w1, x1, x2)
        plt.plot([x1, x2], [y1, y2], 'c-', linewidth=3, label="Acc=%0.2f; p%% rule=%0.0f%% - Original" % (acc1, p1))
        y1, y2 = ut.get_line_coordinates(w2, x1, x2)
        plt.plot([x1, x2], [y1, y2], 'b--', linewidth=3, label="Acc=%0.2f; p%% rule=%0.0f%% - Constrained" % (acc2, p2))

        plt.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off')  # don't need the ticks to see the data distribution
        plt.tick_params(axis='y', which='both', left='off', right='off', labelleft='off')
        plt.legend(loc=2, fontsize=15)
        plt.xlim((-15, 10))
        plt.ylim((-10, 15))
        plt.savefig(fname)
        plt.show()

    """ Classify the data while optimizing for accuracy """
    print
    print "== Unconstrained (original) classifier =="
    # all constraint flags are set to 0 since we want to train an unconstrained (original) classifier
    apply_fairness_constraints = 0
    apply_accuracy_constraint = 0
    sep_constraint = 0
    w_uncons, p_uncons, acc_uncons = train_test_classifier()

    """ Now classify such that we optimize for accuracy while achieving perfect fairness """
    apply_fairness_constraints = 1  # set this flag to one since we want to optimize accuracy subject to fairness constraints
    apply_accuracy_constraint = 0
    sep_constraint = 0
    sensitive_attrs_to_cov_thresh = {"s1": 0}
    print
    print "== Classifier with fairness constraint =="
    w_f_cons, p_f_cons, acc_f_cons = train_test_classifier()
    plot_boundaries(w_uncons, w_f_cons, p_uncons, p_f_cons, acc_uncons, acc_f_cons, "img/f_cons.png")

    """ Classify such that we optimize for fairness subject to a certain loss in accuracy """
    apply_fairness_constraints = 0  # flag for fairness constraint is set back to 0 since we want to apply the accuracy constraint now
    apply_accuracy_constraint = 1  # now, we want to optimize fairness subject to accuracy constraints
    sep_constraint = 0
    gamma = 0.5  # gamma controls how much loss in accuracy we are willing to incur to achieve fairness -- increase gamma to allow more loss in accuracy
    print "== Classifier with accuracy constraint =="
    w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()
    plot_boundaries(w_uncons, w_a_cons, p_uncons, p_a_cons, acc_uncons, acc_a_cons, "img/a_cons.png")

    """
    Classify such that we optimize for fairness subject to a certain loss in accuracy.
    In addition, make sure that no points classified as positive by the unconstrained (original) classifier are misclassified.
    """
    apply_fairness_constraints = 0  # flag for fairness constraint is set back to 0 since we want to apply the accuracy constraint now
    apply_accuracy_constraint = 1  # now, we want to optimize accuracy subject to fairness constraints
    sep_constraint = 1  # in addition to the accuracy constraint, we also want no misclassifications for certain points (details in demo README.md)
    gamma = 2000.0
    print "== Classifier with accuracy constraint (no +ve misclassification) =="
    w_a_cons_fine, p_a_cons_fine, acc_a_cons_fine = train_test_classifier()
    plot_boundaries(w_uncons, w_a_cons_fine, p_uncons, p_a_cons_fine, acc_uncons, acc_a_cons_fine, "img/a_cons_fine.png")

    return
def load_meps_data():
    # TO DO: CHANGE THIS
    FEATURES_CLASSIFICATION = ['REGION', 'AGE', 'SEX', 'RACE', 'MARRY',
                               'FTSTU', 'ACTDTY', 'HONRDC', 'RTHLTH', 'MNHLTH', 'HIBPDX', 'CHDDX', 'ANGIDX',
                               'MIDX', 'OHRTDX', 'STRKDX', 'EMPHDX', 'CHBRON', 'CHOLDX', 'CANCERDX', 'DIABDX',
                               'JTPAIN', 'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX', 'PREGNT', 'WLKLIM',
                               'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42', 'DFSEE42', 'ADSMOK42', 'PCS42',
                               'MCS42', 'K6SUM42', 'PHQ242', 'EMPST', 'POVCAT', 'INSCOV', 'PERWT16F']  # features to be used for classification
    CONT_VARIABLES = ['AGE', 'PCS42', 'MCS42', 'K6SUM42', 'PERWT16F']  # continuous features, handled separately from categorical features; categorical features will be one-hot encoded
    CLASS_FEATURE = 'UTILIZATION'  # the decision variable
    SENSITIVE_ATTRS = ['RACE']

    INPUT_FILE = "h192.csv"

    # load the data and get some stats
    df = pd.read_csv(INPUT_FILE)

    """ Filtering the data """
    df['RACEV2X'] = df.apply(lambda row: race(row), axis=1)
    df = df.rename(columns={'RACEV2X': 'RACE'})

    df = df[df['PANEL'] == 21]

    df = df.rename(columns={'FTSTU53X': 'FTSTU', 'ACTDTY53': 'ACTDTY', 'HONRDC53': 'HONRDC', 'RTHLTH53': 'RTHLTH',
                            'MNHLTH53': 'MNHLTH', 'CHBRON53': 'CHBRON', 'JTPAIN53': 'JTPAIN', 'PREGNT53': 'PREGNT',
                            'WLKLIM53': 'WLKLIM', 'ACTLIM53': 'ACTLIM', 'SOCLIM53': 'SOCLIM', 'COGLIM53': 'COGLIM',
                            'EMPST53': 'EMPST', 'REGION53': 'REGION', 'MARRY53X': 'MARRY', 'AGE53X': 'AGE',
                            'POVCAT16': 'POVCAT', 'INSCOV16': 'INSCOV'})

    df = df[df['REGION'] >= 0]  # remove values -1
    df = df[df['AGE'] >= 0]  # remove values -1
    df = df[df['MARRY'] >= 0]  # remove values -1, -7, -8, -9
    df = df[df['ASTHDX'] >= 0]  # remove values -1, -7, -8, -9
    df = df[(df[['FTSTU', 'ACTDTY', 'HONRDC', 'RTHLTH', 'MNHLTH', 'HIBPDX', 'CHDDX', 'ANGIDX', 'EDUCYR', 'HIDEG',
                 'MIDX', 'OHRTDX', 'STRKDX', 'EMPHDX', 'CHBRON', 'CHOLDX', 'CANCERDX', 'DIABDX',
                 'JTPAIN', 'ARTHDX', 'ARTHTYPE', 'ASTHDX', 'ADHDADDX', 'PREGNT', 'WLKLIM',
                 'ACTLIM', 'SOCLIM', 'COGLIM', 'DFHEAR42', 'DFSEE42', 'ADSMOK42',
                 'PHQ242', 'EMPST', 'POVCAT', 'INSCOV']] >= -1).all(1)]  # for all other categorical features, remove values < -1

    df['TOTEXP16'] = df.apply(lambda row: utilization(row), axis=1)
    lessE = df['TOTEXP16'] < 10.0
    df.loc[lessE, 'TOTEXP16'] = -1.0
    moreE = df['TOTEXP16'] >= 10.0
    df.loc[moreE, 'TOTEXP16'] = 1.0

    df = df.rename(columns={'TOTEXP16': 'UTILIZATION'})
    df = df.dropna(subset=FEATURES_CLASSIFICATION)  # dropping missing vals

    # convert to np array
    data = df.to_dict('list')
    for k in data.keys():
        data[k] = np.array(data[k])

    """ Feature normalization and one hot encoding """
    y = data[CLASS_FEATURE]

    print("\nNumber of clients subscribed to a term deposit")
    print(pd.Series(y).value_counts())
    print("\n")

    X = np.array([]).reshape(len(y), 0)  # empty array with num rows same as num examples; features will be hstacked onto it
    x_control = defaultdict(list)

    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        vals = data[attr]
        if attr in CONT_VARIABLES:
            vals = [float(v) for v in vals]
            vals = preprocessing.scale(vals)  # 0 mean and 1 variance
            vals = np.reshape(vals, (len(y), -1))  # convert from 1-d arr to a 2-d arr with one col
        else:  # for binary categorical variables, the label binarizer uses just one var instead of two
            lb = preprocessing.LabelBinarizer()
            lb.fit(vals)
            vals = lb.transform(vals)

        # add to sensitive features dict
        if attr in SENSITIVE_ATTRS:
            x_control[attr] = vals

        # add to learnable features
        X = np.hstack((X, vals))

        if attr in CONT_VARIABLES:  # continuous feature, just append the name
            feature_names.append(attr)
        else:  # categorical features
            if vals.shape[1] == 1:  # binary features that passed through lib binarizer
                feature_names.append(attr)
            else:
                for k in lb.classes_:  # non-binary categorical features, need to add the names for each cat
                    feature_names.append(attr + "_" + str(k))

    # convert the sensitive feature to 1-d array
    x_control = dict(x_control)
    for k in x_control.keys():
        assert (x_control[k].shape[1] == 1)  # make sure that the sensitive feature is binary after one hot encoding
        x_control[k] = np.array(x_control[k]).flatten()

    # sys.exit(1)

    """ Permute the data randomly """
    perm = list(range(0, X.shape[0]))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    for k in x_control.keys():
        x_control[k] = x_control[k][perm]

    X = ut.add_intercept(X)

    feature_names = ["intercept"] + feature_names
    assert (len(feature_names) == X.shape[1])
    print("Features we will be using for classification are: " + str(feature_names) + "\n")

    return X, y, x_control
def load_compas_data():
    FEATURES_CLASSIFICATION = ["age_cat", "race", "sex", "priors_count", "c_charge_degree"]  # features to be used for classification
    CONT_VARIABLES = ["priors_count"]  # continuous features, handled separately from categorical features; categorical features will be one-hot encoded
    CLASS_FEATURE = "two_year_recid"  # the decision variable
    SENSITIVE_ATTRS = ["race"]

    COMPAS_INPUT_FILE = "compas-scores-two-years.csv"
    check_data_file(COMPAS_INPUT_FILE)

    # load the data and get some stats
    df = pd.read_csv(COMPAS_INPUT_FILE)
    df = df.dropna(subset=["days_b_screening_arrest"])  # dropping missing vals

    # convert to np array
    data = df.to_dict('list')
    for k in data.keys():
        data[k] = np.array(data[k])

    """ Filtering the data """
    # These filters are the same as ProPublica's (refer to https://github.com/propublica/compas-analysis)
    # If the charge date of a defendant's COMPAS-scored crime was not within 30 days of the arrest,
    # we assume that, for data quality reasons, we do not have the right offense.
    idx = np.logical_and(data["days_b_screening_arrest"] <= 30, data["days_b_screening_arrest"] >= -30)

    # We coded the recidivist flag -- is_recid -- to be -1 if we could not find a COMPAS case at all.
    idx = np.logical_and(idx, data["is_recid"] != -1)

    # In a similar vein, ordinary traffic offenses -- those with a c_charge_degree of 'O', which do not
    # result in jail time -- are removed (only two of them).
    idx = np.logical_and(idx, data["c_charge_degree"] != "O")  # F: felony, M: misdemeanor

    # We filtered the underlying data from Broward county to include only those rows representing people
    # who had either recidivated in two years, or had at least two years outside of a correctional facility.
    idx = np.logical_and(idx, data["score_text"] != "NA")

    # we will only consider blacks and whites for this analysis
    idx = np.logical_and(idx, np.logical_or(data["race"] == "African-American", data["race"] == "Caucasian"))

    # select the examples that satisfy this criteria
    for k in data.keys():
        data[k] = data[k][idx]

    """ Feature normalization and one hot encoding """
    # convert class label 0 to -1
    y = data[CLASS_FEATURE]
    y[y == 0] = -1

    print "\nNumber of people recidivating within two years"
    print pd.Series(y).value_counts()
    print "\n"

    X = np.array([]).reshape(len(y), 0)  # empty array with num rows same as num examples; features will be hstacked onto it
    x_control = defaultdict(list)

    feature_names = []
    for attr in FEATURES_CLASSIFICATION:
        vals = data[attr]
        if attr in CONT_VARIABLES:
            vals = [float(v) for v in vals]
            vals = preprocessing.scale(vals)  # 0 mean and 1 variance
            vals = np.reshape(vals, (len(y), -1))  # convert from 1-d arr to a 2-d arr with one col
        else:  # for binary categorical variables, the label binarizer uses just one var instead of two
            lb = preprocessing.LabelBinarizer()
            lb.fit(vals)
            vals = lb.transform(vals)

        # add to sensitive features dict
        if attr in SENSITIVE_ATTRS:
            x_control[attr] = vals

        # add to learnable features
        X = np.hstack((X, vals))

        if attr in CONT_VARIABLES:  # continuous feature, just append the name
            feature_names.append(attr)
        else:  # categorical features
            if vals.shape[1] == 1:  # binary features that passed through lib binarizer
                feature_names.append(attr)
            else:
                for k in lb.classes_:  # non-binary categorical features, need to add the names for each cat
                    feature_names.append(attr + "_" + str(k))

    # convert the sensitive feature to 1-d array
    x_control = dict(x_control)
    for k in x_control.keys():
        assert (x_control[k].shape[1] == 1)  # make sure that the sensitive feature is binary after one hot encoding
        x_control[k] = np.array(x_control[k]).flatten()

    # sys.exit(1)

    """ Permute the data randomly """
    perm = range(0, X.shape[0])
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    for k in x_control.keys():
        x_control[k] = x_control[k][perm]

    X = ut.add_intercept(X)

    feature_names = ["intercept"] + feature_names
    assert (len(feature_names) == X.shape[1])
    print "Features we will be using for classification are:", feature_names, "\n"

    return X, y, x_control
def test_adult_data():
    """ Load the adult data """
    X, y, x_control = load_adult_data(load_data_size=None)  # set the argument to None, or pass no argument, to test with the whole data -- we are subsampling for performance speedup
    ut.compute_p_rule(x_control["sex"], y)  # compute the p-rule in the original data

    """ Split the data into train and test """
    X = ut.add_intercept(X)  # add intercept to X before applying the linear classifier
    train_fold_size = 0.7
    x_train, y_train, x_control_train, x_test, y_test, x_control_test = ut.split_into_train_test(X, y, x_control, train_fold_size)

    apply_fairness_constraints = None
    apply_accuracy_constraint = None
    sep_constraint = None

    loss_function = lf._logistic_loss
    sensitive_attrs = ["sex"]
    sensitive_attrs_to_cov_thresh = {}
    gamma = None

    def train_test_classifier():
        w = ut.train_model(x_train, y_train, x_control_train, loss_function, apply_fairness_constraints,
                           apply_accuracy_constraint, sep_constraint, sensitive_attrs,
                           sensitive_attrs_to_cov_thresh, gamma)
        train_score, test_score, correct_answers_train, correct_answers_test = ut.check_accuracy(
            w, x_train, y_train, x_test, y_test, None, None)
        distances_boundary_test = (np.dot(x_test, w)).tolist()
        all_class_labels_assigned_test = np.sign(distances_boundary_test)
        correlation_dict_test = ut.get_correlations(None, None, all_class_labels_assigned_test,
                                                    x_control_test, sensitive_attrs)
        cov_dict_test = ut.print_covariance_sensitive_attrs(None, x_test, distances_boundary_test,
                                                            x_control_test, sensitive_attrs)
        p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test],
                                                    [cov_dict_test], sensitive_attrs[0])
        eq_op_acc, chance_bin_zero, chance_bin_one = ut.get_eq_op_acc(w, x_train, y_train, x_control_train, None)
        eq_odds_acc = ut.get_eq_odds_acc(w, x_train, y_train, x_control_train, None)
        pred_rate_par_acc = ut.get_pred_rate_par_acc(w, x_train, y_train, x_control_train, None)
        demo_par_acc_f_cons = ut.get_dem_par_acc(w, x_train, y_train, x_control_train, None)
        return w, p_rule, test_score, eq_op_acc, eq_odds_acc, pred_rate_par_acc, demo_par_acc_f_cons

    """ Classify the data while optimizing for accuracy """
    print()
    print("== Unconstrained (original) classifier ==")
    # all constraint flags are set to 0 since we want to train an unconstrained (original) classifier
    apply_fairness_constraints = 0
    apply_accuracy_constraint = 0
    sep_constraint = 0
    w_uncons, p_uncons, acc_uncons, eq_op_acc_uncons, eq_odds_acc_uncons, pred_rate_par_acc_uncons, demo_par_acc_uncons = train_test_classifier()

    temp_eq_op_acc_f = []
    temp_eq_odds_acc_f = []
    temp_pred_rate_par_acc_f = []
    temp_demo_par_acc_f = []

    """ Now classify such that we optimize for accuracy while achieving perfect fairness """
    apply_fairness_constraints = 1  # set this flag to one since we want to optimize accuracy subject to fairness constraints
    apply_accuracy_constraint = 0
    sep_constraint = 0
    for num in np.arange(0, 0.51, 0.1):
        sensitive_attrs_to_cov_thresh = {"sex": num}
        print()
        print("== Classifier with fairness constraint, cov: ", num, " ==")
        w_f_cons, p_f_cons, acc_f_cons, eq_op_acc_f_cons, eq_odds_acc_f_cons, pred_rate_par_acc_f_cons, demo_par_acc_f_cons = train_test_classifier()
        temp_eq_op_acc_f.append(eq_op_acc_f_cons)
        temp_eq_odds_acc_f.append(eq_odds_acc_f_cons)
        temp_pred_rate_par_acc_f.append(pred_rate_par_acc_f_cons)
        temp_demo_par_acc_f.append(demo_par_acc_f_cons)

    sensitive_attrs_to_cov_thresh = {"sex": 1}
    print()
    print("== Classifier with fairness constraint, cov: 1 ==")
    w_f_cons, p_f_cons, acc_f_cons, eq_op_acc_f_cons, eq_odds_acc_f_cons, pred_rate_par_acc_f_cons, demo_par_acc_f_cons = train_test_classifier()
    temp_eq_op_acc_f.append(eq_op_acc_f_cons)
    temp_eq_odds_acc_f.append(eq_odds_acc_f_cons)
    temp_pred_rate_par_acc_f.append(pred_rate_par_acc_f_cons)
    temp_demo_par_acc_f.append(demo_par_acc_f_cons)

    return eq_op_acc_uncons, eq_odds_acc_uncons, pred_rate_par_acc_uncons, demo_par_acc_uncons, temp_eq_op_acc_f, temp_eq_odds_acc_f, temp_pred_rate_par_acc_f, temp_demo_par_acc_f
def generate_synthetic_data(data_type, plot_data=False):
    """
    Code for generating the synthetic data.
    We will have two non-sensitive features and one sensitive feature.
    Non-sensitive features will be drawn from a 2D gaussian distribution.
    The sensitive feature specifies the demographic group of the data point and can take values 0 and 1.
    The code will generate data such that a classifier optimizing for accuracy will lead to disparate
    misclassification rates for the two demographic groups.
    You can generate different data configurations using different values for the "data_type" parameter.
    """
    n_samples = 1000  # generate these many data points per cluster

    def gen_gaussian_diff_size(mean_in, cov_in, z_val, class_label, n):
        """
        mean_in: mean of the gaussian cluster
        cov_in: covariance matrix
        z_val: sensitive feature value
        class_label: +1 or -1
        n: number of points
        """
        nv = multivariate_normal(mean=mean_in, cov=cov_in)
        X = nv.rvs(n)
        y = np.ones(n, dtype=float) * class_label
        z = np.ones(n, dtype=float) * z_val  # all the points in this cluster get this value of the sensitive attribute
        return nv, X, y, z

    if data_type == 1:
        """
        Generate data such that a classifier optimizing for accuracy will have disparate false positive
        rates as well as disparate false negative rates for both groups.
        """
        cc = [[10, 1], [1, 4]]
        mu1, sigma1 = [2, 3], cc  # z=1, +
        cc = [[5, 2], [2, 5]]
        mu2, sigma2 = [1, 2], cc  # z=0, +
        cc = [[5, 1], [1, 5]]
        mu3, sigma3 = [-5, 0], cc  # z=1, -
        cc = [[7, 1], [1, 7]]
        mu4, sigma4 = [0, -1], cc  # z=0, -

        nv1, X1, y1, z1 = gen_gaussian_diff_size(mu1, sigma1, 1, +1, int(n_samples * 1))  # z=1, +
        nv2, X2, y2, z2 = gen_gaussian_diff_size(mu2, sigma2, 0, +1, int(n_samples * 1))  # z=0, +
        nv3, X3, y3, z3 = gen_gaussian_diff_size(mu3, sigma3, 1, -1, int(n_samples * 1))  # z=1, -
        nv4, X4, y4, z4 = gen_gaussian_diff_size(mu4, sigma4, 0, -1, int(n_samples * 1))  # z=0, -

    elif data_type == 2:
        """
        Generate data such that a classifier optimizing for accuracy will have disparate false positive
        rates for both groups but will have equal false negative rates.
        """
        cc = [[3, 1], [1, 3]]
        mu1, sigma1 = [2, 2], cc  # z=1, +
        mu2, sigma2 = [2, 2], cc  # z=0, +
        mu3, sigma3 = [-2, -2], cc  # z=1, -
        cc = [[3, 3], [1, 3]]
        mu4, sigma4 = [-1, 0], cc  # z=0, -

        nv1, X1, y1, z1 = gen_gaussian_diff_size(mu1, sigma1, 1, +1, int(n_samples * 1))  # z=1, +
        nv2, X2, y2, z2 = gen_gaussian_diff_size(mu2, sigma2, 0, +1, int(n_samples * 1))  # z=0, +
        nv3, X3, y3, z3 = gen_gaussian_diff_size(mu3, sigma3, 1, -1, int(n_samples * 1))  # z=1, -
        nv4, X4, y4, z4 = gen_gaussian_diff_size(mu4, sigma4, 0, -1, int(n_samples * 1))  # z=0, -

    # merge the clusters
    X = np.vstack((X1, X2, X3, X4))
    y = np.hstack((y1, y2, y3, y4))
    x_control = np.hstack((z1, z2, z3, z4))

    # shuffle the data
    perm = list(range(len(X)))
    shuffle(perm)
    X = X[perm]
    y = y[perm]
    x_control = x_control[perm]

    """ Plot the data """
    if plot_data:
        plt.figure()
        num_to_draw = 200  # we will only draw a small number of points to avoid clutter
        x_draw = X[:num_to_draw]
        y_draw = y[:num_to_draw]
        x_control_draw = x_control[:num_to_draw]

        X_s_0 = x_draw[x_control_draw == 0.0]
        X_s_1 = x_draw[x_control_draw == 1.0]
        y_s_0 = y_draw[x_control_draw == 0.0]
        y_s_1 = y_draw[x_control_draw == 1.0]

        plt.scatter(X_s_0[y_s_0 == 1.0][:, 0], X_s_0[y_s_0 == 1.0][:, 1], color='green', marker='x', s=60, linewidth=2, label="group-0 +ve")
        plt.scatter(X_s_0[y_s_0 == -1.0][:, 0], X_s_0[y_s_0 == -1.0][:, 1], color='red', marker='x', s=60, linewidth=2, label="group-0 -ve")
        plt.scatter(X_s_1[y_s_1 == 1.0][:, 0], X_s_1[y_s_1 == 1.0][:, 1], color='green', marker='o', facecolors='none', s=60, linewidth=2, label="group-1 +ve")
        plt.scatter(X_s_1[y_s_1 == -1.0][:, 0], X_s_1[y_s_1 == -1.0][:, 1], color='red', marker='o', facecolors='none', s=60, linewidth=2, label="group-1 -ve")

        plt.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off')  # don't need the ticks to see the data distribution
        plt.tick_params(axis='y', which='both', left='off', right='off', labelleft='off')
        plt.legend(loc=2, fontsize=21)
        plt.ylim((-8, 12))

        plt.savefig("img/data.png")
        plt.show()

    x_control = {"s1": x_control}  # all the sensitive features are stored in a dictionary
    X = ut.add_intercept(X)

    return X, y, x_control
def test_adult_data():
    """ Load the adult data """
    X, y, x_control = load_adult_data(load_data_size=10000)  # set the argument to None, or pass no argument, to test with the whole data -- we are subsampling for performance speedup
    ut.compute_p_rule(x_control["sex"], y)  # compute the p-rule in the original data

    """ Split the data into train and test """
    X = ut.add_intercept(X)  # add intercept to X before applying the linear classifier
    train_fold_size = 0.7
    x_train, y_train, x_control_train, x_test, y_test, x_control_test = ut.split_into_train_test(X, y, x_control, train_fold_size)

    apply_fairness_constraints = None
    apply_accuracy_constraint = None
    sep_constraint = None

    loss_function = lf._logistic_loss
    sensitive_attrs = ["sex"]
    sensitive_attrs_to_cov_thresh = {}
    gamma = None

    def train_test_classifier():
        w = ut.train_model(x_train, y_train, x_control_train, loss_function, apply_fairness_constraints,
                           apply_accuracy_constraint, sep_constraint, sensitive_attrs,
                           sensitive_attrs_to_cov_thresh, gamma)
        train_score, test_score, correct_answers_train, correct_answers_test = ut.check_accuracy(w, x_train, y_train, x_test, y_test, None, None)
        distances_boundary_test = (np.dot(x_test, w)).tolist()
        all_class_labels_assigned_test = np.sign(distances_boundary_test)
        correlation_dict_test = ut.get_correlations(None, None, all_class_labels_assigned_test, x_control_test, sensitive_attrs)
        cov_dict_test = ut.print_covariance_sensitive_attrs(None, x_test, distances_boundary_test, x_control_test, sensitive_attrs)
        p_rule = ut.print_classifier_fairness_stats([test_score], [correlation_dict_test], [cov_dict_test], sensitive_attrs[0])
        return w, p_rule, test_score

    """ Classify the data while optimizing for accuracy """
    print
    print "== Unconstrained (original) classifier =="
    # all constraint flags are set to 0 since we want to train an unconstrained (original) classifier
    apply_fairness_constraints = 0
    apply_accuracy_constraint = 0
    sep_constraint = 0
    w_uncons, p_uncons, acc_uncons = train_test_classifier()

    """ Now classify such that we optimize for accuracy while achieving perfect fairness """
    apply_fairness_constraints = 1  # set this flag to one since we want to optimize accuracy subject to fairness constraints
    apply_accuracy_constraint = 0
    sep_constraint = 0
    sensitive_attrs_to_cov_thresh = {"sex": 0}
    print
    print "== Classifier with fairness constraint =="
    w_f_cons, p_f_cons, acc_f_cons = train_test_classifier()

    """ Classify such that we optimize for fairness subject to a certain loss in accuracy """
    apply_fairness_constraints = 0  # flag for fairness constraint is set back to 0 since we want to apply the accuracy constraint now
    apply_accuracy_constraint = 1  # now, we want to optimize fairness subject to accuracy constraints
    sep_constraint = 0
    gamma = 0.5  # gamma controls how much loss in accuracy we are willing to incur to achieve fairness -- increase gamma to allow more loss in accuracy
    print "== Classifier with accuracy constraint =="
    w_a_cons, p_a_cons, acc_a_cons = train_test_classifier()

    """
    Classify such that we optimize for fairness subject to a certain loss in accuracy.
    In addition, make sure that no points classified as positive by the unconstrained (original) classifier are misclassified.
    """
    apply_fairness_constraints = 0  # flag for fairness constraint is set back to 0 since we want to apply the accuracy constraint now
    apply_accuracy_constraint = 1  # now, we want to optimize accuracy subject to fairness constraints
    sep_constraint = 1  # in addition to the accuracy constraint, we also want no misclassifications for certain points (details in demo README.md)
    gamma = 1000.0
    print "== Classifier with accuracy constraint (no +ve misclassification) =="
    w_a_cons_fine, p_a_cons_fine, acc_a_cons_fine = train_test_classifier()

    return
# print X_train[0:3,0:3]

# standardize data
X_train, scaler = standardize(X_train)
X_test, _ = standardize(X_test, scaler)

# one dimension at a time
y_train = y_train[:, 0]
y_test = y_test[:, 0]

print X_train.shape, y_train.shape, X_test.shape, y_test.shape
tst_song = len(song_id_tst)

# add column of ones to the data to account for the bias:
X_train = add_intercept(X_train)
print X_train.shape
# print X_train[0:10]

data = dict(x=X_train, y=y_train)

with Model() as model:
    # specify the glm and pass in the data. The resulting linear model, its likelihood
    # and all its parameters are automatically added to our model.
    glm.glm('y ~ x', data)
    start = find_MAP()
    step = NUTS(scaling=start)  # instantiate the MCMC sampling algorithm
    trace = sample(100, step, progressbar=False)  # draw posterior samples using NUTS sampling
def solve(self,
          x: Union[np.ndarray, torch.Tensor],
          y: Union[np.ndarray, torch.Tensor],
          lam: Union[float, int],
          group_size: Union[int, List[int]],
          max_iters: int = 1000,
          weight: List[Union[int, List[int]]] = None,
          smooth: Union[float, int] = 0,
          recompute_hg: bool = True,
          beta_warm: torch.Tensor = None,
          weight_multiplied: bool = False) -> torch.Tensor:
    """
    fits the model with a user-specified lambda

    :param x: the design matrix
    :param y: the response
    :param lam: the lambda for group lasso
    :param group_size: list of group sizes, or a single group size if all groups are of the same size
    :param weight: feature weights
    :param max_iters: the maximum number of iterations
    :param smooth: smoothness parameter
    :param recompute_hg: whether to recompute hg
    :param beta_warm: warm start of beta
    :return: coefficients
    """
    if isinstance(group_size, int):
        group_size = [group_size] * (x.shape[1] // group_size)
    assert np.sum(group_size) == x.shape[1], \
        f"Sum of group sizes {sum(group_size)} does not match the number of variables {x.shape[1]}."
    assert lam >= 0, "Tuning parameter lam must be non-negative."

    """ initialize parameters """
    self.smoothness_penalty = smooth
    x = numpy_to_torch(x)
    y = numpy_to_torch(y)
    x, y = check_xy(x, y)
    x, group_size = add_intercept(x, group_size)
    if weight is None:
        weight = [1] * len(group_size)
    if not weight_multiplied:
        weights = [np.sqrt(group_size[i]) * weight[i] for i in range(len(group_size))]
    else:
        weights = weight[:]
    x1 = x.clone()
    # x1, self.R = self.group_orthogonalization(x, group_size)
    beta, error, iters, loss = self.initialize(group_size)
    if beta_warm is not None and beta_warm.shape == beta.shape:
        beta = beta_warm
    intercept_err = np.inf
    beta_old = beta.clone()
    num_groups = len(group_size)
    hg = None

    """ start iterations """
    while (error > self.tol or intercept_err > self.tol) and iters <= max_iters:
        iters += 1
        for g in range(num_groups):
            group_idx_start, group_idx_end = self.find_group_index(group_size, g)
            if recompute_hg or hg is None or g <= 2:
                hg = self.compute_hg(x1, y, beta, group_idx_start, group_idx_end)
            derivative = self.compute_grad(x1, y, beta)
            if g == 0:
                d = self.compute_d(False, derivative, beta, lam, group_idx_start, group_idx_end, hg)
                alpha = self.line_search(x1, y, beta, d, group_size, g, lam)
                beta = beta + alpha * d
            else:
                beta[group_idx_start:group_idx_end] = self.close_form_QM(beta, derivative, hg, lam,
                                                                         group_idx_start, group_idx_end,
                                                                         weights[g], smooth)
        error = torch.norm(beta[1:] - beta_old[1:])
        intercept_err = abs(beta[0].detach().numpy() - beta_old[0].detach().numpy())
        beta_old = beta.clone()
        # print(f"error is {error}")
        # print(iters)

    # beta = self.group_orthogonalization_inverse(beta, self.R, group_size)
    return beta
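# The block update in close_form_QM above is specific to this solver and is not shown
# here. For reference, the group soft-thresholding (proximal) operator that standard
# group-lasso block-coordinate updates are typically built on looks like the sketch
# below; this is plain NumPy and is not the repository's implementation.
import numpy as np

def group_soft_threshold(z, lam):
    """Shrink a whole coefficient group toward zero; the group is zeroed out
    entirely when its Euclidean norm falls below the threshold lam."""
    norm = np.linalg.norm(z)
    if norm <= lam:
        return np.zeros_like(z)
    return (1.0 - lam / norm) * z

print(group_soft_threshold(np.array([3.0, 4.0]), 1.0))  # scaled down to norm 4
print(group_soft_threshold(np.array([0.3, 0.4]), 1.0))  # whole group set to zero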
# s = s + '%g '%X_train[i,feat]
# infile.write('%s\n'%s)

# one dimension at a time
y_train = y_train[:, 0]
y_test = y_test[:, 0]

X_train = X_train[:, [10, 12, 13, 17, 19, 82, 83, 84, 85, 89, 90, 91, 103, 140, 142, 146, 148, 212, 214, 218, 220]]
X_test = X_test[:, [10, 12, 13, 17, 19, 82, 83, 84, 85, 89, 90, 91, 103, 140, 142, 146, 148, 212, 214, 218, 220]]

print X_train.shape, y_train.shape, X_test.shape, y_test.shape
tst_song = len(song_id_tst)

# add column of ones to the data to account for the bias:
X_train = add_intercept(X_train)
print X_train.shape
# print X_train[0:10]

# Theano symbolic definitions
X = T.vector()
Y = T.scalar()
lr = T.scalar("learning rate")
regul = T.scalar("L2 regul. coeff")

def model(X, w):
    return T.dot(X, w)

nb_features = X_train.shape[1]
def generate_synthetic_data(data_type, plot_data=False): """ Code for generating the synthetic data. We will have two non-sensitive features and one sensitive feature. Non sensitive features will be drawn from a 2D gaussian distribution. Sensitive feature specifies the demographic group of the data point and can take values 0 and 1. The code will generate data such that a classifier optimizing for accuracy will lead to disparate misclassification rates for the two demographic groups. You can generate different data configurations using different values for the "data_type" parameter. """ n_samples = 1000 # generate these many data points per cluster def gen_gaussian_diff_size(mean_in, cov_in, z_val, class_label, n): """ mean_in: mean of the gaussian cluster cov_in: covariance matrix z_val: sensitive feature value class_label: +1 or -1 n: number of points """ nv = multivariate_normal(mean = mean_in, cov = cov_in) X = nv.rvs(n) y = np.ones(n, dtype=float) * class_label z = np.ones(n, dtype=float) * z_val # all the points in this cluster get this value of the sensitive attribute return nv, X, y, z if data_type == 1: """ Generate data such that a classifier optimizing for accuracy will have disparate false positive rates as well as disparate false negative rates for both groups. """ cc = [[10,1], [1,4]] mu1, sigma1 = [2, 3], cc # z=1, + cc = [[5,2], [2,5]] mu2, sigma2 = [1, 2], cc # z=0, + cc = [[5, 1], [1, 5]] mu3, sigma3 = [-5,0], cc # z=1, - cc = [[7, 1], [1, 7]] mu4, sigma4 = [0,-1], cc # z=0, - nv1, X1, y1, z1 = gen_gaussian_diff_size(mu1, sigma1, 1, +1, int(n_samples * 1) ) # z=1, + nv2, X2, y2, z2 = gen_gaussian_diff_size(mu2, sigma2, 0, +1, int(n_samples * 1) ) # z=0, + nv3, X3, y3, z3 = gen_gaussian_diff_size(mu3, sigma3, 1, -1, int(n_samples * 1) ) # z=1, - nv4, X4, y4, z4 = gen_gaussian_diff_size(mu4, sigma4, 0, -1, int(n_samples * 1) ) # z=0, - elif data_type == 2: """ Generate data such that a classifier optimizing for accuracy will have disparate false positive rates for both groups but will have equal false negative rates. 
""" cc = [[3,1], [1,3]] mu1, sigma1 = [2, 2], cc # z=1, + mu2, sigma2 = [2, 2], cc # z=0, + mu3, sigma3 = [-2,-2], cc # z=1, - cc = [[3,3], [1,3]] mu4, sigma4 = [-1,0], cc # z=0, - nv1, X1, y1, z1 = gen_gaussian_diff_size(mu1, sigma1, 1, +1, int(n_samples * 1) ) # z=1, + nv2, X2, y2, z2 = gen_gaussian_diff_size(mu2, sigma2, 0, +1, int(n_samples * 1) ) # z=0, + nv3, X3, y3, z3 = gen_gaussian_diff_size(mu3, sigma3, 1, -1, int(n_samples * 1) ) # z=1, - nv4, X4, y4, z4 = gen_gaussian_diff_size(mu4, sigma4, 0, -1, int(n_samples * 1) ) # z=0, - # merge the clusters X = np.vstack((X1, X2, X3, X4)) y = np.hstack((y1, y2, y3, y4)) x_control = np.hstack((z1, z2, z3, z4)) # shuffle the data perm = range(len(X)) shuffle(perm) X = X[perm] y = y[perm] x_control = x_control[perm] """ Plot the data """ if plot_data: plt.figure() num_to_draw = 200 # we will only draw a small number of points to avoid clutter x_draw = X[:num_to_draw] y_draw = y[:num_to_draw] x_control_draw = x_control[:num_to_draw] X_s_0 = x_draw[x_control_draw == 0.0] X_s_1 = x_draw[x_control_draw == 1.0] y_s_0 = y_draw[x_control_draw == 0.0] y_s_1 = y_draw[x_control_draw == 1.0] plt.scatter(X_s_0[y_s_0==1.0][:, 0], X_s_0[y_s_0==1.0][:, 1], color='green', marker='x', s=60, linewidth=2, label= "group-0 +ve") plt.scatter(X_s_0[y_s_0==-1.0][:, 0], X_s_0[y_s_0==-1.0][:, 1], color='red', marker='x', s=60, linewidth=2, label = "group-0 -ve") plt.scatter(X_s_1[y_s_1==1.0][:, 0], X_s_1[y_s_1==1.0][:, 1], color='green', marker='o', facecolors='none', s=60, linewidth=2, label = "group-1 +ve") plt.scatter(X_s_1[y_s_1==-1.0][:, 0], X_s_1[y_s_1==-1.0][:, 1], color='red', marker='o', facecolors='none', s=60, linewidth=2, label = "group-1 -ve") plt.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off') # dont need the ticks to see the data distribution plt.tick_params(axis='y', which='both', left='off', right='off', labelleft='off') plt.legend(loc=2, fontsize=21) plt.ylim((-8,12)) plt.savefig("img/data.png") plt.show() x_control = {"s1": x_control} # all the sensitive features are stored in a dictionary X = ut.add_intercept(X) return X,y,x_control
                            212, 214, 218, 220]]
# X_train = X_train[:,[13,85,103,142,214]]
# X_test = X_test[:,[13,85,103,142,214]]

# one dimension at a time
# 0: arousal, 1: valence
y_train = y_train[:, 0]
y_test = y_test[:, 0]

print X_train.shape, y_train.shape, X_test.shape, y_test.shape
tst_song = len(song_id_tst)

# add column of ones to the data to account for the bias:
X_train = add_intercept(X_train)
print X_train.shape
# print X_train[0:10]

# Theano symbolic definitions
X = T.vector()
Y = T.scalar()
lr = T.scalar('learning rate')
regul = T.scalar('L2 regul. coeff')

def model(X, w):
    return T.tanh(T.dot(X, w))
    # return 2.0*T.nnet.sigmoid(T.dot(X, w)) - 1.0
    # return T.erf(T.dot(X,w))
# Prob > F        =     0.0000
# R-squared       =     0.7862
# Adj R-squared   =     0.7679
# Within R-sq.    =     0.7586
# Root MSE        =     6.7671
#
# ------------------------------------------------------------------------------
#       rating |      Coef.   Std. Err.      t    P>|t|     [95% Conf. Interval]
# -------------+----------------------------------------------------------------
#          fat |  -5.684196   .8801468    -6.46   0.000    -7.439594   -3.928799
#      protein |   3.740386   .8430319     4.44   0.000     2.059012     5.42176
#        carbo |  -.7892276   .2041684    -3.87   0.000    -1.196429   -.3820266
#       sugars |   -2.03286   .2179704    -9.33   0.000    -2.467588   -1.598132
#        _cons |   64.49503    4.92674    13.09   0.000     54.66896     74.3211
# ------------------------------------------------------------------------------
#
# Absorbed degrees of freedom:
# -----------------------------------------------------+
#  Absorbed FE | Categories - Redundant  =  Num. Coefs |
# -------------+---------------------------------------|
#        shelf |          3           0           3    |
# -----------------------------------------------------+

residualized = algo.residualize(
    get_np_columns(df, ['rating', 'fat', 'protein', 'carbo', 'sugars'], False))
model = sm.OLS(residualized[:, 0], add_intercept(residualized[:, [1, 2, 3, 4]]))
results = model.fit()
print("rating ~ fat + protein + carbo + sugars, absorb(shelf)")
print(results.summary())
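# The add_intercept call above is assumed to simply prepend a constant column before
# fitting OLS on the residualized columns (statsmodels does not add one automatically).
# A minimal, self-contained sketch of that pattern; the helper and the data below are
# illustrative assumptions, not the code used above.
import numpy as np
import statsmodels.api as sm

def add_intercept_example(x):
    # assumed behaviour: prepend a column of ones so OLS estimates a bias term
    x = np.atleast_2d(x)
    if x.shape[0] == 1:
        x = x.T
    return np.hstack([np.ones((x.shape[0], 1)), x])

rng = np.random.default_rng(0)
x_demo = rng.normal(size=(100, 2))
y_demo = 1.5 + x_demo @ np.array([2.0, -1.0]) + rng.normal(scale=0.1, size=100)
print(sm.OLS(y_demo, add_intercept_example(x_demo)).fit().params)  # roughly [1.5, 2.0, -1.0]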