    'init_sequential_rounding_max_runtime': 10.0,   # max runtime for SeqRd in initialization procedure
    'init_sequential_rounding_max_solutions': 5,    # max solutions to round using SeqRd
    # 'init_polishing_after': True,                 # polish after rounding
    'init_polishing_max_runtime': 30.0,             # max runtime for polishing
    'init_polishing_max_solutions': 5,              # max solutions to polish
    #
    # CPLEX Solver Parameters
    'cplex_randomseed': 0,                          # random seed
    'cplex_mipemphasis': 0,                         # cplex MIP strategy
}

# train model using lattice CPA
model_info, mip_info, lcpa_info = run_lattice_cpa(data, constraints, settings)

# model_info contains key results
pprint(model_info)
print_model(model_info['solution'], data)

# mip_info contains information to access the MIP
mip_info['risk_slim_mip']  # CPLEX mip
mip_info['risk_slim_idx']  # indices of the relevant constraints

# lcpa_info contains detailed information about LCPA
pprint(lcpa_info)
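## For reference, a minimal sketch of how the `data` and `constraints` arguments
## consumed by run_lattice_cpa above can be assembled. It mirrors the pattern used
## in fit() further down; the coefficient bounds and L0 limit are illustrative
## placeholders, not this project's tuned values.
import numpy as np
from riskslim import CoefficientSet

def make_riskslim_inputs(X, y, variable_names, outcome_name):
    Y = np.where(y <= 0, -1, 1).reshape(-1, 1)        # riskslim expects labels in {-1, +1}
    X = np.insert(X, 0, np.ones(X.shape[0]), axis=1)  # explicit intercept column
    data = {
        'X': X,
        'Y': Y,
        'variable_names': ['(Intercept)'] + list(variable_names),
        'outcome_name': outcome_name,
        'sample_weights': np.ones(X.shape[0]),
    }
    coef_set = CoefficientSet(variable_names=data['variable_names'],
                              lb=-5, ub=5, sign=0)    # integer coefficients in [-5, 5]
    constraints = {'L0_min': 0, 'L0_max': 5, 'coef_set': coef_set}
    return data, constraints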
def risk_cv(KY_x, KY_y, FL_x, FL_y, y_label, max_coef, max_coef_number,
            max_offset, max_runtime, c, seed):

    FL_score = []
    KY_score = []
    KY_validation = []

    ## set up basic values
    cols = KY_x.columns.tolist()
    sample_weights = np.repeat(1, len(KY_y))
    FL_x = FL_x.values
    FL_y[FL_y == -1] = 0  ## change -1 to 0

    ## cross validation set up
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    inner_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

    for outer_train, test in outer_cv.split(KY_x, KY_y):

        ## split train & test
        outer_train_x, outer_train_y = KY_x.iloc[outer_train], KY_y[outer_train]
        outer_test_x, outer_test_y = KY_x.iloc[test], KY_y[test]
        outer_train_sample_weights = sample_weights[outer_train]

        ## inner loop
        for inner_train, validation in inner_cv.split(outer_train_x, outer_train_y):

            ## split inner train & validation
            inner_train_x, inner_train_y = outer_train_x.iloc[inner_train].values, outer_train_y[inner_train]
            validation_x, validation_y = outer_train_x.iloc[validation].values, outer_train_y[validation]
            inner_train_sample_weights = outer_train_sample_weights[inner_train]
            validation_sample_weights = outer_train_sample_weights[validation]
            inner_train_y = inner_train_y.reshape(-1, 1)

            ## new data
            new_train_data = {
                'X': inner_train_x,
                'Y': inner_train_y,
                'variable_names': cols,
                'outcome_name': y_label,
                'sample_weights': inner_train_sample_weights
            }

            ## modeling
            model_info, mip_info, lcpa_info = risk_slim(new_train_data,
                                                        max_coefficient=max_coef,
                                                        max_L0_value=max_coef_number,
                                                        c0_value=c,
                                                        max_runtime=max_runtime,
                                                        max_offset=max_offset)

            ## check validation auc
            validation_x = validation_x[:, 1:]     ## remove the first column, which is the intercept
            validation_y[validation_y == -1] = 0   ## change -1 to 0
            validation_prob = riskslim_prediction(validation_x, np.array(cols), model_info)
            KY_validation.append(roc_auc_score(validation_y, validation_prob))

        ## outer loop
        outer_train_x = outer_train_x.values
        outer_train_y = outer_train_y.reshape(-1, 1)

        ## new data
        new_train_data = {
            'X': outer_train_x,
            'Y': outer_train_y,
            'variable_names': cols,
            'outcome_name': y_label,
            'sample_weights': outer_train_sample_weights
        }

        ## fit the model
        model_info, mip_info, lcpa_info = risk_slim(new_train_data,
                                                    max_coefficient=max_coef,
                                                    max_L0_value=max_coef_number,
                                                    c0_value=c,
                                                    max_runtime=max_runtime,
                                                    max_offset=max_offset)
        print_model(model_info['solution'], new_train_data)

        ## FL score
        FL_prob = riskslim_prediction(FL_x, np.array(cols), model_info).reshape(-1, 1)
        FL_score.append(roc_auc_score(FL_y, FL_prob))

        ## KY score
        outer_test_x = outer_test_x.values[:, 1:]  ## remove the intercept column
        outer_test_y[outer_test_y == -1] = 0       ## change -1 to 0
        KY_prob = riskslim_prediction(outer_test_x, np.array(cols), model_info).reshape(-1, 1)
        KY_score.append(roc_auc_score(outer_test_y, KY_prob))

    return {'FL_score': FL_score,
            'KY_score': KY_score,
            'KY_validation': KY_validation}
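## Example call for the transfer experiment above; assumes KY_x / FL_x are
## DataFrames whose first column is '(Intercept)' and KY_y / FL_y are labeled
## in {-1, +1}. The outcome name and all hyperparameter values below are
## placeholders, not the study's settings.
res = risk_cv(KY_x, KY_y, FL_x, FL_y, y_label='general_two_year',
              max_coef=5, max_coef_number=5, max_offset=100,
              max_runtime=60.0, c=1e-6, seed=42)
print('mean KY validation AUC: %.3f' % np.mean(res['KY_validation']))
print('mean KY test AUC:       %.3f' % np.mean(res['KY_score']))
print('mean FL transfer AUC:   %.3f' % np.mean(res['FL_score']))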
def risk_nested_cv_constrain(X, Y, indicator, y_label, max_coef,
                             max_coef_number, max_runtime, max_offset, c, seed):

    ## set up data
    sample_weights = np.repeat(1, len(Y))

    ## set up cross validation
    outer_cv = KFold(n_splits=5, random_state=seed, shuffle=True)
    inner_cv = KFold(n_splits=5, random_state=seed, shuffle=True)

    train_auc = []
    validation_auc = []
    test_auc = []
    holdout_with_attrs_test = []
    holdout_probability = []
    holdout_prediction = []
    holdout_y = []
    confusion_matrix_rets = []
    calibrations = []
    race_auc = []
    condition_pn = []
    no_condition_pn = []

    i = 0
    for outer_train, outer_test in outer_cv.split(X, Y):

        outer_train_x, outer_train_y = X.iloc[outer_train], Y[outer_train]
        outer_test_x, outer_test_y = X.iloc[outer_test], Y[outer_test]
        outer_train_sample_weight, outer_test_sample_weight = sample_weights[outer_train], sample_weights[outer_test]

        ## holdout test set, keeping protected attributes for the fairness metrics
        holdout_with_attrs = outer_test_x.copy().drop(['(Intercept)'], axis=1)
        holdout_with_attrs = holdout_with_attrs.rename(columns={'sex1': 'sex'})

        ## remove features that are not used in modeling
        if indicator == 1:
            outer_train_x = outer_train_x.drop(
                ['person_id', 'screening_date', 'race',
                 'age_at_current_charge', 'p_charges'], axis=1)
            outer_test_x = outer_test_x.drop(
                ['person_id', 'screening_date', 'race',
                 'age_at_current_charge', 'p_charges'], axis=1)
        else:
            outer_train_x = outer_train_x.drop(
                ['person_id', 'screening_date', 'race', 'sex1',
                 'age_at_current_charge', 'p_charges'], axis=1)
            outer_test_x = outer_test_x.drop(
                ['person_id', 'screening_date', 'race', 'sex1',
                 'age_at_current_charge', 'p_charges'], axis=1)

        cols = outer_train_x.columns.tolist()

        ## inner cross validation
        for inner_train, validation in inner_cv.split(outer_train_x, outer_train_y):

            ## subset train data & store validation data
            inner_train_x, inner_train_y = outer_train_x.iloc[inner_train].values, outer_train_y[inner_train]
            validation_x, validation_y = outer_train_x.iloc[validation].values, outer_train_y[validation]
            inner_train_sample_weight = outer_train_sample_weight[inner_train]
            validation_sample_weight = outer_train_sample_weight[validation]
            inner_train_y = inner_train_y.reshape(-1, 1)

            ## create new data dictionary
            new_train_data = {
                'X': inner_train_x,
                'Y': inner_train_y,
                'variable_names': cols,
                'outcome_name': y_label,
                'sample_weights': inner_train_sample_weight
            }

            ## fit the model
            model_info, mip_info, lcpa_info = risk_slim_constrain(
                new_train_data,
                max_coefficient=max_coef,
                max_L0_value=max_coef_number,
                c0_value=c,
                max_runtime=max_runtime,
                max_offset=max_offset)

            ## check validation auc
            validation_x = validation_x[:, 1:]     ## remove the first column, which is the intercept
            validation_y[validation_y == -1] = 0   ## change -1 to 0
            validation_prob = riskslim_prediction(validation_x, np.array(cols), model_info)
            validation_auc.append(roc_auc_score(validation_y, validation_prob))

        ## outer loop
        outer_train_x = outer_train_x.values
        outer_test_x = outer_test_x.values
        outer_train_y = outer_train_y.reshape(-1, 1)
        new_train_data = {
            'X': outer_train_x,
            'Y': outer_train_y,
            'variable_names': cols,
            'outcome_name': y_label,
            'sample_weights': outer_train_sample_weight
        }

        ## fit the model
        model_info, mip_info, lcpa_info = risk_slim_constrain(
            new_train_data,
            max_coefficient=max_coef,
            max_L0_value=max_coef_number,
            c0_value=c,
            max_runtime=max_runtime,
            max_offset=max_offset)
        print_model(model_info['solution'], new_train_data)

        ## change data format
        outer_train_x, outer_test_x = outer_train_x[:, 1:], outer_test_x[:, 1:]  ## remove the intercept column
        outer_train_y[outer_train_y == -1] = 0  ## change -1 to 0
        outer_test_y[outer_test_y == -1] = 0    ## change -1 to 0

        ## probability & accuracy
        outer_train_prob = riskslim_prediction(outer_train_x, np.array(cols), model_info).reshape(-1, 1)
        outer_test_prob = riskslim_prediction(outer_test_x, np.array(cols), model_info)
        outer_test_pred = (outer_test_prob > 0.5)

        ## AUC
        train_auc.append(roc_auc_score(outer_train_y, outer_train_prob))
        test_auc.append(roc_auc_score(outer_test_y, outer_test_prob))

        ## confusion matrix
        confusion_matrix_fairness = compute_confusion_matrix_stats(
            df=holdout_with_attrs,
            preds=outer_test_pred,
            labels=outer_test_y,
            protected_variables=["sex", "race"])
        cf_final = confusion_matrix_fairness.assign(
            fold_num=[i] * confusion_matrix_fairness['Attribute'].count())
        confusion_matrix_rets.append(cf_final)

        ## calibration matrix
        calibration = compute_calibration_fairness(
            df=holdout_with_attrs,
            probs=outer_test_prob,
            labels=outer_test_y,
            protected_variables=["sex", "race"])
        calibration_final = calibration.assign(
            fold_num=[i] * calibration['Attribute'].count())
        calibrations.append(calibration_final)

        ## race auc
        try:
            race_auc_matrix = fairness_in_auc(
                df=holdout_with_attrs,
                probs=outer_test_prob,
                labels=outer_test_y)
            race_auc_matrix_final = race_auc_matrix.assign(
                fold_num=[i] * race_auc_matrix['Attribute'].count())
            race_auc.append(race_auc_matrix_final)
        except Exception:
            pass

        ## ebm_pn: balance of positives and negatives, unconditional
        no_condition_pn_matrix = balance_positive_negative(
            df=holdout_with_attrs,
            probs=outer_test_prob,
            labels=outer_test_y)
        no_condition_pn_matrix_final = no_condition_pn_matrix.assign(
            fold_num=[i] * no_condition_pn_matrix['Attribute'].count())
        no_condition_pn.append(no_condition_pn_matrix_final)

        ## ebm_condition_pn: balance of positives and negatives, conditional
        condition_pn_matrix = conditional_balance_positive_negative(
            df=holdout_with_attrs,
            probs=outer_test_prob,
            labels=outer_test_y)
        condition_pn_matrix_final = condition_pn_matrix.assign(
            fold_num=[i] * condition_pn_matrix['Attribute'].count())
        condition_pn.append(condition_pn_matrix_final)

        ## store results
        holdout_with_attrs_test.append(holdout_with_attrs)
        holdout_probability.append(outer_test_prob)
        holdout_prediction.append(outer_test_pred)
        holdout_y.append(outer_test_y)
        i += 1

    ## confusion matrix
    confusion_df = pd.concat(confusion_matrix_rets, ignore_index=True)
    confusion_df.sort_values(["Attribute", "Attribute Value"], inplace=True)
    confusion_df = confusion_df.reset_index(drop=True)

    ## calibration matrix
    calibration_df = pd.concat(calibrations, ignore_index=True)
    calibration_df.sort_values(["Attribute", "Lower Limit Score", "Upper Limit Score"], inplace=True)
    calibration_df = calibration_df.reset_index(drop=True)

    ## race_auc
    race_auc_df = []
    try:
        race_auc_df = pd.concat(race_auc, ignore_index=True)
        race_auc_df.sort_values(["fold_num", "Attribute"], inplace=True)
        race_auc_df = race_auc_df.reset_index(drop=True)
    except Exception:
        pass

    ## no_condition_pn
    no_condition_pn_df = pd.concat(no_condition_pn, ignore_index=True)
    no_condition_pn_df.sort_values(["fold_num", "Attribute"], inplace=True)
    no_condition_pn_df = no_condition_pn_df.reset_index(drop=True)

    ## condition_pn
    condition_pn_df = pd.concat(condition_pn, ignore_index=True)
    condition_pn_df.sort_values(["fold_num", "Attribute"], inplace=True)
    condition_pn_df = condition_pn_df.reset_index(drop=True)

    return {
        'train_auc': train_auc,
        'validation_auc': validation_auc,
        'test_auc': test_auc,
        'holdout_with_attrs_test': holdout_with_attrs_test,
        'holdout_proba': holdout_probability,
        'holdout_pred': holdout_prediction,
        'holdout_y': holdout_y,
        'confusion_matrix_stats': confusion_df,
        'calibration_stats': calibration_df,
        'race_auc': race_auc_df,
        'condition_pn': condition_pn_df,
        'no_condition_pn': no_condition_pn_df
    }
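## Example of consuming the fairness summaries returned above. The outcome name
## and hyperparameter values are placeholders; the groupby relies only on the
## 'Attribute' / 'Attribute Value' / 'fold_num' columns created in the function.
ret = risk_nested_cv_constrain(X, Y, indicator=1, y_label='recidivism_two_year',
                               max_coef=5, max_coef_number=5, max_runtime=60.0,
                               max_offset=100, c=1e-6, seed=42)
print('test AUC per fold:', ret['test_auc'])
## mean confusion-matrix statistics across folds for each protected group
print(ret['confusion_matrix_stats']
      .groupby(['Attribute', 'Attribute Value'])
      .mean(numeric_only=True))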
def risk_cv(X, Y, indicator, y_label, max_coef, max_coef_number,
            max_runtime, max_offset, c, seed):

    ## set up data
    Y = Y.reshape(-1, 1)
    sample_weights = np.repeat(1, len(Y))

    ## set up cross validation
    cv = KFold(n_splits=5, random_state=seed, shuffle=True)
    train_auc = []
    validation_auc = []

    i = 0
    for train, validation in cv.split(X, Y):

        ## subset train data & store validation data
        train_x, train_y = X.iloc[train], Y[train]
        validation_x, validation_y = X.iloc[validation], Y[validation]
        sample_weights_train, sample_weights_validation = sample_weights[train], sample_weights[validation]

        ## holdout test with "race" for fairness
        holdout_with_attrs = validation_x.copy().drop(['(Intercept)'], axis=1)
        holdout_with_attrs = holdout_with_attrs.rename(columns={'sex1': 'sex'})

        ## remove features that are not used in modeling
        if indicator == 1:
            train_x = train_x.drop(
                ['person_id', 'screening_date', 'race',
                 'age_at_current_charge', 'p_charges'], axis=1)
            validation_x = validation_x.drop(
                ['person_id', 'screening_date', 'race',
                 'age_at_current_charge', 'p_charges'], axis=1).values
        else:
            train_x = train_x.drop(
                ['person_id', 'screening_date', 'race', 'sex1',
                 'age_at_current_charge', 'p_charges'], axis=1)
            validation_x = validation_x.drop(
                ['person_id', 'screening_date', 'race', 'sex1',
                 'age_at_current_charge', 'p_charges'], axis=1).values

        cols = train_x.columns.tolist()
        train_x = train_x.values

        ## create new data dictionary
        new_train_data = {
            'X': train_x,
            'Y': train_y,
            'variable_names': cols,
            'outcome_name': y_label,
            'sample_weights': sample_weights_train
        }

        ## fit the model
        model_info, mip_info, lcpa_info = risk_slim(new_train_data,
                                                    max_coefficient=max_coef,
                                                    max_L0_value=max_coef_number,
                                                    max_offset=max_offset,
                                                    c0_value=c,
                                                    max_runtime=max_runtime)
        print_model(model_info['solution'], new_train_data)

        ## change data format
        train_x, validation_x = train_x[:, 1:], validation_x[:, 1:]  ## remove the intercept column
        train_y[train_y == -1] = 0            ## change -1 to 0
        validation_y[validation_y == -1] = 0  ## change -1 to 0

        ## probability & accuracy
        train_prob = riskslim_prediction(train_x, np.array(cols), model_info).reshape(-1, 1)
        validation_prob = riskslim_prediction(validation_x, np.array(cols), model_info).reshape(-1, 1)
        validation_pred = (validation_prob > 0.5)

        ## AUC
        train_auc.append(roc_auc_score(train_y, train_prob))
        validation_auc.append(roc_auc_score(validation_y, validation_prob))
        i += 1

    return {'train_auc': train_auc, 'validation_auc': validation_auc}
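## `riskslim_prediction` is an external helper used throughout this file. For
## reference, a minimal sketch of the behavior the callers above appear to rely
## on: feature_names[0] is '(Intercept)', whose column has already been sliced
## off X, and the remaining coefficients align with X's columns. This is an
## assumption about the helper, not the repo's actual implementation.
import numpy as np

def riskslim_prediction_sketch(X, feature_names, model_info):
    rho = np.asarray(model_info['solution'], dtype=float)
    scores = rho[0] + X.dot(rho[1:])      # rho[0] is the intercept weight
    return 1.0 / (1.0 + np.exp(-scores))  # logistic link: integer score -> risk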
def fit(self, X, y):
    X, y = check_X_y(X, y, accept_sparse=True)
    self.is_fitted_ = True

    # transform data: prepend the outcome column to the feature matrix
    raw_data = np.insert(X, 0, y, axis=1)
    N = raw_data.shape[0]

    # set up Y vector and Y_name
    Y_col_idx = [0]
    Y = raw_data[:, Y_col_idx]
    Y_name = self.data_headers[Y_col_idx[0]]
    Y[Y == 0] = -1

    # set up X and X_names
    X_col_idx = [j for j in range(raw_data.shape[1]) if j not in Y_col_idx]
    X = raw_data[:, X_col_idx]
    variable_names = [self.data_headers[j] for j in X_col_idx]

    # insert a column of ones into X for the intercept
    X = np.insert(arr=X, obj=0, values=np.ones(N), axis=1)
    variable_names.insert(0, '(Intercept)')

    if self.sample_weights is None or len(self.sample_weights) != N:
        self.sample_weights = np.ones(N)

    self.data = {
        'X': X,
        'Y': Y,
        'variable_names': variable_names,
        'outcome_name': Y_name,
        'sample_weights': self.sample_weights,
    }

    # load folds
    if self.fold_csv_file is not None:
        if not os.path.isfile(self.fold_csv_file):
            raise IOError('could not find fold_csv_file: %s' % self.fold_csv_file)
        else:
            fold_idx = pd.read_csv(self.fold_csv_file, sep=',', header=None)
            fold_idx = fold_idx.values.flatten()
            K = max(fold_idx)
            all_fold_nums = np.sort(np.unique(fold_idx))
            assert len(fold_idx) == N, \
                "dimension mismatch: read %r fold indices (expected N = %r)" % (len(fold_idx), N)
            assert np.all(all_fold_nums == np.arange(1, K + 1)), \
                "folds should contain indices between 1 to %r" % K
            assert self.fold_num in np.arange(0, K + 1), \
                "fold_num should either be 0 or an integer between 1 to %r" % K
            if self.fold_num >= 1:
                test_idx = self.fold_num == fold_idx
                train_idx = self.fold_num != fold_idx
                self.data['X'] = self.data['X'][train_idx, ]
                self.data['Y'] = self.data['Y'][train_idx]
                self.data['sample_weights'] = self.data['sample_weights'][train_idx]

    assert check_data(self.data)

    # create coefficient set and set the value of the offset parameter
    coef_set = CoefficientSet(variable_names=self.data['variable_names'],
                              lb=-self.max_coefficient,
                              ub=self.max_coefficient,
                              sign=0)
    conservative_offset = get_conservative_offset(self.data, coef_set, self.max_L0_value)
    self.max_offset = min(self.max_offset, conservative_offset)
    coef_set['(Intercept)'].ub = self.max_offset
    coef_set['(Intercept)'].lb = -self.max_offset

    # edit constraints here
    constraints = {
        'L0_min': 0,
        'L0_max': self.max_L0_value,
        'coef_set': coef_set,
    }

    # initialize MIP for lattice CPA
    mip_objects = setup_lattice_cpa(self.data, constraints, self.settings)

    # add operational constraints
    mip = mip_objects['mip']
    indices = mip_objects['indices']
    get_alpha_name = lambda var_name: 'alpha_' + str(self.data['variable_names'].index(var_name))
    get_alpha_ind = lambda var_names: [get_alpha_name(v) for v in var_names]

    # apply mutually exclusive feature constraints: at most one variable from
    # each constrained group may receive a nonzero coefficient
    if self.op_constraints is not None:
        names = []
        expressions = []
        for key in self.op_constraints.keys():
            names.append("mutually_exclusive_%s" % key)
            expressions.append(cplex.SparsePair(
                ind=get_alpha_ind(self.op_constraints[key]),
                val=[1.0] * len(self.op_constraints[key])))
        mip.linear_constraints.add(names=names,
                                   lin_expr=expressions,
                                   senses=["L"] * len(self.op_constraints.keys()),
                                   rhs=[1.0] * len(self.op_constraints.keys()))

    mip_objects['mip'] = mip

    # fit using lattice CPA
    model_info, mip_info, lcpa_info = finish_lattice_cpa(self.data, constraints,
                                                         mip_objects, self.settings)
    rho = model_info['solution']
    self.model_info = model_info

    if np.sum(rho[1:]) != 0:
        print_model(model_info['solution'], self.data)
    print("solver_time = %d" % model_info['solver_time'])
    print("optimality_gap = %.3f" % model_info['optimality_gap'])
    print(rho)

    variable_names = self.data['variable_names']
    rho_values = np.copy(rho)
    rho_names = list(variable_names)

    # remove the intercept value or set it to 0
    if '(Intercept)' in rho_names:
        intercept_ind = variable_names.index('(Intercept)')
        self.intercept_val = int(rho[intercept_ind])
        rho_values = np.delete(rho_values, intercept_ind)
        rho_names.remove('(Intercept)')
    else:
        self.intercept_val = 0

    self.filter_mask = np.array(rho_values) != 0

    # remove zero values
    if not self.show_omitted_variables:
        selected_ind = np.flatnonzero(rho_values)
        self.rho_values = rho_values[selected_ind]
        self.rho_names = [rho_names[i] for i in selected_ind]

    return self
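## Illustrative usage of the estimator whose fit() is defined above. The class
## name `RiskSlimFitter` and the constructor arguments are assumptions inferred
## from the attributes fit() references (data_headers, max_coefficient,
## max_L0_value, max_offset, settings, op_constraints, fold_csv_file, ...);
## the outcome header and feature group below are hypothetical.
clf = RiskSlimFitter(data_headers=['arrest'] + feature_names,  # outcome column first
                     max_coefficient=5, max_L0_value=5, max_offset=50,
                     op_constraints={'age': ['age18_25', 'age26_35']})  # at most one age dummy
clf.fit(X_train, y_train)
print(clf.intercept_val, dict(zip(clf.rho_names, clf.rho_values)))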