def f_clf1(hps):
    # Assembling pipeline
    #weights = [hps['acc_w'], hps['fair_w'], hps['rob_w']]
    #weights = [0.0, 1.0, 0.0]
    #rankings = [accuracy_ranking, fairness_ranking, robustness_ranking]
    #weights = [hps['var_w']]
    #rankings = [variance_ranking]
    #weights = [hps['acc_w']]
    #rankings = [accuracy_ranking]
    #weights = [hps['fair_w']]
    #rankings = [fairness_ranking]
    weights = [hps['acc_w'], hps['fair_w'], hps['var_w']]
    rankings = [accuracy_ranking, fairness_ranking, variance_ranking]

    mask = np.zeros(len(hps) - 5, dtype=bool)
    for k, v in hps.items():
        if k.startswith('f_'):
            mask[int(k.split('_')[1])] = v

    clf = LogisticRegression()
    if type(privacy_epsilon) != type(None):
        clf = models.LogisticRegression(epsilon=privacy_epsilon)

    model = Pipeline([
        ('selection', WeightedRankingSelection(scores=rankings,
                                               weights=weights,
                                               k=hps['k'] + 1,
                                               names=np.array(names),
                                               hyperparameter_mask=mask)),
        ('clf', clf)
    ])
    return model
def f_clf1(hps):
    # Assembling pipeline
    #weights = [hps['acc_w'], hps['fair_w'], hps['rob_w']]
    #weights = [0.0, 1.0, 0.0]
    #rankings = [accuracy_ranking, fairness_ranking, robustness_ranking]
    #weights = [hps['var_w']]
    #rankings = [variance_ranking]
    #weights = [hps['acc_w']]
    #rankings = [accuracy_ranking]
    #weights = [hps['fair_w']]
    #rankings = [fairness_ranking]
    weights = [hps['acc_w'], hps['fair_w'], hps['var_w']]
    rankings = [accuracy_ranking, fairness_ranking, variance_ranking]
    #weights = [hps['acc_w'], hps['fair_w'], hps['var_w'], hps['rob_w']]
    #rankings = [accuracy_ranking, fairness_ranking, variance_ranking, robustness_ranking]

    clf = LogisticRegression()
    if type(privacy_epsilon) != type(None):
        clf = models.LogisticRegression(epsilon=privacy_epsilon)

    model = Pipeline([
        ('selection', WeightedRankingSelection(scores=rankings,
                                               weights=weights,
                                               k=hps['k'] + 1,
                                               names=np.array(names))),
        ('clf', clf)
    ])
    return model
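# The pipelines above combine several per-feature rankings through WeightedRankingSelection,
# weighting each ranking before picking the top-k features. A minimal, generic sketch of one
# way such weighted rank aggregation can work (an illustration only, not necessarily how the
# project's WeightedRankingSelection class is implemented):
import numpy as np

def weighted_top_k(scores, weights, k):
    # scores: list of 1-D arrays, one score per feature per ranking; higher = better
    normalized = [(s - s.min()) / (s.max() - s.min() + 1e-12) for s in scores]
    combined = np.zeros_like(normalized[0])
    for w, s in zip(weights, normalized):
        combined += w * s
    mask = np.zeros(len(combined), dtype=bool)
    mask[np.argsort(combined)[::-1][:k]] = True  # keep the k best-scoring features
    return mask

# example: three rankings over five features, accuracy weighted highest
acc = np.array([0.9, 0.1, 0.5, 0.7, 0.2])
fair = np.array([0.2, 0.8, 0.4, 0.1, 0.9])
var = np.array([0.5, 0.5, 0.9, 0.3, 0.1])
print(weighted_top_k([acc, fair, var], weights=[0.6, 0.3, 0.1], k=2))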
def get_model(c):
    #return ('clf', LogisticRegression(class_weight='balanced', C=c))
    #return ('clf', RandomForestClassifier(class_weight='balanced', n_estimators=c))
    return ('clf', models.LogisticRegression(epsilon=0.00001, class_weight='balanced', C=c))
def f_clf1(hps):
    # Assembling pipeline
    weights = [hps['acc_w'], hps['fair_w'], hps['rob_w']]
    #weights = [0.0, 1.0, 0.0]
    rankings = [accuracy_ranking, fairness_ranking, robustness_ranking]

    clf = LogisticRegression()
    if type(privacy_epsilon) != type(None):
        clf = models.LogisticRegression(epsilon=privacy_epsilon)  # The smaller the value, the stronger the privacy protection

    model = Pipeline([
        ('selection', WeightedRankingSelection(scores=rankings,
                                               weights=weights,
                                               k=hps['k'] + 1,
                                               names=np.array(names))),
        ('clf', clf)
    ])
    return model
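# models.LogisticRegression above is diffprivlib's differentially private logistic regression;
# epsilon is the privacy budget (smaller epsilon = stronger privacy, usually lower accuracy).
# A minimal stand-alone sketch on synthetic data, assuming diffprivlib and scikit-learn are
# installed (data_norm is passed explicitly to avoid diffprivlib's clipping warning):
import diffprivlib.models as dp_models
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

for eps in [0.1, 1.0, 10.0]:
    clf = dp_models.LogisticRegression(epsilon=eps, data_norm=10.0)
    clf.fit(X_tr, y_tr)
    print(eps, clf.score(X_te, y_te))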
min_robustness = 0.0
if most_uncertain_f['robustness_choice'][0]:
    min_robustness = most_uncertain_f['robustness_specified'][0]
max_number_features = 1.0
if most_uncertain_f['k_choice'][0]:
    max_number_features = most_uncertain_f['k_specified'][0]
max_search_time = most_uncertain_f['search_time_specified'][0]

# Execute each search strategy with a given time limit (in parallel)
# maybe run multiple times to smooth stochasticity
model = LogisticRegression(class_weight='balanced')
if most_uncertain_f['privacy_choice'][0]:
    model = models.LogisticRegression(
        epsilon=most_uncertain_f['privacy_specified'][0],
        class_weight='balanced')

mp_global.clf = model

# define rankings
rankings = [variance, chi2_score_wo, f_anova_wo, fcbf, my_fisher_score,
            mutual_info_classif, my_mcfs]
#rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
#rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
#rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
rankings.append(partial(model_score, estimator=ReliefF(n_neighbors=10)))  # relieff

mp_global.min_accuracy = min_accuracy
def query(self, X_train, X_validation, X_test, y_train, y_validation, y_test,
          classifier=LogisticRegression(class_weight='balanced'),
          min_accuracy=0.5,
          sensitive_ids=None,
          min_fairness=0.0,
          min_safety=0.0,
          min_privacy=None,
          max_complexity=1.0,
          max_search_time=np.inf,
          feature_names=None):

    if isinstance(max_complexity, int):
        max_complexity = max_complexity / float(X_train.shape[1])

    X_train_val = np.vstack((X_train, X_validation))
    y_train_val = np.append(y_train, y_validation)

    self.feature_names = feature_names

    if type(min_privacy) != type(None):
        classifier = models.LogisticRegression(epsilon=min_privacy, class_weight='balanced')

    self.stored_results_file = '/tmp/experiment' + str(time.time()) + '.pickle'

    mp_global.X_train = X_train
    mp_global.X_validation = X_validation
    mp_global.X_train_val = X_train_val
    mp_global.X_test = X_test
    mp_global.y_train = y_train
    mp_global.y_validation = y_validation
    mp_global.y_train_val = y_train_val
    mp_global.y_test = y_test
    mp_global.names = feature_names
    mp_global.sensitive_ids = sensitive_ids

    mp_global.min_accuracy = min_accuracy
    mp_global.min_fairness = min_fairness
    mp_global.min_robustness = min_safety
    mp_global.max_number_features = max_complexity
    mp_global.max_search_time = max_search_time

    mp_global.clf = classifier
    mp_global.log_file = self.stored_results_file

    configuration = {}
    configuration['ranking_functions'] = copy.deepcopy(self.ranking_functions)
    configuration['run_id'] = 0
    configuration['main_strategy'] = copy.deepcopy(self.selection_function)

    mp_global.configurations = [configuration]

    with ProcessPool(max_workers=1) as pool:
        future = pool.map(my_function,
                          range(len(mp_global.configurations)),
                          timeout=max_search_time)

        iterator = future.result()
        while True:
            try:
                result = next(iterator)
            except StopIteration:
                break
            except TimeoutError as error:
                print("function took longer than %d seconds" % error.args[1])
            except ProcessExpired as error:
                print("%s. Exit code: %d" % (error, error.exitcode))
            except Exception as error:
                print("function raised %s" % error)
                #print(error.traceback)  # Python's traceback of remote process

    return self.get_satisfying_features()
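# query() enforces the search-time budget by running the strategy in a separate process
# via pebble with a per-task timeout. A minimal, self-contained sketch of the same
# ProcessPool pattern (slow_task is an illustrative stand-in for my_function):
import time
from concurrent.futures import TimeoutError
from pebble import ProcessPool, ProcessExpired

def slow_task(seconds):
    time.sleep(seconds)
    return seconds

if __name__ == '__main__':
    with ProcessPool(max_workers=1) as pool:
        future = pool.map(slow_task, [1, 5], timeout=3)  # the second task exceeds the 3s budget
        iterator = future.result()
        while True:
            try:
                print(next(iterator))
            except StopIteration:
                break
            except TimeoutError as error:
                print("task took longer than %d seconds" % error.args[1])
            except ProcessExpired as error:
                print("%s. Exit code: %d" % (error, error.exitcode))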
results_heatmap = {}
for min_accuracy in np.arange(l_acc, u_acc, (u_acc - l_acc) / 10.0):
    for max_number_features in np.arange(start_features,
                                         1.0 + (1.0 - start_features) / 10.0,
                                         (1.0 - start_features) / 10.0):
        i += 1

        min_robustness = 0.0
        max_search_time = 20 * 60
        privacy = None
        min_fairness = 0.0

        # Execute each search strategy with a given time limit (in parallel)
        # maybe run multiple times to smooth stochasticity
        model = LogisticRegression()
        if type(privacy) != type(None):
            model = models.LogisticRegression(epsilon=privacy)

        mp_global.clf = model

        # define rankings
        rankings = [variance, chi2_score_wo, fcbf, my_fisher_score,
                    mutual_info_classif, my_mcfs]
        #rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
        #rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
        #rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
        rankings.append(partial(model_score, estimator=ReliefF(n_neighbors=10)))  # relieff

        mp_global.min_accuracy = min_accuracy
def run_strategy(strategy_method, ranking_id, strategy_id):
    data_infos = pickle.load(
        open(Config.get('data_path') + '/openml_data/fitting_datasets.pickle', 'rb'))

    time_limit = 60 * 20
    meta_classifier = RandomForestRegressor(n_estimators=1000)
    X_train_meta_classifier = []
    y_train_meta_classifier = []

    cv_splitter = StratifiedKFold(5, random_state=42)
    auc_scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

    acc_value_list = []
    fair_value_list = []
    robust_value_list = []
    success_value_list = []
    runtime_value_list = []
    dataset_did_list = []
    dataset_sensitive_attribute_list = []

    while True:
        X_train, X_test, y_train, y_test, names, sensitive_ids, data_did, sensitive_attribute_id = get_data_openml(data_infos)

        # run on tiny sample
        X_train_tiny, _, y_train_tiny, _ = train_test_split(X_train, y_train,
                                                            train_size=100,
                                                            random_state=42,
                                                            stratify=y_train)

        fair_train_tiny = make_scorer(true_positive_rate_score,
                                      greater_is_better=True,
                                      sensitive_data=X_train_tiny[:, sensitive_ids[0]])

        def objective(hps):
            print(hps)
            try:
                cv_k = 1.0
                cv_privacy = hps['privacy']
                model = LogisticRegression()
                if type(cv_privacy) == type(None):
                    cv_privacy = X_train_tiny.shape[0]
                else:
                    model = models.LogisticRegression(epsilon=cv_privacy)

                robust_scorer = make_scorer(robust_score,
                                            greater_is_better=True,
                                            X=X_train_tiny,
                                            y=y_train_tiny,
                                            model=model,
                                            feature_selector=None,
                                            scorer=auc_scorer)

                cv = GridSearchCV(model,
                                  param_grid={'C': [1.0]},
                                  scoring={'AUC': auc_scorer,
                                           'Fairness': fair_train_tiny,
                                           'Robustness': robust_scorer},
                                  refit=False,
                                  cv=cv_splitter)
                cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
                cv_acc = cv.cv_results_['mean_test_AUC'][0]
                cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
                cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

                small_start_time = time.time()

                cv = GridSearchCV(model,
                                  param_grid={'C': [1.0]},
                                  scoring={'AUC': auc_scorer,
                                           'Fairness': fair_train_tiny,
                                           'Robustness': robust_scorer},
                                  refit=False,
                                  cv=cv_splitter)
                cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
                cv_acc = cv.cv_results_['mean_test_AUC'][0]
                cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
                cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

                # construct feature vector
                feature_list = []
                # user-specified constraints
                feature_list.append(hps['accuracy'])
                feature_list.append(hps['fairness'])
                feature_list.append(hps['k'])
                feature_list.append(hps['k'] * X_train.shape[1])
                feature_list.append(hps['robustness'])
                feature_list.append(cv_privacy)
                feature_list.append(hps['search_time'])
                # differences to sample performance
                feature_list.append(cv_acc - hps['accuracy'])
                feature_list.append(cv_fair - hps['fairness'])
                feature_list.append(cv_k - hps['k'])
                feature_list.append((cv_k - hps['k']) * X_train.shape[1])
                feature_list.append(cv_robust - hps['robustness'])
                feature_list.append(time.time() - small_start_time)
                # privacy constraint is always satisfied => difference always zero => constant => unnecessary
                # metadata features
                feature_list.append(X_train.shape[0])  # number rows
                feature_list.append(X_train.shape[1])  # number columns

                features = np.array(feature_list)

                # predict the best model and calculate uncertainty
                loss = 0
                if hasattr(meta_classifier, 'estimators_'):
                    predictions = []
                    for tree in range(len(meta_classifier.estimators_)):
                        predictions.append(
                            meta_classifier.estimators_[tree].predict([features])[0])
                    stddev = np.std(np.array(predictions), axis=0)
                    print('stddev: ' + str(stddev))
                    loss = (stddev**2) * -1

                return {'loss': loss, 'status': STATUS_OK, 'features': features}
            except:
                return {'loss': np.inf, 'status': STATUS_OK}

        space = {
            'k': hp.choice('k_choice', [(1.0), (hp.uniform('k_specified', 0, 1))]),
            'accuracy': hp.uniform('accuracy_specified', 0.5, 1),
            'fairness': hp.choice('fairness_choice', [(0.0), (hp.uniform('fairness_specified', 0, 1))]),
            'privacy': hp.choice('privacy_choice', [(None), (hp.lognormal('privacy_specified', 0, 1))]),
            'robustness': hp.choice('robustness_choice', [(0.0), (hp.uniform('robustness_specified', 0, 1))]),
            'search_time': hp.uniform('search_time_specified', 10, time_limit),  # in seconds
        }

        trials = Trials()
        runs_per_dataset = 0
        i = 1
        while True:
            fmin(objective, space=space, algo=tpe.suggest, max_evals=i, trials=trials)
            i += 1

            if trials.trials[-1]['result']['loss'] == np.inf:
                break

            # break, once convergence tolerance is reached and generate new dataset
            if trials.trials[-1]['result']['loss'] == 0 or i % 20 == 0:
                best_trial = trials.trials[-1]
                if i % 20 == 0:
                    best_trial = trials.best_trial

                most_uncertain_f = best_trial['misc']['vals']
                #print(most_uncertain_f)

                min_accuracy = most_uncertain_f['accuracy_specified'][0]
                min_fairness = 0.0
                if most_uncertain_f['fairness_choice'][0]:
                    min_fairness = most_uncertain_f['fairness_specified'][0]
                min_robustness = 0.0
                if most_uncertain_f['robustness_choice'][0]:
                    min_robustness = most_uncertain_f['robustness_specified'][0]
                max_number_features = X_train.shape[1]
                if most_uncertain_f['k_choice'][0]:
                    max_number_features = most_uncertain_f['k_specified'][0]
                max_search_time = most_uncertain_f['search_time_specified'][0]

                # Execute each search strategy with a given time limit (in parallel)
                # maybe run multiple times to smooth stochasticity
                model = LogisticRegression()
                if most_uncertain_f['privacy_choice'][0]:
                    model = models.LogisticRegression(
                        epsilon=most_uncertain_f['privacy_specified'][0])

                rankings = [variance, chi2_score_wo]  # simple rankings
                rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000)))  # accuracy ranking
                rankings.append(partial(robustness_score, model=model, scorer=auc_scorer))  # robustness ranking
                rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids))  # fairness ranking

                selected_rankings = rankings
                if type(ranking_id) != type(None):
                    selected_rankings = [rankings[ranking_id]]

                result = strategy_method(X_train, X_test, y_train, y_test, names, sensitive_ids,
                                         ranking_functions=selected_rankings,
                                         clf=model,
                                         min_accuracy=min_accuracy,
                                         min_fairness=min_fairness,
                                         min_robustness=min_robustness,
                                         max_number_features=max_number_features,
                                         max_search_time=max_search_time,
                                         cv_splitter=cv_splitter)

                # append ml data
                X_train_meta_classifier.append(best_trial['result']['features'])
                y_train_meta_classifier.append(result['time'])

                try:
                    meta_classifier.fit(np.array(X_train_meta_classifier), y_train_meta_classifier)
                except:
                    pass

                # pickle everything and store it
                one_big_object = {}
                one_big_object['features'] = X_train_meta_classifier
                #one_big_object['best_strategy'] = y_train_meta_classifier

                runtime_value_list.append(result['time'])
                acc_value_list.append(result['cv_acc'])
                fair_value_list.append(result['cv_fair'])
                robust_value_list.append(result['cv_robust'])
                success_value_list.append(result['success'])
                dataset_did_list.append(data_did)
                dataset_sensitive_attribute_list.append(sensitive_attribute_id)

                one_big_object['times_value'] = runtime_value_list
                one_big_object['acc_value'] = acc_value_list
                one_big_object['fair_value'] = fair_value_list
                one_big_object['robust_value'] = robust_value_list
                one_big_object['success_value'] = success_value_list
                one_big_object['dataset_id'] = dataset_did_list
                one_big_object['sensitive_attribute_id'] = dataset_sensitive_attribute_list

                pickle.dump(one_big_object,
                            open('/tmp/metalearning_data' + str(strategy_id) + '.pickle', 'wb'))

                trials = Trials()
                i = 1
                runs_per_dataset += 1
                break
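# The constraint space above nests hp.uniform inside hp.choice; the code then reads the
# sampled values back from trial['misc']['vals'], where '*_choice' holds the index of the
# taken branch and '*_specified' is only populated when the non-constant branch won.
# A minimal sketch of that readback pattern with a toy objective:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

space = {
    'fairness': hp.choice('fairness_choice',
                          [(0.0), (hp.uniform('fairness_specified', 0, 1))]),
}

trials = Trials()
fmin(lambda hps: {'loss': -hps['fairness'], 'status': STATUS_OK},
     space=space, algo=tpe.suggest, max_evals=10, trials=trials,
     show_progressbar=False)

vals = trials.best_trial['misc']['vals']
min_fairness = 0.0
if vals['fairness_choice'][0]:               # branch index 1 => a value was actually sampled
    min_fairness = vals['fairness_specified'][0]
print(min_fairness)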
for fname_i in range(len(all_names)):
    if all_names[fname_i].startswith('onehot__x' + str(cat_sensitive_attribute_id) + '_'):
        sensitive_ids.append(fname_i)

print(sensitive_ids)

le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

cv_splitter = StratifiedKFold(5, random_state=42)

model = models.LogisticRegression(epsilon=10)

evolution(X_train, X_test, y_train, y_test, names, sensitive_ids,
          ranking_functions=[],
          clf=model,
          min_accuracy=1.0,
          min_fairness=0.0,
          min_robustness=0.0,
          max_number_features=1.0,
          cv_splitter=cv_splitter)
def uncertainty_sampling(training_dataset_ids, all_current_models):
    time_limit = 30 * 60  #60 * 60 * 3

    training_dataset_ids = training_dataset_ids.tolist()
    if '1240' in training_dataset_ids:
        training_dataset_ids.remove('1240')
    if '42132' in training_dataset_ids:
        training_dataset_ids.remove('42132')

    def maximize_uncertainty(hps):
        print(hps)

        X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(dataset_key=hps['data'])

        is_regression = False

        # run on tiny sample
        if X_train.shape[0] > 100:
            if is_regression:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(X_train, y_train,
                                                                    train_size=100,
                                                                    random_state=42)
            else:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(X_train, y_train,
                                                                    train_size=100,
                                                                    random_state=42,
                                                                    stratify=y_train)
        else:
            X_train_tiny = X_train
            y_train_tiny = y_train

        print(X_train.shape)

        if type(sensitive_ids) != type(None):
            fair_train_tiny = make_scorer(true_positive_rate_score,
                                          greater_is_better=True,
                                          sensitive_data=X_train_tiny[:, sensitive_ids[0]])

        mp_global.X_train = X_train
        mp_global.X_validation = X_validation
        mp_global.X_train_val = X_train_val
        mp_global.X_test = X_test
        mp_global.y_train = y_train
        mp_global.y_validation = y_validation
        mp_global.y_train_val = y_train_val
        mp_global.y_test = y_test
        mp_global.names = names
        mp_global.sensitive_ids = sensitive_ids
        mp_global.cv_splitter = StratifiedKFold(5, random_state=42)
        mp_global.accuracy_scorer = make_scorer(f1_score)
        mp_global.avoid_robustness = False

        cv_k = 1.0
        cv_privacy = hps['privacy']
        model = LogisticRegression(class_weight='balanced')
        if type(cv_privacy) != type(None):
            model = models.LogisticRegression(epsilon=cv_privacy, class_weight='balanced')
        if type(cv_privacy) == type(None):
            cv_privacy = X_train_tiny.shape[0]

        robust_scorer = make_scorer(robust_score,
                                    greater_is_better=True,
                                    X=X_train_tiny,
                                    y=y_train_tiny,
                                    model=model,
                                    feature_selector=None,
                                    scorer=mp_global.accuracy_scorer)

        small_start_time = time.time()

        scoring = {'AUC': mp_global.accuracy_scorer}
        if not mp_global.avoid_robustness:
            scoring['Robustness'] = robust_scorer
        if type(sensitive_ids) != type(None):
            scoring['Fairness'] = fair_train_tiny

        cv = GridSearchCV(model,
                          param_grid={},
                          scoring=scoring,
                          refit=False,
                          cv=mp_global.cv_splitter)
        cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))

        cv_acc = cv.cv_results_['mean_test_AUC'][0]
        cv_fair = 0.0
        if type(sensitive_ids) != type(None):
            cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
        cv_robust = 0.0
        if not mp_global.avoid_robustness:
            cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]
        cv_time = time.time() - small_start_time

        # construct feature vector
        feature_list = []
        # user-specified constraints
        feature_list.append(hps['accuracy'])
        feature_list.append(hps['fairness'])
        feature_list.append(hps['k'])
        feature_list.append(hps['k'] * X_train.shape[1])
        feature_list.append(hps['robustness'])
        feature_list.append(cv_privacy)
        feature_list.append(hps['search_time'])
        # differences to sample performance
        feature_list.append(cv_acc - hps['accuracy'])
        feature_list.append(cv_fair - hps['fairness'])
        feature_list.append(cv_k - hps['k'])
        feature_list.append((cv_k - hps['k']) * X_train.shape[1])
        feature_list.append(cv_robust - hps['robustness'])
        feature_list.append(cv_time)
        # privacy constraint is always satisfied => difference always zero => constant => unnecessary
        # metadata features
        feature_list.append(X_train.shape[0])  # number rows
        feature_list.append(X_train.shape[1])  # number columns
        # models
        feature_list.append(hps['model'] == 'Decision Tree')
        feature_list.append(hps['model'] == 'Gaussian Naive Bayes')
        feature_list.append(hps['model'] == 'Logistic Regression')

        features = np.array(feature_list).reshape(1, -1)

        # predict the best model and calculate uncertainty
        print(features)

        # now predict with models
        aggregated_certainty = 0
        print("uncertainty")
        for model_i in range(len(all_current_models)):
            certainty = np.abs(all_current_models[model_i].predict_proba(features)[0, 0] - 0.5)
            aggregated_certainty += certainty
        print('Certainty: ' + str(aggregated_certainty))

        return {'loss': aggregated_certainty,
                'status': STATUS_OK,
                'features': features,
                'search_time': hps['search_time'],
                'constraints': hps}

    space = {
        'data': hp.choice('data_choice', training_dataset_ids),
        'model': hp.choice('model_choice', [
            'Logistic Regression', 'Gaussian Naive Bayes', 'Decision Tree'
            # , 'Random Forest'
        ]),
        'k': hp.choice('k_choice', [(1.0), (hp.uniform('k_specified', 0, 1))]),
        'accuracy': hp.uniform('accuracy_specified', 0.5, 1),
        'fairness': hp.choice('fairness_choice', [(0.0), (hp.uniform('fairness_specified', 0.8, 1))]),
        'privacy': hp.choice('privacy_choice', [(None), (hp.lognormal('privacy_specified', 0, 1))]),
        'robustness': hp.choice('robustness_choice', [(0.0), (hp.uniform('robustness_specified', 0.8, 1))]),
        'search_time': hp.uniform('search_time_specified', 10, time_limit),  # in seconds
    }

    trials = Trials()
    fmin(maximize_uncertainty,
         space=space,
         algo=tpe.suggest,
         max_evals=100,
         trials=trials,
         show_progressbar=True)

    ### now run most uncertain trial
    number_of_runs = 1

    # break, once convergence tolerance is reached and generate new dataset
    last_trial = trials.best_trial
    most_uncertain_f = last_trial['misc']['vals']
    # print(most_uncertain_f)

    ## specify data
    run_counter = 0
    current_run_time_id = time.time()

    path = pathlib.Path('/tmp/experiment_uncertainty/run' + str(0))
    path.mkdir(parents=True, exist_ok=True)

    selected_dataset_id = training_dataset_ids[most_uncertain_f['data_choice'][0]]

    X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(dataset_key=selected_dataset_id)

    is_regression = False

    mp_global.X_train = X_train
    mp_global.X_validation = X_validation
    mp_global.X_train_val = X_train_val
    mp_global.X_test = X_test
    mp_global.y_train = y_train
    mp_global.y_validation = y_validation
    mp_global.y_train_val = y_train_val
    mp_global.y_test = y_test
    mp_global.names = names
    mp_global.sensitive_ids = sensitive_ids

    if is_regression:
        mp_global.cv_splitter = KFold(5, random_state=42)
        mp_global.accuracy_scorer = make_scorer(r2_score)
    else:
        mp_global.cv_splitter = StratifiedKFold(5, random_state=42)
        mp_global.accuracy_scorer = make_scorer(f1_score)

    mp_global.avoid_robustness = False

    min_accuracy = most_uncertain_f['accuracy_specified'][0]
    min_fairness = 0.0
    if most_uncertain_f['fairness_choice'][0]:
        min_fairness = most_uncertain_f['fairness_specified'][0]
    min_robustness = 0.0
    if most_uncertain_f['robustness_choice'][0]:
        min_robustness = most_uncertain_f['robustness_specified'][0]
    max_number_features = 1.0
    if most_uncertain_f['k_choice'][0]:
        max_number_features = most_uncertain_f['k_specified'][0]
    max_search_time = most_uncertain_f['search_time_specified'][0]

    # Execute each search strategy with a given time limit (in parallel)
    # maybe run multiple times to smooth stochasticity
    model = None
    print(most_uncertain_f)
    if most_uncertain_f['model_choice'][0] == 0:
        model = LogisticRegression(class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = models.LogisticRegression(
                epsilon=most_uncertain_f['privacy_specified'][0],
                class_weight='balanced')
    elif most_uncertain_f['model_choice'][0] == 1:
        model = GaussianNB()
        if most_uncertain_f['privacy_choice'][0]:
            model = models.GaussianNB(epsilon=most_uncertain_f['privacy_specified'][0])
    elif most_uncertain_f['model_choice'][0] == 2:
        model = DecisionTreeClassifier(class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = PrivateRandomForest(n_estimators=1,
                                        epsilon=most_uncertain_f['privacy_specified'][0])
    elif most_uncertain_f['model_choice'][0] == 3:
        model = RandomForestClassifier(n_estimators=100, class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = PrivateRandomForest(n_estimators=100,
                                        epsilon=most_uncertain_f['privacy_specified'][0])

    print(model)
    mp_global.clf = model

    # define rankings
    rankings = [variance, chi2_score_wo, fcbf, my_fisher_score, mutual_info_classif, my_mcfs]
    # rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
    # rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
    # rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
    rankings.append(partial(model_score, estimator=ReliefF(n_neighbors=10)))  # relieff

    mp_global.min_accuracy = min_accuracy
    mp_global.min_fairness = min_fairness
    mp_global.min_robustness = min_robustness
    mp_global.max_number_features = max_number_features
    mp_global.max_search_time = max_search_time

    mp_global.configurations = []

    # add single rankings
    strategy_id = 1
    for r in range(len(rankings)):
        for run in range(number_of_runs):
            configuration = {}
            configuration['ranking_functions'] = copy.deepcopy([rankings[r]])
            configuration['run_id'] = copy.deepcopy(run)
            configuration['main_strategy'] = copy.deepcopy(weighted_ranking)
            configuration['strategy_id'] = copy.deepcopy(strategy_id)
            mp_global.configurations.append(configuration)
        strategy_id += 1

    main_strategies = [TPE, simulated_annealing, evolution, exhaustive,
                       forward_selection, backward_selection,
                       forward_floating_selection, backward_floating_selection,
                       recursive_feature_elimination, fullfeatures]

    # run main strategies
    for strategy in main_strategies:
        for run in range(number_of_runs):
            configuration = {}
            configuration['ranking_functions'] = []
            configuration['run_id'] = copy.deepcopy(run)
            configuration['main_strategy'] = copy.deepcopy(strategy)
            configuration['strategy_id'] = copy.deepcopy(strategy_id)
            mp_global.configurations.append(configuration)
        strategy_id += 1

    # 6#17
    with ProcessPool(max_workers=17) as pool:
        future = pool.map(my_function,
                          range(len(mp_global.configurations)),
                          timeout=max_search_time)

        iterator = future.result()
        while True:
            try:
                result = next(iterator)
            except StopIteration:
                break
            except TimeoutError as error:
                print("function took longer than %d seconds" % error.args[1])
            except ProcessExpired as error:
                print("%s. Exit code: %d" % (error, error.exitcode))
            except Exception as error:
                print("function raised %s" % error)
                print(error.traceback)  # Python's traceback of remote process

    # check which strategies were successful
    mappnames = {1: 'TPE(Variance)',
                 2: 'TPE($\chi^2$)',
                 3: 'TPE(FCBF)',
                 4: 'TPE(Fisher)',
                 5: 'TPE(MIM)',
                 6: 'TPE(MCFS)',
                 7: 'TPE(ReliefF)',
                 8: 'TPE(NR)',
                 9: 'SA(NR)',
                 10: 'NSGA-II(NR)',
                 11: 'ES(NR)',
                 12: 'SFS(NR)',
                 13: 'SBS(NR)',
                 14: 'SFFS(NR)',
                 15: 'SBFS(NR)',
                 16: 'RFE(LR)',
                 17: 'Complete Set'}

    def load_pickle(fname):
        data = []
        with open(fname, "rb") as f:
            while True:
                try:
                    data.append(pickle.load(f))
                except EOFError:
                    break
        return data

    def is_successfull_validation_and_test(exp_results):
        return len(exp_results) > 0 and 'success_test' in exp_results[-1] and exp_results[-1]['success_test'] == True  # also satisfied on test

    def is_successfull_validation(exp_results):
        return len(exp_results) > 0 and 'Validation_Satisfied' in exp_results[-1]  # constraints were satisfied on validation set

    run_strategies_success_test = {}
    run_strategies_times = {}
    run_strategies_success_validation = {}

    rfolder = '/tmp/experiment_uncertainty/run' + str(0) + '/'

    validation_satisfied_by_any_strategy = False

    min_time = np.inf
    best_strategy = 0
    for s in range(1, len(mappnames) + 1):
        exp_results = []
        try:
            exp_results = load_pickle(rfolder + 'strategy' + str(s) + '.pickle')
        except:
            pass
        if is_successfull_validation_and_test(exp_results):
            runtime = exp_results[-1]['final_time']
            if runtime < min_time:
                min_time = runtime
                best_strategy = s
            run_strategies_success_test[s] = True
            run_strategies_times[s] = runtime
        else:
            run_strategies_success_test[s] = False

        run_strategies_success_validation[s] = is_successfull_validation(exp_results)
        if run_strategies_success_validation[s]:
            validation_satisfied_by_any_strategy = True

    strategy_success = np.zeros((1, len(mappnames)))
    for c_i in range(len(mappnames)):
        strategy_success[0, c_i] = run_strategies_success_test[c_i + 1]

    return last_trial['result']['features'], strategy_success
def objective(hps):
    print(hps)

    cv_k = 1.0
    cv_privacy = hps['privacy']
    model = LogisticRegression()
    if type(cv_privacy) == type(None):
        cv_privacy = X_train_tiny.shape[0]
    else:
        model = models.LogisticRegression(epsilon=cv_privacy)

    robust_scorer = make_scorer(robust_score,
                                greater_is_better=True,
                                X=X_train_tiny,
                                y=y_train_tiny,
                                model=model,
                                feature_selector=None,
                                scorer=auc_scorer)

    cv = GridSearchCV(model,
                      param_grid={'C': [1.0]},
                      scoring={'AUC': auc_scorer,
                               'Fairness': fair_train_tiny,
                               'Robustness': robust_scorer},
                      refit=False,
                      cv=cv_splitter)
    cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
    cv_acc = cv.cv_results_['mean_test_AUC'][0]
    cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
    cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

    # construct feature vector
    feature_list = []
    # user-specified constraints
    feature_list.append(hps['accuracy'])
    feature_list.append(hps['fairness'])
    feature_list.append(hps['k'])
    feature_list.append(hps['k'] * X_train.shape[1])
    feature_list.append(hps['robustness'])
    feature_list.append(cv_privacy)
    # differences to sample performance
    feature_list.append(cv_acc - hps['accuracy'])
    feature_list.append(cv_fair - hps['fairness'])
    feature_list.append(cv_k - hps['k'])
    feature_list.append((cv_k - hps['k']) * X_train.shape[1])
    feature_list.append(cv_robust - hps['robustness'])
    # privacy constraint is always satisfied => difference always zero => constant => unnecessary
    # metadata features
    feature_list.append(X_train.shape[0])  # number rows
    feature_list.append(X_train.shape[1])  # number columns

    features = np.array(feature_list)

    # predict the best model and calculate uncertainty
    loss = 0
    try:
        proba_predictions = meta_classifier.predict_proba([features])[0]
        proba_predictions = np.sort(proba_predictions)
        print("predictions: " + str(proba_predictions))
        uncertainty = 1 - (proba_predictions[-1] - proba_predictions[-2])
        loss = -1 * uncertainty  # we want to maximize uncertainty
    except:
        pass

    return {'loss': loss, 'status': STATUS_OK, 'features': features}
print(model.coef_)  # Print the coefficients for each independent variable.
# But it is not clear which one corresponds to what,
# so let us print both column values and coefficients.
# pd.Series is a 1-D labeled array capable of holding any data type.
# Default index would be 0,1,2,3... but let us overwrite it with the column names of X (independent variables)
weights = pd.Series(model.coef_[0], index=X.columns.values)
print("Weights for each variable are as follows...")
print(weights)
# +VE VALUE INDICATES THAT THE VARIABLE HAS A POSITIVE IMPACT

baseline = model.score(X_test, y_test)

import diffprivlib.models as dp

dp_clf = dp.LogisticRegression()
dp_clf.fit(X_train, y_train)

print("Differentially private test accuracy (epsilon=%.2f): %.2f%%" %
      (dp_clf.epsilon, dp_clf.score(X_test, y_test) * 100))

dp_clf = dp.LogisticRegression(epsilon=float("inf"), data_norm=1e5)
dp_clf.fit(X_train, y_train)

print("Agreement between non-private and differentially private (epsilon=inf) classifiers: %.2f%%" %
      (dp_clf.score(X_test, model.predict(X_test)) * 100))

accuracy = []
epsilons = np.logspace(-3, 1, 500)

for eps in epsilons:
    dp_clf = dp.LogisticRegression(epsilon=eps, data_norm=100)
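# The sweep above is cut off mid-loop in this snippet. A typical continuation (an assumed
# sketch in the style of the diffprivlib logistic regression tutorial, not the original code)
# records the test accuracy per epsilon and plots the privacy/utility trade-off:
import matplotlib.pyplot as plt

for eps in epsilons:
    dp_clf = dp.LogisticRegression(epsilon=eps, data_norm=100)
    dp_clf.fit(X_train, y_train)
    accuracy.append(dp_clf.score(X_test, y_test))

plt.semilogx(epsilons, accuracy, label="Differentially private")
plt.semilogx(epsilons, [baseline] * len(epsilons), dashes=[2, 2], label="Non-private baseline")
plt.xlabel("epsilon")
plt.ylabel("Test accuracy")
plt.legend()
plt.show()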
min_fairness = 0.0
if most_uncertain_f['fairness_choice'][0]:
    min_fairness = most_uncertain_f['fairness_specified'][0]
min_robustness = 0.0
if most_uncertain_f['robustness_choice'][0]:
    min_robustness = most_uncertain_f['robustness_specified'][0]
max_number_features = X_train.shape[1]
if most_uncertain_f['k_choice'][0]:
    max_number_features = most_uncertain_f['k_specified'][0]

# Execute each search strategy with a given time limit (in parallel)
# maybe run multiple times to smooth stochasticity
model = LogisticRegression()
if most_uncertain_f['privacy_choice'][0]:
    model = models.LogisticRegression(epsilon=most_uncertain_f['privacy_specified'][0])

mp_global.clf = model

# define rankings
rankings = [variance, chi2_score_wo]  # simple rankings
rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000)))  # accuracy ranking
rankings.append(partial(robustness_score, model=model, scorer=auc_scorer))  # robustness ranking
rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids))  # fairness ranking
mp_global.y_train = y_train
mp_global.y_test = y_test
mp_global.names = []
mp_global.sensitive_ids = None
mp_global.cv_splitter = cv_splitter

min_accuracy = config['accuracy']
min_fairness = 0.0
min_robustness = config['robustness']
max_number_features = config['k']
max_search_time = time_limit

model = LogisticRegression()
if type(config['privacy']) != type(None):
    model = models.LogisticRegression(epsilon=config['privacy'])

mp_global.clf = model

# define rankings
rankings = [variance, chi2_score_wo, fcbf, my_fisher_score, mutual_info_classif, my_mcfs]
#rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
#rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
#rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
rankings.append(partial(model_score, estimator=ReliefF(n_neighbors=10)))  # relieff

mp_global.min_accuracy = min_accuracy
mp_global.min_fairness = min_fairness
def get_estimated_best_strategy(self, X_train, y_train, min_accuracy, sensitive_ids,
                                min_fairness, min_safety, privacy, max_complexity,
                                max_search_time, classifier):
    start_time = time.time()

    selection_strategies = {}
    rankings = {}

    ranking_list = [variance, chi2_score_wo, fcbf, my_fisher_score, mutual_info_classif, my_mcfs]
    ranking_list.append(partial(model_score, estimator=ReliefF(n_neighbors=10)))

    for my_strategy in range(1, 8):
        selection_strategies[my_strategy] = weighted_ranking
        rankings[my_strategy] = [ranking_list[my_strategy - 1]]

    main_strategies = [TPE, simulated_annealing, evolution, exhaustive,
                       forward_selection, backward_selection,
                       forward_floating_selection, backward_floating_selection,
                       recursive_feature_elimination, fullfeatures]

    for my_strategy in range(8, 18):
        selection_strategies[my_strategy] = main_strategies[my_strategy - 8]
        rankings[my_strategy] = None

    if isinstance(max_complexity, int):
        max_complexity = max_complexity / float(X_train.shape[1])

    auc_scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)
    cv_splitter = StratifiedKFold(5, random_state=42)

    X_train_tiny, _, y_train_tiny, _ = train_test_split(X_train, y_train,
                                                        train_size=100,
                                                        random_state=42,
                                                        stratify=y_train)

    cv_k = 1.0
    model = classifier
    if type(privacy) == type(None):
        privacy = X_train_tiny.shape[0]
    else:
        if isinstance(model, LogisticRegression):
            model = models.LogisticRegression(epsilon=privacy, class_weight='balanced')

    robust_scorer = make_scorer(robust_score,
                                greater_is_better=True,
                                X=X_train_tiny,
                                y=y_train_tiny,
                                model=model,
                                feature_selector=None,
                                scorer=auc_scorer)

    small_start_time = time.time()

    scoring_dict = {'AUC': auc_scorer, 'Robustness': robust_scorer}
    if type(sensitive_ids) != type(None):
        fair_train_tiny = make_scorer(true_positive_rate_score,
                                      greater_is_better=True,
                                      sensitive_data=X_train_tiny[:, sensitive_ids[0]])
        scoring_dict['Fairness'] = fair_train_tiny

    cv = GridSearchCV(model,
                      param_grid={'C': [1.0]},
                      scoring=scoring_dict,
                      refit=False,
                      cv=cv_splitter)
    cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
    cv_acc = cv.cv_results_['mean_test_AUC'][0]
    cv_fair = 0.0
    if type(sensitive_ids) != type(None):
        cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
    cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]
    cv_time = time.time() - small_start_time

    # cases if utility is defined
    min_fairness_new = min_fairness
    if min_fairness_new > 1.0:
        min_fairness_new = 1.0
    min_accuracy_new = min_accuracy
    if min_accuracy_new > 1.0:
        min_accuracy_new = 1.0
    min_safety_new = min_safety
    if min_safety_new > 1.0:
        min_safety_new = 1.0
    max_complexity_new = max_complexity
    if max_complexity_new < 0.0:
        max_complexity_new = 0.0

    # construct feature vector
    feature_list = []
    # user-specified constraints
    feature_list.append(min_accuracy_new)
    feature_list.append(min_fairness_new)
    feature_list.append(max_complexity_new)
    feature_list.append(max_complexity_new * X_train.shape[1])
    feature_list.append(min_safety_new)
    feature_list.append(privacy)
    feature_list.append(max_search_time)
    # differences to sample performance
    feature_list.append(cv_acc - min_accuracy_new)
    feature_list.append(cv_fair - min_fairness_new)
    feature_list.append(cv_k - max_complexity_new)
    feature_list.append((cv_k - max_complexity_new) * X_train.shape[1])
    feature_list.append(cv_robust - min_safety_new)
    feature_list.append(cv_time)
    # privacy constraint is always satisfied => difference always zero => constant => unnecessary
    # metadata features
    feature_list.append(X_train.shape[0])  # number rows
    feature_list.append(X_train.shape[1])  # number columns
    feature_list.append(isinstance(model, DecisionTreeClassifier))
    feature_list.append(isinstance(model, GaussianNB))
    feature_list.append(isinstance(model, LogisticRegression))

    self.features = np.array(feature_list).reshape(1, -1)

    self.predicted_probabilities = np.zeros(len(self.mappnames))
    self.best_model = None
    best_score = -1
    for my_strategy in range(len(self.mappnames)):
        self.predicted_probabilities[my_strategy] = self.models[my_strategy].predict_proba(self.features)[:, 1]
        if self.predicted_probabilities[my_strategy] > best_score:
            best_score = self.predicted_probabilities[my_strategy]
            self.best_model = self.models[my_strategy]

    best_id = np.argmax(self.predicted_probabilities)
    self.selection_function = selection_strategies[best_id + 1]
    self.ranking_functions = rankings[best_id + 1]

    print("Within " + str(time.time() - start_time) +
          " seconds, the Optimizer chose to run " + str(self.mappnames[best_id + 1]))
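# get_estimated_best_strategy ranks the candidate strategies by the success probability
# that a per-strategy binary meta-model predicts for the current constraint/metadata
# feature vector, then runs the argmax. A minimal sketch of that selection step with
# toy data (all models, names, and numbers here are illustrative):
import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
strategy_models = []
for _ in range(3):                       # one "will this strategy succeed?" model per strategy
    X_meta = rng.rand(50, 4)
    y_meta = rng.randint(0, 2, 50)
    strategy_models.append(RandomForestClassifier(n_estimators=10, random_state=0).fit(X_meta, y_meta))

features = rng.rand(1, 4)                # constraint + metadata features of the new task
probs = np.array([m.predict_proba(features)[:, 1][0] for m in strategy_models])
best_id = int(np.argmax(probs))
print("predicted success per strategy:", probs, "-> run strategy", best_id + 1)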
def objective(hps):
    print(hps)
    try:
        cv_k = 1.0
        cv_privacy = hps['privacy']
        model = LogisticRegression(class_weight='balanced')
        if type(cv_privacy) == type(None):
            cv_privacy = X_train_tiny.shape[0]
        else:
            model = models.LogisticRegression(epsilon=cv_privacy, class_weight='balanced')

        robust_scorer = make_scorer(robust_score,
                                    greater_is_better=True,
                                    X=X_train_tiny,
                                    y=y_train_tiny,
                                    model=model,
                                    feature_selector=None,
                                    scorer=auc_scorer)

        small_start_time = time.time()

        cv = GridSearchCV(model,
                          param_grid={'C': [1.0]},
                          scoring={'AUC': auc_scorer,
                                   'Fairness': fair_train_tiny,
                                   'Robustness': robust_scorer},
                          refit=False,
                          cv=cv_splitter)
        cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
        cv_acc = cv.cv_results_['mean_test_AUC'][0]
        cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
        cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]
        cv_time = time.time() - small_start_time

        # construct feature vector
        feature_list = []
        # user-specified constraints
        feature_list.append(hps['accuracy'])
        feature_list.append(hps['fairness'])
        feature_list.append(hps['k'])
        feature_list.append(hps['k'] * X_train.shape[1])
        feature_list.append(hps['robustness'])
        feature_list.append(cv_privacy)
        feature_list.append(hps['search_time'])
        # differences to sample performance
        feature_list.append(cv_acc - hps['accuracy'])
        feature_list.append(cv_fair - hps['fairness'])
        feature_list.append(cv_k - hps['k'])
        feature_list.append((cv_k - hps['k']) * X_train.shape[1])
        feature_list.append(cv_robust - hps['robustness'])
        feature_list.append(cv_time)
        # privacy constraint is always satisfied => difference always zero => constant => unnecessary
        # metadata features
        feature_list.append(X_train.shape[0])  # number rows
        feature_list.append(X_train.shape[1])  # number columns

        features = np.array(feature_list)

        # predict the best model and calculate uncertainty
        loss = 0

        return {'loss': loss,
                'status': STATUS_OK,
                'features': features,
                'search_time': hps['search_time'],
                'constraints': hps}
    except:
        return {'loss': np.inf, 'status': STATUS_OK}
def objective(hps):
    print(hps)

    cv_k = 1.0
    cv_privacy = hps['privacy']
    model = LogisticRegression()
    if type(cv_privacy) == type(None):
        cv_privacy = X_train_tiny.shape[0]
    else:
        model = models.LogisticRegression(epsilon=cv_privacy)

    robust_scorer = make_scorer(robust_score,
                                greater_is_better=True,
                                X=X_train_tiny,
                                y=y_train_tiny,
                                model=model,
                                feature_selector=None,
                                scorer=auc_scorer)

    cv = GridSearchCV(model,
                      param_grid={'C': [1.0]},
                      scoring={'AUC': auc_scorer,
                               'Fairness': fair_train_tiny,
                               'Robustness': robust_scorer},
                      refit=False,
                      cv=cv_splitter)
    cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
    cv_acc = cv.cv_results_['mean_test_AUC'][0]
    cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
    cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

    # construct feature vector
    feature_list = []
    # user-specified constraints
    feature_list.append(hps['accuracy'])
    feature_list.append(hps['fairness'])
    feature_list.append(hps['k'])
    feature_list.append(hps['robustness'])
    feature_list.append(cv_privacy)
    # differences to sample performance
    feature_list.append(cv_acc - hps['accuracy'])
    feature_list.append(cv_fair - hps['fairness'])
    feature_list.append(cv_k - hps['k'])
    feature_list.append(cv_robust - hps['robustness'])
    # privacy constraint is always satisfied => difference always zero => constant => unnecessary
    # metadata features
    #feature_list.append(X_train.shape[0])  # number rows
    #feature_list.append(X_train.shape[1])  # number columns

    features = np.array(feature_list)

    # predict the best model and calculate uncertainty
    loss = 0
    if hasattr(meta_classifier, 'estimators_'):
        predictions = []
        for tree in range(len(meta_classifier.estimators_)):
            predictions.append(meta_classifier.estimators_[tree].predict([features])[0])
        stddev = np.std(np.array(predictions), axis=0)
        print("hello2")
        print(stddev.shape)
        loss = np.sum(stddev**2) * -1

    return {'loss': loss, 'status': STATUS_OK, 'features': features}
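# The objectives above estimate uncertainty as the variance of the individual tree
# predictions of the RandomForestRegressor meta-model. A minimal stand-alone sketch of
# that per-tree spread computation (synthetic data, illustrative only):
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X_meta, y_meta = rng.rand(100, 5), rng.rand(100)
forest = RandomForestRegressor(n_estimators=50, random_state=0).fit(X_meta, y_meta)

query = rng.rand(1, 5)
per_tree = np.array([tree.predict(query)[0] for tree in forest.estimators_])
stddev = per_tree.std()
loss = -(stddev ** 2)  # more disagreement between trees => lower loss => more attractive to sample
print(stddev, loss)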
mp_global.X_train_val = X_train_val
mp_global.y_validation = y_val
mp_global.y_train_val = y_train_val

min_accuracy = config['accuracy']
min_fairness = 0.0
min_robustness = config['robustness']
max_number_features = config['k']
max_search_time = time_limit

model = None
if config['model'] == 'Logistic Regression':
    model = LogisticRegression(class_weight='balanced')
    if type(config['privacy']) != type(None):
        model = models.LogisticRegression(epsilon=config['privacy'], class_weight='balanced')
elif config['model'] == 'Gaussian Naive Bayes':
    model = GaussianNB()
    if type(config['privacy']) != type(None):
        model = models.GaussianNB(epsilon=config['privacy'])
elif config['model'] == 'Decision Tree':
    model = DecisionTreeClassifier(class_weight='balanced')
    if type(config['privacy']) != type(None):
        model = PrivateRandomForest(n_estimators=1, epsilon=config['privacy'])

mp_global.clf = model

# define rankings
rankings = [variance, chi2_score_wo, fcbf, my_fisher_score, mutual_info_classif, my_mcfs]
for min_accuracy in [0.5, 0.53, 0.56, 0.59, 0.62, 0.65, 0.68]:
    for privacy in [10.0, 7.0, 3.0, 1.0, 0.7, 0.3, 0.1, 0.07, 0.03]:
        success_per_strategy = np.zeros(18)
        time_per_strategy = np.zeros(18)

        for nruns_global in range(5):
            min_robustness = 0.0
            max_search_time = 20 * 60
            min_fairness = 0.0
            max_number_features = 1.0

            # Execute each search strategy with a given time limit (in parallel)
            # maybe run multiple times to smooth stochasticity
            model = models.LogisticRegression(epsilon=privacy, class_weight='balanced')

            mp_global.clf = model
            mp_global.model_hyperparameters = {'C': [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]}
            mp_global.model_hyperparameters['epsilon'] = [privacy]

            # define rankings
            rankings = [variance, chi2_score_wo, fcbf, my_fisher_score, mutual_info_classif, my_mcfs]
            #rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
            #rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
            #rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
            rankings.append(partial(model_score, estimator=ReliefF(n_neighbors=10)))  # relieff
for fname_i in range(len(all_names)):
    if all_names[fname_i].startswith('onehot__x' + str(cat_sensitive_attribute_id) + '_'):
        sensitive_ids.append(fname_i)

print(sensitive_ids)

le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

cv_splitter = StratifiedKFold(5, random_state=42)

model = models.LogisticRegression()

start_time = time.time()

evolution(X_train, X_test, y_train, y_test, names, sensitive_ids,
          ranking_functions=[chi2_score_wo],
          clf=model,
          min_accuracy=0.85,
          min_fairness=0.86,
          min_robustness=0.80,
          max_number_features=0.2,