cv_splitter = StratifiedKFold(5, random_state=42)
auc_scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

os.mkdir('/tmp/experiment' + str(current_run_time_id))

run_counter = 0
while True:
    # create folder to store files
    os.mkdir('/tmp/experiment' + str(current_run_time_id) + '/run' + str(run_counter))

    X_train, X_validation, X_test, y_train, y_validation, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(
        dataset_key='1590')

    # run on tiny sample
    if X_train.shape[0] > 100:
        X_train_tiny, _, y_train_tiny, _ = train_test_split(X_train, y_train,
                                                            train_size=100,
                                                            random_state=42,
                                                            stratify=y_train)
    else:
        X_train_tiny = X_train
        y_train_tiny = y_train

    fair_train_tiny = make_scorer(true_positive_rate_score,
                                  greater_is_better=True,
                                  needs_threshold=True)
# (tail of the per-strategy worker function; its opening call is not shown in this excerpt --
#  the call above writes the strategy's results to '.../run<N>/strategy<M>.pickle' in the run folder)
        str(run_counter) + '/strategy' + str(conf['strategy_id']) + '.pickle',
        accuracy_scorer=mp_global.accuracy_scorer,
        model_hyperparameters=mp_global.model_hyperparameters)
    result['strategy_id'] = conf['strategy_id']
    return result


current_run_time_id = time.time()
time_limit = 60 * 60 * 3
number_of_runs = 1

run_counter = 0
while True:
    X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation()
    #X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id, is_regression = get_fair_data1_validation_openml()
    is_regression = False

    # run on tiny sample
    if X_train.shape[0] > 100:
        if is_regression:
            X_train_tiny, _, y_train_tiny, _ = train_test_split(
                X_train, y_train, train_size=100, random_state=42)
        else:
            X_train_tiny, _, y_train_tiny, _ = train_test_split(
                X_train, y_train, train_size=100, random_state=42, stratify=y_train)
    mp_global.X_train = []
    mp_global.X_validation = []
    mp_global.X_train_val = []
    mp_global.X_test = []
    mp_global.y_train = []
    mp_global.y_validation = []
    mp_global.y_train_val = []
    mp_global.y_test = []
    mp_global.names = []
    mp_global.sensitive_ids = []
    mp_global.cv_splitter = []

    for nruns in range(5):
        X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(
            dataset_key='1590', random_number=42 + nruns)

        mp_global.X_train.append(X_train)
        mp_global.X_validation.append(X_validation)
        mp_global.X_train_val.append(X_train_val)
        mp_global.X_test.append(X_test)
        mp_global.y_train.append(y_train)
        mp_global.y_validation.append(y_validation)
        mp_global.y_train_val.append(y_train_val)
        mp_global.y_test.append(y_test)
        mp_global.names.append(names)
        mp_global.sensitive_ids.append(sensitive_ids)
        mp_global.cv_splitter.append(cv_splitter)

    mp_global.accuracy_scorer = make_scorer(f1_score)
    mp_global.avoid_robustness = False
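    # The lists above hold one train/validation/test split per seed (42..46). How they are
    # consumed is not shown in this excerpt; presumably the per-strategy workers index them
    # by run id, e.g. X_train = mp_global.X_train[conf['run_id']] (an assumption for
    # illustration, not part of the original script).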
def uncertainty_sampling(training_dataset_ids, all_current_models):
    time_limit = 30 * 60  # 60 * 60 * 3

    training_dataset_ids = training_dataset_ids.tolist()
    if '1240' in training_dataset_ids:
        training_dataset_ids.remove('1240')
    if '42132' in training_dataset_ids:
        training_dataset_ids.remove('42132')

    def maximize_uncertainty(hps):
        print(hps)

        X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(
            dataset_key=hps['data'])
        is_regression = False

        # run on tiny sample
        if X_train.shape[0] > 100:
            if is_regression:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(
                    X_train, y_train, train_size=100, random_state=42)
            else:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(
                    X_train, y_train, train_size=100, random_state=42, stratify=y_train)
        else:
            X_train_tiny = X_train
            y_train_tiny = y_train

        print(X_train.shape)

        if type(sensitive_ids) != type(None):
            fair_train_tiny = make_scorer(true_positive_rate_score,
                                          greater_is_better=True,
                                          sensitive_data=X_train_tiny[:, sensitive_ids[0]])

        mp_global.X_train = X_train
        mp_global.X_validation = X_validation
        mp_global.X_train_val = X_train_val
        mp_global.X_test = X_test
        mp_global.y_train = y_train
        mp_global.y_validation = y_validation
        mp_global.y_train_val = y_train_val
        mp_global.y_test = y_test
        mp_global.names = names
        mp_global.sensitive_ids = sensitive_ids
        mp_global.cv_splitter = StratifiedKFold(5, random_state=42)
        mp_global.accuracy_scorer = make_scorer(f1_score)
        mp_global.avoid_robustness = False

        cv_k = 1.0
        cv_privacy = hps['privacy']

        model = LogisticRegression(class_weight='balanced')
        if type(cv_privacy) != type(None):
            model = models.LogisticRegression(epsilon=cv_privacy, class_weight='balanced')
        if type(cv_privacy) == type(None):
            cv_privacy = X_train_tiny.shape[0]

        robust_scorer = make_scorer(robust_score,
                                    greater_is_better=True,
                                    X=X_train_tiny,
                                    y=y_train_tiny,
                                    model=model,
                                    feature_selector=None,
                                    scorer=mp_global.accuracy_scorer)

        small_start_time = time.time()

        scoring = {'AUC': mp_global.accuracy_scorer}
        if not mp_global.avoid_robustness:
            scoring['Robustness'] = robust_scorer
        if type(sensitive_ids) != type(None):
            scoring['Fairness'] = fair_train_tiny

        cv = GridSearchCV(model, param_grid={}, scoring=scoring, refit=False,
                          cv=mp_global.cv_splitter)
        cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))

        cv_acc = cv.cv_results_['mean_test_AUC'][0]
        cv_fair = 0.0
        if type(sensitive_ids) != type(None):
            cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]
        cv_robust = 0.0
        if not mp_global.avoid_robustness:
            cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]
        cv_time = time.time() - small_start_time

        # construct feature vector
        feature_list = []
        # user-specified constraints
        feature_list.append(hps['accuracy'])
        feature_list.append(hps['fairness'])
        feature_list.append(hps['k'])
        feature_list.append(hps['k'] * X_train.shape[1])
        feature_list.append(hps['robustness'])
        feature_list.append(cv_privacy)
        feature_list.append(hps['search_time'])
        # differences to sample performance
        feature_list.append(cv_acc - hps['accuracy'])
        feature_list.append(cv_fair - hps['fairness'])
        feature_list.append(cv_k - hps['k'])
        feature_list.append((cv_k - hps['k']) * X_train.shape[1])
        feature_list.append(cv_robust - hps['robustness'])
        feature_list.append(cv_time)
        # privacy constraint is always satisfied => difference always zero => constant => unnecessary
        # metadata features
        feature_list.append(X_train.shape[0])  # number rows
        feature_list.append(X_train.shape[1])  # number columns
        # models
        feature_list.append(hps['model'] == 'Decision Tree')
        feature_list.append(hps['model'] == 'Gaussian Naive Bayes')
        feature_list.append(hps['model'] == 'Logistic Regression')

        features = np.array(feature_list).reshape(1, -1)

        # predict the best model and calculate uncertainty
        print(features)

        # now predict with models
        aggregated_certainty = 0
        print("uncertainty")
        for model_i in range(len(all_current_models)):
            certainty = np.abs(
                all_current_models[model_i].predict_proba(features)[0, 0] - 0.5)
            aggregated_certainty += certainty
        print('Certainty: ' + str(aggregated_certainty))

        return {'loss': aggregated_certainty,
                'status': STATUS_OK,
                'features': features,
                'search_time': hps['search_time'],
                'constraints': hps}
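    # Note (added for clarity): |predict_proba(...)[0, 0] - 0.5| is 0 when a meta-model is
    # maximally unsure and 0.5 when it is fully confident, so summing it over the committee
    # and minimizing it as the hyperopt loss is a query-by-committee style acquisition:
    # the search is steered towards constraint settings the current meta-models cannot
    # predict yet. For example, committee probabilities [0.52, 0.48] give a loss of
    # 0.02 + 0.02 = 0.04 (very uncertain, attractive), while [0.95, 0.93] give
    # 0.45 + 0.43 = 0.88 (confident, avoided).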
    space = {
        'data': hp.choice('data_choice', training_dataset_ids),
        'model': hp.choice('model_choice',
                           ['Logistic Regression',
                            'Gaussian Naive Bayes',
                            'Decision Tree'
                            # , 'Random Forest'
                            ]),
        'k': hp.choice('k_choice', [(1.0), (hp.uniform('k_specified', 0, 1))]),
        'accuracy': hp.uniform('accuracy_specified', 0.5, 1),
        'fairness': hp.choice('fairness_choice',
                              [(0.0), (hp.uniform('fairness_specified', 0.8, 1))]),
        'privacy': hp.choice('privacy_choice',
                             [(None), (hp.lognormal('privacy_specified', 0, 1))]),
        'robustness': hp.choice('robustness_choice',
                                [(0.0), (hp.uniform('robustness_specified', 0.8, 1))]),
        'search_time': hp.uniform('search_time_specified', 10, time_limit),  # in seconds
    }

    trials = Trials()
    fmin(maximize_uncertainty,
         space=space,
         algo=tpe.suggest,
         max_evals=100,
         trials=trials,
         show_progressbar=True)

    ### now run the most uncertain trial
    number_of_runs = 1

    # break once convergence tolerance is reached and generate a new dataset
    last_trial = trials.best_trial
    most_uncertain_f = last_trial['misc']['vals']
    # print(most_uncertain_f)

    ## specify data
    run_counter = 0
    current_run_time_id = time.time()

    path = pathlib.Path('/tmp/experiment_uncertainty/run' + str(0))
    path.mkdir(parents=True, exist_ok=True)

    selected_dataset_id = training_dataset_ids[most_uncertain_f['data_choice'][0]]
    X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(
        dataset_key=selected_dataset_id)
    is_regression = False

    mp_global.X_train = X_train
    mp_global.X_validation = X_validation
    mp_global.X_train_val = X_train_val
    mp_global.X_test = X_test
    mp_global.y_train = y_train
    mp_global.y_validation = y_validation
    mp_global.y_train_val = y_train_val
    mp_global.y_test = y_test
    mp_global.names = names
    mp_global.sensitive_ids = sensitive_ids
    if is_regression:
        mp_global.cv_splitter = KFold(5, random_state=42)
        mp_global.accuracy_scorer = make_scorer(r2_score)
    else:
        mp_global.cv_splitter = StratifiedKFold(5, random_state=42)
        mp_global.accuracy_scorer = make_scorer(f1_score)
    mp_global.avoid_robustness = False
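    # Note on the decoding below: for hp.choice parameters, trials.best_trial stores the
    # *index* of the drawn option under '<name>_choice', so an index of 0 means the constant
    # default branch (fairness 0.0, robustness 0.0, privacy None, k 1.0) and an index of 1
    # means a concrete '<name>_specified' value was sampled and is available.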
    min_accuracy = most_uncertain_f['accuracy_specified'][0]
    min_fairness = 0.0
    if most_uncertain_f['fairness_choice'][0]:
        min_fairness = most_uncertain_f['fairness_specified'][0]
    min_robustness = 0.0
    if most_uncertain_f['robustness_choice'][0]:
        min_robustness = most_uncertain_f['robustness_specified'][0]
    max_number_features = 1.0
    if most_uncertain_f['k_choice'][0]:
        max_number_features = most_uncertain_f['k_specified'][0]
    max_search_time = most_uncertain_f['search_time_specified'][0]

    # Execute each search strategy with a given time limit (in parallel)
    # maybe run multiple times to smooth stochasticity
    model = None
    print(most_uncertain_f)
    if most_uncertain_f['model_choice'][0] == 0:
        model = LogisticRegression(class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = models.LogisticRegression(
                epsilon=most_uncertain_f['privacy_specified'][0],
                class_weight='balanced')
    elif most_uncertain_f['model_choice'][0] == 1:
        model = GaussianNB()
        if most_uncertain_f['privacy_choice'][0]:
            model = models.GaussianNB(
                epsilon=most_uncertain_f['privacy_specified'][0])
    elif most_uncertain_f['model_choice'][0] == 2:
        model = DecisionTreeClassifier(class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = PrivateRandomForest(
                n_estimators=1,
                epsilon=most_uncertain_f['privacy_specified'][0])
    elif most_uncertain_f['model_choice'][0] == 3:
        model = RandomForestClassifier(n_estimators=100, class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = PrivateRandomForest(
                n_estimators=100,
                epsilon=most_uncertain_f['privacy_specified'][0])

    print(model)
    mp_global.clf = model

    # define rankings
    rankings = [variance, chi2_score_wo, fcbf, my_fisher_score, mutual_info_classif, my_mcfs]
    # rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000)))  # accuracy ranking
    # rankings.append(partial(robustness_score, model=model, scorer=auc_scorer))  # robustness ranking
    # rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids))  # fairness ranking
    rankings.append(partial(model_score, estimator=ReliefF(n_neighbors=10)))  # relieff

    mp_global.min_accuracy = min_accuracy
    mp_global.min_fairness = min_fairness
    mp_global.min_robustness = min_robustness
    mp_global.max_number_features = max_number_features
    mp_global.max_search_time = max_search_time

    mp_global.configurations = []

    # add single rankings
    strategy_id = 1
    for r in range(len(rankings)):
        for run in range(number_of_runs):
            configuration = {}
            configuration['ranking_functions'] = copy.deepcopy([rankings[r]])
            configuration['run_id'] = copy.deepcopy(run)
            configuration['main_strategy'] = copy.deepcopy(weighted_ranking)
            configuration['strategy_id'] = copy.deepcopy(strategy_id)
            mp_global.configurations.append(configuration)
        strategy_id += 1

    main_strategies = [TPE, simulated_annealing, evolution, exhaustive,
                       forward_selection, backward_selection,
                       forward_floating_selection, backward_floating_selection,
                       recursive_feature_elimination, fullfeatures]

    # run main strategies
    for strategy in main_strategies:
        for run in range(number_of_runs):
            configuration = {}
            configuration['ranking_functions'] = []
            configuration['run_id'] = copy.deepcopy(run)
            configuration['main_strategy'] = copy.deepcopy(strategy)
            configuration['strategy_id'] = copy.deepcopy(strategy_id)
            mp_global.configurations.append(configuration)
        strategy_id += 1
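    # The parallel execution below uses pebble's ProcessPool: pool.map() applies the per-item
    # timeout (here max_search_time) to every configuration individually, and iterating
    # future.result() re-raises TimeoutError / ProcessExpired only for the failing task, so
    # the loop can keep consuming results from the remaining strategies.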
    # 6#17
    with ProcessPool(max_workers=17) as pool:
        future = pool.map(my_function,
                          range(len(mp_global.configurations)),
                          timeout=max_search_time)

        iterator = future.result()
        while True:
            try:
                result = next(iterator)
            except StopIteration:
                break
            except TimeoutError as error:
                print("function took longer than %d seconds" % error.args[1])
            except ProcessExpired as error:
                print("%s. Exit code: %d" % (error, error.exitcode))
            except Exception as error:
                print("function raised %s" % error)
                print(error.traceback)  # Python's traceback of remote process

    # check which strategies were successful
    mappnames = {1: 'TPE(Variance)',
                 2: 'TPE($\chi^2$)',
                 3: 'TPE(FCBF)',
                 4: 'TPE(Fisher)',
                 5: 'TPE(MIM)',
                 6: 'TPE(MCFS)',
                 7: 'TPE(ReliefF)',
                 8: 'TPE(NR)',
                 9: 'SA(NR)',
                 10: 'NSGA-II(NR)',
                 11: 'ES(NR)',
                 12: 'SFS(NR)',
                 13: 'SBS(NR)',
                 14: 'SFFS(NR)',
                 15: 'SBFS(NR)',
                 16: 'RFE(LR)',
                 17: 'Complete Set'}

    def load_pickle(fname):
        data = []
        with open(fname, "rb") as f:
            while True:
                try:
                    data.append(pickle.load(f))
                except EOFError:
                    break
        return data

    def is_successfull_validation_and_test(exp_results):
        return len(exp_results) > 0 and 'success_test' in exp_results[-1] and exp_results[-1]['success_test'] == True  # also satisfied on test

    def is_successfull_validation(exp_results):
        return len(exp_results) > 0 and 'Validation_Satisfied' in exp_results[-1]  # constraints were satisfied on validation set

    run_strategies_success_test = {}
    run_strategies_times = {}
    run_strategies_success_validation = {}

    rfolder = '/tmp/experiment_uncertainty/run' + str(0) + '/'

    validation_satisfied_by_any_strategy = False

    min_time = np.inf
    best_strategy = 0
    for s in range(1, len(mappnames) + 1):
        exp_results = []
        try:
            exp_results = load_pickle(rfolder + 'strategy' + str(s) + '.pickle')
        except:
            pass
        if is_successfull_validation_and_test(exp_results):
            runtime = exp_results[-1]['final_time']
            if runtime < min_time:
                min_time = runtime
                best_strategy = s
            run_strategies_success_test[s] = True
            run_strategies_times[s] = runtime
        else:
            run_strategies_success_test[s] = False
        run_strategies_success_validation[s] = is_successfull_validation(exp_results)
        if run_strategies_success_validation[s]:
            validation_satisfied_by_any_strategy = True

    strategy_success = np.zeros((1, len(mappnames)))
    for c_i in range(len(mappnames)):
        strategy_success[0, c_i] = run_strategies_success_test[c_i + 1]

    return last_trial['result']['features'], strategy_success
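# Hypothetical usage sketch (not part of the original script): one way uncertainty_sampling()
# could sit inside an active meta-learning loop, assuming meta-feature rows and per-strategy
# success labels are accumulated across iterations and one binary meta-model per strategy is
# refit before each call. All names inside this sketch are illustrative assumptions;
# training_dataset_ids is expected to be a NumPy array of dataset ids, as above.
def example_active_metalearning_loop(training_dataset_ids, n_strategies=17, n_iterations=20):
    from sklearn.ensemble import RandomForestClassifier

    X_meta = []       # one meta-feature row per executed experiment
    y_meta = []       # one 0/1 success vector per experiment (length == n_strategies)
    meta_models = []  # committee of fitted meta-models; empty on the first call

    for _ in range(n_iterations):
        # query and run the configuration the current committee is least certain about
        features, strategy_success = uncertainty_sampling(training_dataset_ids, meta_models)
        X_meta.append(features[0])
        y_meta.append(strategy_success[0])

        # rebuild one binary meta-model per strategy from all observations collected so far
        meta_models = []
        for s in range(n_strategies):
            labels = [row[s] for row in y_meta]
            if len(set(labels)) > 1:  # fit only once both success and failure were observed
                clf = RandomForestClassifier(n_estimators=100)
                clf.fit(np.array(X_meta), np.array(labels))
                meta_models.append(clf)

    return meta_models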