def execute_feature_combo1(feature_combo, feature_combo_id=0, params=PARAMS):
    # Build a boolean mask that activates exactly the features in feature_combo.
    mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
    for fc in feature_combo:
        mask[fc] = True
    mp_globalsfs.mask = mask

    # For each hyperparameter candidate, evaluate all CV folds in parallel;
    # run_fold builds its own pipeline from the shared state in mp_globalsfs.
    hyperparameter_search_scores = []
    for c in params:
        mp_globalsfs.parameter = c
        with Pool(processes=multiprocessing.cpu_count()) as p:
            cv_scores = list(
                tqdm.tqdm(p.imap(run_fold, range(len(mp_globalsfs.data_per_fold))),
                          total=len(mp_globalsfs.data_per_fold)))
        hyperparameter_search_scores.append(np.mean(cv_scores))

    return (feature_combo_id,
            np.max(hyperparameter_search_scores),
            params[np.argmax(hyperparameter_search_scores)])
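# MaskSelection is used throughout this section but not defined here. The
# following is a minimal sketch of what such a transformer could look like,
# assuming it simply keeps the columns where the boolean mask is True; the
# actual implementation may differ.
from sklearn.base import BaseEstimator, TransformerMixin

class MaskSelection(BaseEstimator, TransformerMixin):
    """Selects the feature columns enabled in a boolean mask."""

    def __init__(self, mask):
        self.mask = mask

    def fit(self, X, y=None):
        # Nothing to learn; the mask is fixed at construction time.
        return self

    def transform(self, X):
        return X[:, self.mask]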
def f_clf1(mask):
    # Assemble the pipeline: scale features, apply the feature mask, then classify.
    model = Pipeline([('scale', MinMaxScaler()),
                      ('selection', MaskSelection(mask)),
                      ('clf', LogisticRegression())])
    return model
def objective(features):
    model = Pipeline([('selection', MaskSelection(features)),
                      ('clf', LogisticRegression())])

    robust_scorer = make_scorer(robust_score, greater_is_better=True,
                                X=X_train, y=y_train,
                                feature_selector=model.named_steps['selection'])
    robust_scorer_test = make_scorer(robust_score_test, greater_is_better=True,
                                     X_train=X_train, y_train=y_train,
                                     X_test=X_test, y_test=y_test,
                                     feature_selector=model.named_steps['selection'])

    # Cross-validated accuracy, fairness, and robustness on the training data.
    # StratifiedKFold requires shuffle=True when random_state is set.
    ncv = 5
    cv = StratifiedKFold(ncv, shuffle=True, random_state=42)
    cv_acc = np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train),
                                     cv=cv, scoring=auc_scorer))
    cv_fair = 1.0 - np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train),
                                            cv=cv, scoring=fair_train))
    cv_robust = 1.0 - np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train),
                                              cv=cv, scoring=robust_scorer))
    print('cv acc: ' + str(cv_acc) + ' cv fair: ' + str(cv_fair) + ' cv robust: ' + str(cv_robust))

    # Evaluate the same three objectives on the held-out test set.
    model.fit(X_train, pd.DataFrame(y_train))
    test_acc = auc_scorer(model, X_test, pd.DataFrame(y_test))
    test_fair = 1.0 - fair_test(model, X_test, pd.DataFrame(y_test))
    test_robust = 1.0 - robust_scorer_test(model, X_test, pd.DataFrame(y_test))
    simplicity = -1 * np.sum(features)

    # Book-keeping for the experiment log.
    my_global_variable.satisfied_constraints.append(
        [min(test_acc, cv_acc), min(test_fair, cv_fair)])
    my_global_variable.times.append(time.time() - start_time)
    my_global_variable.iterations.append(my_global_variable.current_iteration)
    my_global_variable.current_iteration += 1

    return [cv_acc, cv_fair, cv_robust, simplicity]
def f_clf1(hps):
    # Decode the hyperparameter dict (keys 'f_<index>') into a boolean feature mask.
    mask = np.zeros(len(hps), dtype=bool)
    for k, v in hps.items():
        mask[int(k.split('_')[1])] = v

    # Write the mask back into the hyperparameter dict.
    for mask_i in range(len(mask)):
        hps['f_' + str(mask_i)] = mask[mask_i]

    model = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])
    return model, hps
def run_fold(cv):
    # Worker executed in a child process: reads the mask and hyperparameter
    # from the shared module-level state and evaluates one CV fold.
    mask = mp_globalsfs.mask
    c = mp_globalsfs.parameter
    pipeline = Pipeline([('imputation', SimpleImputer()),
                         ('selection', MaskSelection(mask)),
                         get_model(c)])
    X_train, y_train, X_test, y_test = mp_globalsfs.data_per_fold[cv]
    pipeline.fit(X_train, pd.DataFrame(y_train))
    return auc_scorer(pipeline, X_test, y_test)
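# run_fold reads its inputs from module-level state in mp_globalsfs rather
# than taking them as arguments, so forked worker processes inherit the data
# instead of pickling it per task. A minimal sketch of that module, inferred
# from the usages in this section (attribute names match the code above; the
# initial values are assumptions):

# --- mp_globalsfs.py ---
data_per_fold = []    # list of (X_train, y_train, X_test, y_test) per CV fold
feature_combos = []   # feature combinations evaluated in the current step
mask = None           # boolean feature mask for the current evaluation
parameter = None      # hyperparameter setting for the current evaluation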
def get_test_auc(feature_combo, X_train_transformed, y_train, X_test_transformed, y_test, hyperparam):
    mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
    for fc in feature_combo:
        mask[fc] = True
    pipeline = Pipeline([('selection', MaskSelection(mask)),
                         get_model_test(hyperparam)])
    pipeline.fit(X_train_transformed, y_train)
    return auc_scorer(pipeline, X_test_transformed, y_test)
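# get_model(...) and get_model_test(...) are assumed to return a named
# pipeline step, i.e. a ('clf', estimator) tuple, since their results are
# placed directly into Pipeline step lists, and the estimator must expose
# coef_ (see get_test_auc_and_coeff below). A hedged sketch for a logistic
# regression whose regularization strength C is the searched hyperparameter
# (the actual model family is not shown in this section):
def get_model(c):
    return ('clf', LogisticRegression(C=c, max_iter=10000))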
def f_to_min1():
    # Start from the full feature set and delegate the search to grid search.
    mask = np.array([True for f_i in range(X_train.shape[1])])
    pipeline = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])
    grid_result = run_grid_search(pipeline, X_train, y_train,
                                  X_validation, y_validation,
                                  accuracy_scorer, sensitive_ids,
                                  min_fairness, min_accuracy, min_robustness,
                                  max_number_features,
                                  model_hyperparameters=model_hyperparameters,
                                  start_time=start_time)
    return grid_result
def get_test_auc_and_coeff(feature_combo, X_train_transformed, y_train, X_test_transformed, y_test, hyperparam):
    mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
    for fc in feature_combo:
        mask[fc] = True
    pipeline = Pipeline([('imputation', SimpleImputer()),
                         ('selection', MaskSelection(mask)),
                         get_model(hyperparam)])
    pipeline.fit(X_train_transformed, y_train)
    # Return the test AUC together with the fitted classifier's coefficients.
    return auc_scorer(pipeline, X_test_transformed, y_test), pipeline.named_steps['clf'].coef_[0]
def get_test_auc(feature_combo, X_train_transformed, y_train, X_test_transformed, y_test, hyperparam):
    mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
    for fc in feature_combo:
        mask[fc] = True
    pipeline = Pipeline([('imputation', SimpleImputer()),
                         ('selection', MaskSelection(mask)),
                         get_model(hyperparam)])
    pipeline.fit(X_train_transformed, y_train)

    # Persist the fitted pipeline for later inspection.
    with open("/tmp/pipeline.p", "wb") as f:
        pickle.dump(pipeline, f)
    return auc_scorer(pipeline, X_test_transformed, y_test)
def f_to_min1():
    # Baseline evaluation with all features selected.
    mask = np.array([True for f_i in range(X_train.shape[1])])
    pipeline = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])
    pipeline.fit(X_train, y_train)

    validation_number_features = 1.0
    validation_acc = auc_scorer(pipeline, X_validation, pd.DataFrame(y_validation))

    validation_fair = 0.0
    if sensitive_ids is not None and min_fairness > 0.0:
        validation_fair = 1.0 - fair_validation(pipeline, X_validation, pd.DataFrame(y_validation))

    validation_robust = 0.0
    if min_robustness > 0.0:
        validation_robust = 1.0 - robust_score_test(
            eps=0.1, X_test=X_validation, y_test=y_validation,
            model=pipeline.named_steps['clf'],
            feature_selector=pipeline.named_steps['selection'],
            scorer=auc_scorer)

    # Loss is the sum of squared violations of the user-specified constraints.
    loss = 0.0
    if min_fairness > 0.0 and validation_fair < min_fairness:
        loss += (min_fairness - validation_fair) ** 2
    if min_accuracy > 0.0 and validation_acc < min_accuracy:
        loss += (min_accuracy - validation_acc) ** 2
    if min_robustness > 0.0 and validation_robust < min_robustness:
        loss += (min_robustness - validation_robust) ** 2
    print(loss)

    current_time = time.time() - start_time
    return {'loss': loss,
            'model': pipeline,
            'cv_fair': validation_fair,
            'cv_acc': validation_acc,
            'cv_robust': validation_robust,
            'cv_number_features': validation_number_features,
            'time': current_time}
def f_clf1(hps):
    # Decode the hyperparameter dict (keys 'f_<index>') into a boolean feature mask.
    mask = np.zeros(len(hps), dtype=bool)
    for k, v in hps.items():
        mask[int(k.split('_')[1])] = v

    # Repair the number of features if necessary: randomly deactivate
    # features until at most max_k remain active.
    max_k = max(int(max_number_features * X_train.shape[1]), 1)
    if np.sum(mask) > max_k:
        id_features_used = np.nonzero(mask)[0]  # indices of active features
        np.random.shuffle(id_features_used)
        for item_to_remove in id_features_used[max_k:]:
            mask[item_to_remove] = False

    # Write the repaired mask back into the hyperparameter dict.
    for mask_i in range(len(mask)):
        hps['f_' + str(mask_i)] = mask[mask_i]

    pipeline = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])
    return pipeline, hps
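# Example of how this f_clf1 consumes a hyperparameter dict: keys are
# 'f_<index>' and values are booleans. Assuming max_number_features and
# X_train are in scope, with max_number_features * X_train.shape[1] = 2 the
# repair step deactivates one of the three active features at random
# (illustrative values only):
hps = {'f_0': True, 'f_1': True, 'f_2': True, 'f_3': False}
pipeline, repaired_hps = f_clf1(hps)
print(repaired_hps)  # e.g. {'f_0': True, 'f_1': False, 'f_2': True, 'f_3': False}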
def execute_feature_combo1(feature_combo, feature_combo_id=0, params=PARAMS):
    # Serial variant: evaluate every hyperparameter on every CV fold in-process.
    mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
    for fc in feature_combo:
        mask[fc] = True

    hyperparameter_search_scores = []
    for c in params:
        pipeline = Pipeline([('imputation', SimpleImputer()),
                             ('selection', MaskSelection(mask)),
                             get_model(c)])
        cv_scores = []
        for cv in range(len(mp_globalsfs.data_per_fold)):
            X_train, y_train, X_test, y_test = mp_globalsfs.data_per_fold[cv]
            pipeline.fit(X_train, pd.DataFrame(y_train))
            cv_scores.append(auc_scorer(pipeline, X_test, y_test))
        hyperparameter_search_scores.append(np.mean(cv_scores))

    return (feature_combo_id,
            np.max(hyperparameter_search_scores),
            params[np.argmax(hyperparameter_search_scores)])
def objective(features):
    model = Pipeline([('selection', MaskSelection(features)),
                      ('clf', LogisticRegression())])

    robust_scorer = make_scorer(robust_score, greater_is_better=True,
                                X=X_train, y=y_train,
                                feature_selector=model.named_steps['selection'])
    robust_scorer_test = make_scorer(robust_score_test, greater_is_better=True,
                                     X_train=X_train, y_train=y_train,
                                     X_test=X_test, y_test=y_test,
                                     feature_selector=model.named_steps['selection'])

    # Cross-validated accuracy, fairness, and robustness on the training data.
    # StratifiedKFold requires shuffle=True when random_state is set.
    ncv = 5
    cv = StratifiedKFold(ncv, shuffle=True, random_state=42)
    cv_acc = np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train),
                                     cv=cv, scoring=auc_scorer))
    cv_fair = 1.0 - np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train),
                                            cv=cv, scoring=fair_train))
    cv_robust = 1.0 - np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train),
                                              cv=cv, scoring=robust_scorer))
    print('cv acc: ' + str(cv_acc) + ' cv fair: ' + str(cv_fair) + ' cv robust: ' + str(cv_robust))

    # Only if all constraints hold on the cross-validation estimates,
    # check them again on the held-out test set.
    if cv_acc > min_accuracy and cv_fair > min_fairness and cv_robust > min_robustness:
        model.fit(X_train, pd.DataFrame(y_train))
        test_acc = auc_scorer(model, X_test, pd.DataFrame(y_test))
        test_fair = 1.0 - fair_test(model, X_test, pd.DataFrame(y_test))
        test_robust = 1.0 - robust_scorer_test(model, X_test, pd.DataFrame(y_test))
        print('acc: ' + str(test_acc) + ' fair: ' + str(test_fair) + ' robust: ' + str(test_robust))
        if test_acc > min_accuracy and test_fair > min_fairness and test_robust > min_robustness:
            my_global_variable.global_check = True
            print("selected features: " + str(np.array(names)[features]))

    simplicity = -1 * np.sum(features)
    return [cv_acc, cv_fair, cv_robust, simplicity]
def objective(hps):
    # Decode the hyperparameter dict (keys 'f_<index>') into a boolean mask.
    mask = np.zeros(len(hps), dtype=bool)
    for k, v in hps.items():
        mask[int(k.split('_')[1])] = v

    model = Pipeline([('selection', MaskSelection(mask)),
                      ('clf', LogisticRegression())])

    robust_scorer = make_scorer(robust_score, greater_is_better=True,
                                X=X_train, y=y_train,
                                feature_selector=model.named_steps['selection'])

    # Cross-validated accuracy, fairness, and robustness on the training data.
    # StratifiedKFold requires shuffle=True when random_state is set.
    ncv = 5
    cv = StratifiedKFold(ncv, shuffle=True, random_state=42)
    cv_acc = np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train),
                                     cv=cv, scoring=auc_scorer))
    cv_fair = 1.0 - np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train),
                                            cv=cv, scoring=fair_train))
    cv_robust = 1.0 - np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train),
                                              cv=cv, scoring=robust_scorer))
    print('cv acc: ' + str(cv_acc) + ' cv fair: ' + str(cv_fair) + ' cv robust: ' + str(cv_robust))

    cv_model_gen = 1.0
    loss = 0.0
    if (cv_acc >= min_accuracy and cv_fair >= min_fairness
            and cv_robust >= min_robustness and cv_model_gen >= min_avg_model_accuracy):
        # All constraints satisfied: reward slack beyond the thresholds
        # (each term is non-positive, so better solutions get lower loss).
        loss = (min_accuracy - cv_acc) + (min_fairness - cv_fair) + \
               (min_robustness - cv_robust) + (min_avg_model_accuracy - cv_model_gen)
    else:
        # Penalize the squared violation of every unmet constraint.
        if cv_fair < min_fairness:
            loss += (min_fairness - cv_fair) ** 2
        if cv_acc < min_accuracy:
            loss += (min_accuracy - cv_acc) ** 2
        if cv_robust < min_robustness:
            loss += (min_robustness - cv_robust) ** 2
        if cv_model_gen < min_avg_model_accuracy:
            loss += (min_avg_model_accuracy - cv_model_gen) ** 2

    return {'loss': loss, 'status': STATUS_OK, 'model': model,
            'cv_fair': cv_fair, 'cv_acc': cv_acc}
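# The STATUS_OK return value indicates this objective is driven by hyperopt.
# A minimal sketch of how it could be invoked, assuming one boolean choice
# per feature column; the space definition and max_evals are illustrative,
# not taken from the original code:
from hyperopt import fmin, tpe, hp, Trials

space = {'f_' + str(i): hp.choice('f_' + str(i), [False, True])
         for i in range(X_train.shape[1])}
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=100, trials=trials)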
def parallel(X_train, y_train, X_test=None, y_test=None, floating=True,
             max_number_features=10, feature_generator=None, folds=3, number_cvs=1):
    # Fit the feature-generation pipeline on the full training data once, so
    # we can report the test AUC of intermediate feature sets along the way.
    pipeline_train = copy.deepcopy(feature_generator.pipeline_)
    X_train_transformed = pipeline_train.fit_transform(X_train)
    X_test_transformed = pipeline_train.transform(X_test)

    best_score = -1
    best_feature_combination = None
    featurenames = [str(myf) for myf in feature_generator.numeric_features]

    # Precompute the transformed data for every CV fold so that the worker
    # processes only have to fit the classifier.
    mp_globalsfs.data_per_fold = []
    for ncvs in range(number_cvs):
        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42 + ncvs)
        for train, test in skf.split(X_train, y_train):
            pipeline_fold = copy.deepcopy(feature_generator.pipeline_)
            X_train_fold = pipeline_fold.fit_transform(X_train[train])
            y_train_fold = y_train[train]
            X_test_fold = pipeline_fold.transform(X_train[test])
            y_test_fold = y_train[test]
            mp_globalsfs.data_per_fold.append(
                (X_train_fold, y_train_fold, X_test_fold, y_test_fold))

    current_feature_set = []
    remaining_features = list(range(mp_globalsfs.data_per_fold[0][0].shape[1]))
    history = {}

    while len(current_feature_set) <= max_number_features:
        # Forward step: try adding each remaining feature.
        new_feature_combos = []
        for new_feature in remaining_features:
            feature_combo = [new_feature]
            feature_combo.extend(current_feature_set)
            # Book-keeping to avoid infinite loops.
            if frozenset(feature_combo) in history or len(feature_combo) == 0:
                continue
            new_feature_combos.append(feature_combo)
        mp_globalsfs.feature_combos = new_feature_combos

        print("Adding")
        # Select the best feature to add.
        best_feature_id = -1
        best_accuracy = -1
        best_hyperparam = None
        with ProcessPool(max_workers=multiprocessing.cpu_count()) as pool:
            future = pool.map(execute_feature_combo, range(len(new_feature_combos)))
            iterator = future.result()
            while True:
                try:
                    feature_combo_id, cv_score, hyperparam = next(iterator)
                    feature_combo = mp_globalsfs.feature_combos[feature_combo_id]
                    history[frozenset(feature_combo)] = cv_score
                    if best_accuracy < cv_score:
                        best_feature_id = feature_combo[0]
                        best_accuracy = cv_score
                        best_hyperparam = hyperparam
                    if cv_score > best_score:
                        best_score = cv_score
                        best_feature_combination = feature_combo
                except StopIteration:
                    break
                except TimeoutError as error:
                    print("function took longer than %d seconds" % error.args[1])
                except ProcessExpired as error:
                    print("%s. Exit code: %d" % (error, error.exitcode))

        if best_feature_id == -1:
            break
        current_feature_set.append(best_feature_id)
        remaining_features.remove(best_feature_id)

        test_auc = get_test_auc(current_feature_set, X_train_transformed, y_train,
                                X_test_transformed, y_test, best_hyperparam)
        print(f2str(current_feature_set, featurenames) +
              ' cv auc: ' + str(history[frozenset(current_feature_set)]) +
              ' test auc: ' + str(test_auc))

        if floating:
            print("Floating")
            # Backward steps: try removing previously added features as long
            # as removal improves the cross-validation score.
            while True:
                new_feature_combos = []
                features_removed = []
                for i in range(len(current_feature_set) - 1, 0, -1):
                    new_feature = current_feature_set[i]
                    feature_combo = copy.deepcopy(current_feature_set)
                    feature_combo.remove(new_feature)
                    # Book-keeping to avoid infinite loops.
                    if frozenset(feature_combo) in history or len(feature_combo) == 0:
                        continue
                    new_feature_combos.append(feature_combo)
                    features_removed.append(new_feature)
                mp_globalsfs.feature_combos = new_feature_combos

                best_feature_id = -1
                best_accuracy_new = -1
                best_hyperparam_new = None
                with ProcessPool(max_workers=multiprocessing.cpu_count()) as pool:
                    future = pool.map(execute_feature_combo, range(len(new_feature_combos)))
                    iterator = future.result()
                    while True:
                        try:
                            feature_combo_id, cv_score, hyperparam = next(iterator)
                            feature_combo = mp_globalsfs.feature_combos[feature_combo_id]
                            history[frozenset(feature_combo)] = cv_score
                            if cv_score > best_accuracy_new:
                                best_feature_id = features_removed[feature_combo_id]
                                best_accuracy_new = cv_score
                                best_hyperparam_new = hyperparam
                            if cv_score > best_score:
                                best_score = cv_score
                                best_feature_combination = feature_combo
                        except StopIteration:
                            break
                        except TimeoutError as error:
                            print("function took longer than %d seconds" % error.args[1])
                        except ProcessExpired as error:
                            print("%s. Exit code: %d" % (error, error.exitcode))
                        except Exception as error:
                            print("function raised %s" % error)

                if best_accuracy_new < best_accuracy or best_feature_id == -1:
                    break
                best_accuracy = best_accuracy_new
                current_feature_set.remove(best_feature_id)
                remaining_features.append(best_feature_id)

                test_auc = get_test_auc(current_feature_set, X_train_transformed, y_train,
                                        X_test_transformed, y_test, best_hyperparam_new)
                print(f2str(current_feature_set, featurenames) +
                      ' cv auc: ' + str(history[frozenset(current_feature_set)]) +
                      ' test auc: ' + str(test_auc))

    # Apply the best mask found to the transformed train and test data.
    mask = np.zeros(X_train_transformed.shape[1], dtype=bool)
    for fc in best_feature_combination:
        mask[fc] = True
    mask_selection = MaskSelection(mask)
    X_train_new = mask_selection.fit_transform(X_train_transformed)
    X_test_new = mask_selection.transform(X_test_transformed)
    return X_train_new, X_test_new
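# f2str(...) is used above for logging but not defined in this section. A
# minimal sketch consistent with its call sites, which map feature indices
# to their names (the exact output format is an assumption):
def f2str(feature_combo, featurenames):
    return '[' + ', '.join(featurenames[f] for f in sorted(feature_combo)) + ']'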
def f_clf1(mask):
    model = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])
    return model
def f_clf1(mask):
    model = Pipeline([('selection', MaskSelection(mask)),
                      ('clf', LogisticRegression())])
    return model