Example #1
        min_fairness = 0.0
        min_robustness = config['robustness']
        max_number_features = config['k']

        max_search_time = time_limit

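        # choose the classifier; when a privacy budget is given, swap in the
        # differentially private variant (diffprivlib models / PrivateRandomForest)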
        model = None
        if config['model'] == 'Logistic Regression':
            model = LogisticRegression(class_weight='balanced')
            if config['privacy'] is not None:
                model = models.LogisticRegression(epsilon=config['privacy'],
                                                  class_weight='balanced')
        elif config['model'] == 'Gaussian Naive Bayes':
            model = GaussianNB()
            if config['privacy'] is not None:
                model = models.GaussianNB(epsilon=config['privacy'])
        elif config['model'] == 'Decision Tree':
            model = DecisionTreeClassifier(class_weight='balanced')
            if config['privacy'] is not None:
                model = PrivateRandomForest(n_estimators=1,
                                            epsilon=config['privacy'])

        mp_global.clf = model
        # define rankings
        rankings = [
            variance, chi2_score_wo, fcbf, my_fisher_score,
            mutual_info_classif, my_mcfs
        ]
        # rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
        # rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
        # rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
Example #2
fb_dataset = util.Bunch()
fb_dataset.data = toNum(np.array(df))
fb_dataset.target = np.array([int(df.at[row, 'age']) for row in df.index])

print("filename", filepath + filename)
print("e-differential privacy")
X_train, X_test, y_train, y_test = fb_dataset.data, fb_dataset.data, fb_dataset.target, fb_dataset.target
epsilons = np.logspace(-2, 2, 50)
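# per-feature bounds are supplied explicitly so diffprivlib does not have to
# infer them from the data (which would trigger a PrivacyLeakWarning)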
minbounds = np.amin(X_train, axis=0)
maxbounds = np.amax(X_train, axis=0)
bounds = [(minbounds[i], maxbounds[i]) for i in range(X_train[0].size)]

accuracy = list()
epsilon = 1
clf = models.GaussianNB(bounds=bounds, epsilon=epsilon)
clf.fit(X_train, y_train)
predict = clf.predict(X_test)
# print(predict.shape)

print("epsilon: ", epsilon)
print("accuracy: ", accuracy_score(y_test, predict))
for row in df.index:
    if df.at[row, 'gender'] == 0:
        df.at[row, 'gender'] = 'male'
    elif df.at[row, 'gender'] == 1:
        df.at[row, 'gender'] = 'female'
    else:
        df.at[row, 'gender'] = 'male'

    df.at[row, 'age'] = predict[row]
Example #3
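# Excerpt: for each model choice, the plain scikit-learn estimator is replaced
# by a differentially private counterpart when a privacy budget was specified.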
    }
    if most_uncertain_f['privacy_choice'][0]:
        model = models.LogisticRegression(
            epsilon=most_uncertain_f['privacy_specified'][0],
            class_weight='balanced')
        mp_global.model_hyperparameters['epsilon'] = [
            most_uncertain_f['privacy_specified'][0]
        ]
elif model_choice == 1:
    model = GaussianNB()
    mp_global.model_hyperparameters = {
        'var_smoothing': [1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6]
    }
    if most_uncertain_f['privacy_choice'][0]:
        model = models.GaussianNB(
            epsilon=most_uncertain_f['privacy_specified'][0])
        mp_global.model_hyperparameters['epsilon'] = [
            most_uncertain_f['privacy_specified'][0]
        ]
elif model_choice == 2:
    model = DecisionTreeClassifier(class_weight='balanced')
    mp_global.model_hyperparameters = {
        'max_depth': [1, 2, 3, 4, 5, 6, 7]
    }
    if most_uncertain_f['privacy_choice'][0]:
        model = PrivateRandomForest(
            n_estimators=1,
            epsilon=most_uncertain_f['privacy_specified'][0])
        mp_global.model_hyperparameters['epsilon'] = [
            most_uncertain_f['privacy_specified'][0]
        ]
Example #4
def uncertainty_sampling(training_dataset_ids, all_current_models):
    time_limit = 30 * 60  # time budget in seconds (alternative: 60 * 60 * 3)

    training_dataset_ids = training_dataset_ids.tolist()

    if '1240' in training_dataset_ids:
        training_dataset_ids.remove('1240')
    if '42132' in training_dataset_ids:
        training_dataset_ids.remove('42132')

    def maximize_uncertainty(hps):
        print(hps)

        X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(
            dataset_key=hps['data'])
        is_regression = False

        # run on tiny sample
        if X_train.shape[0] > 100:
            if is_regression:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(
                    X_train, y_train, train_size=100, random_state=42)
            else:
                X_train_tiny, _, y_train_tiny, _ = train_test_split(
                    X_train,
                    y_train,
                    train_size=100,
                    random_state=42,
                    stratify=y_train)
        else:
            X_train_tiny = X_train
            y_train_tiny = y_train

        print(X_train.shape)

        if sensitive_ids is not None:
            fair_train_tiny = make_scorer(
                true_positive_rate_score,
                greater_is_better=True,
                sensitive_data=X_train_tiny[:, sensitive_ids[0]])

        mp_global.X_train = X_train
        mp_global.X_validation = X_validation
        mp_global.X_train_val = X_train_val
        mp_global.X_test = X_test
        mp_global.y_train = y_train
        mp_global.y_validation = y_validation
        mp_global.y_train_val = y_train_val
        mp_global.y_test = y_test
        mp_global.names = names
        mp_global.sensitive_ids = sensitive_ids

        mp_global.cv_splitter = StratifiedKFold(5, shuffle=True, random_state=42)
        mp_global.accuracy_scorer = make_scorer(f1_score)

        mp_global.avoid_robustness = False

        cv_k = 1.0
        cv_privacy = hps['privacy']

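        # probe model: plain logistic regression, or its differentially private
        # counterpart when a privacy budget is sampled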
        model = LogisticRegression(class_weight='balanced')
        if cv_privacy is not None:
            model = models.LogisticRegression(epsilon=cv_privacy,
                                              class_weight='balanced')

        # no privacy budget requested: encode the privacy feature as the sample size
        if cv_privacy is None:
            cv_privacy = X_train_tiny.shape[0]

        robust_scorer = make_scorer(robust_score,
                                    greater_is_better=True,
                                    X=X_train_tiny,
                                    y=y_train_tiny,
                                    model=model,
                                    feature_selector=None,
                                    scorer=mp_global.accuracy_scorer)

        small_start_time = time.time()

        scoring = {'AUC': mp_global.accuracy_scorer}
        if not mp_global.avoid_robustness:
            scoring['Robustness'] = robust_scorer
        if sensitive_ids is not None:
            scoring['Fairness'] = fair_train_tiny

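        # one cross-validated evaluation of the probe model under all scorers
        # (the parameter grid is empty on purpose)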
        cv = GridSearchCV(model,
                          param_grid={},
                          scoring=scoring,
                          refit=False,
                          cv=mp_global.cv_splitter)
        cv.fit(X_train_tiny, pd.DataFrame(y_train_tiny))
        cv_acc = cv.cv_results_['mean_test_AUC'][0]

        cv_fair = 0.0
        if sensitive_ids is not None:
            cv_fair = 1.0 - cv.cv_results_['mean_test_Fairness'][0]

        cv_robust = 0.0
        if not mp_global.avoid_robustness:
            cv_robust = 1.0 - cv.cv_results_['mean_test_Robustness'][0]

        cv_time = time.time() - small_start_time

        # construct feature vector
        feature_list = []
        # user-specified constraints
        feature_list.append(hps['accuracy'])
        feature_list.append(hps['fairness'])
        feature_list.append(hps['k'])
        feature_list.append(hps['k'] * X_train.shape[1])
        feature_list.append(hps['robustness'])
        feature_list.append(cv_privacy)
        feature_list.append(hps['search_time'])
        # differences to sample performance
        feature_list.append(cv_acc - hps['accuracy'])
        feature_list.append(cv_fair - hps['fairness'])
        feature_list.append(cv_k - hps['k'])
        feature_list.append((cv_k - hps['k']) * X_train.shape[1])
        feature_list.append(cv_robust - hps['robustness'])
        feature_list.append(cv_time)
        # privacy constraint is always satisfied => difference always zero => constant => unnecessary

        # metadata features
        feature_list.append(X_train.shape[0])  # number rows
        feature_list.append(X_train.shape[1])  # number columns

        # one-hot indicators for the model choice
        feature_list.append(hps['model'] == 'Decision Tree')
        feature_list.append(hps['model'] == 'Gaussian Naive Bayes')
        feature_list.append(hps['model'] == 'Logistic Regression')

        features = np.array(feature_list).reshape(1, -1)

        # predict the best model and calculate uncertainty

        print(features)

        # aggregate how certain the current models are; fmin minimizes this value,
        # so the most uncertain configuration wins
        aggregated_certainty = 0
        print("uncertainty")
        for model_i in range(len(all_current_models)):
            certainty = np.abs(
                all_current_models[model_i].predict_proba(features)[0, 0] -
                0.5)
            aggregated_certainty += certainty

        print('Certainty: ' + str(aggregated_certainty))

        return {
            'loss': aggregated_certainty,
            'status': STATUS_OK,
            'features': features,
            'search_time': hps['search_time'],
            'constraints': hps
        }

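    # hyperopt search space over dataset, model, and user constraints; the
    # hp.choice branches encode "constraint not specified" vs. a sampled value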
    space = {
        'data':
        hp.choice('data_choice', training_dataset_ids),
        'model':
        hp.choice(
            'model_choice',
            [
                'Logistic Regression',
                'Gaussian Naive Bayes',
                'Decision Tree'  # , 'Random Forest'
            ]),
        'k':
        hp.choice('k_choice', [(1.0), (hp.uniform('k_specified', 0, 1))]),
        'accuracy':
        hp.uniform('accuracy_specified', 0.5, 1),
        'fairness':
        hp.choice('fairness_choice',
                  [(0.0), (hp.uniform('fairness_specified', 0.8, 1))]),
        'privacy':
        hp.choice('privacy_choice',
                  [(None), (hp.lognormal('privacy_specified', 0, 1))]),
        'robustness':
        hp.choice('robustness_choice',
                  [(0.0), (hp.uniform('robustness_specified', 0.8, 1))]),
        'search_time':
        hp.uniform('search_time_specified', 10, time_limit),  # in seconds
    }

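    # search for the configuration on which the current models are least certain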
    trials = Trials()
    fmin(maximize_uncertainty,
         space=space,
         algo=tpe.suggest,
         max_evals=100,
         trials=trials,
         show_progressbar=True)

    ### now run most uncertain trial

    number_of_runs = 1

    # take the most uncertain trial found so far and generate a new training example from it
    last_trial = trials.best_trial
    most_uncertain_f = last_trial['misc']['vals']
    # print(most_uncertain_f)

    ## specify the dataset
    run_counter = 0
    current_run_time_id = time.time()
    path = pathlib.Path('/tmp/experiment_uncertainty/run' + str(0))
    path.mkdir(parents=True, exist_ok=True)

    selected_dataset_id = training_dataset_ids[most_uncertain_f['data_choice']
                                               [0]]
    X_train, X_validation, X_train_val, X_test, y_train, y_validation, y_train_val, y_test, names, sensitive_ids, key, sensitive_attribute_id = get_fair_data1_validation(
        dataset_key=selected_dataset_id)
    is_regression = False

    mp_global.X_train = X_train
    mp_global.X_validation = X_validation
    mp_global.X_train_val = X_train_val
    mp_global.X_test = X_test
    mp_global.y_train = y_train
    mp_global.y_validation = y_validation
    mp_global.y_train_val = y_train_val
    mp_global.y_test = y_test
    mp_global.names = names
    mp_global.sensitive_ids = sensitive_ids

    if is_regression:
        mp_global.cv_splitter = KFold(5, shuffle=True, random_state=42)
        mp_global.accuracy_scorer = make_scorer(r2_score)
    else:
        mp_global.cv_splitter = StratifiedKFold(5, shuffle=True, random_state=42)
        mp_global.accuracy_scorer = make_scorer(f1_score)
    mp_global.avoid_robustness = False

    min_accuracy = most_uncertain_f['accuracy_specified'][0]
    min_fairness = 0.0
    if most_uncertain_f['fairness_choice'][0]:
        min_fairness = most_uncertain_f['fairness_specified'][0]
    min_robustness = 0.0
    if most_uncertain_f['robustness_choice'][0]:
        min_robustness = most_uncertain_f['robustness_specified'][0]
    max_number_features = 1.0
    if most_uncertain_f['k_choice'][0]:
        max_number_features = most_uncertain_f['k_specified'][0]

    max_search_time = most_uncertain_f['search_time_specified'][0]

    # Execute each search strategy with a given time limit (in parallel)
    # maybe run multiple times to smooth stochasticity

    model = None
    print(most_uncertain_f)
    if most_uncertain_f['model_choice'][0] == 0:
        model = LogisticRegression(class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = models.LogisticRegression(
                epsilon=most_uncertain_f['privacy_specified'][0],
                class_weight='balanced')
    elif most_uncertain_f['model_choice'][0] == 1:
        model = GaussianNB()
        if most_uncertain_f['privacy_choice'][0]:
            model = models.GaussianNB(
                epsilon=most_uncertain_f['privacy_specified'][0])
    elif most_uncertain_f['model_choice'][0] == 2:
        model = DecisionTreeClassifier(class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = PrivateRandomForest(
                n_estimators=1,
                epsilon=most_uncertain_f['privacy_specified'][0])
    elif most_uncertain_f['model_choice'][0] == 3:
        model = RandomForestClassifier(n_estimators=100,
                                       class_weight='balanced')
        if most_uncertain_f['privacy_choice'][0]:
            model = PrivateRandomForest(
                n_estimators=100,
                epsilon=most_uncertain_f['privacy_specified'][0])

    print(model)

    mp_global.clf = model
    # define rankings
    rankings = [
        variance, chi2_score_wo, fcbf, my_fisher_score, mutual_info_classif,
        my_mcfs
    ]
    # rankings.append(partial(model_score, estimator=ExtraTreesClassifier(n_estimators=1000))) #accuracy ranking
    # rankings.append(partial(robustness_score, model=model, scorer=auc_scorer)) #robustness ranking
    # rankings.append(partial(fairness_score, estimator=ExtraTreesClassifier(n_estimators=1000), sensitive_ids=sensitive_ids)) #fairness ranking
    rankings.append(partial(model_score,
                            estimator=ReliefF(n_neighbors=10)))  # relieff

    mp_global.min_accuracy = min_accuracy
    mp_global.min_fairness = min_fairness
    mp_global.min_robustness = min_robustness
    mp_global.max_number_features = max_number_features
    mp_global.max_search_time = max_search_time

    mp_global.configurations = []
    # add single rankings
    strategy_id = 1
    for r in range(len(rankings)):
        for run in range(number_of_runs):
            configuration = {}
            configuration['ranking_functions'] = copy.deepcopy([rankings[r]])
            configuration['run_id'] = copy.deepcopy(run)
            configuration['main_strategy'] = copy.deepcopy(weighted_ranking)
            configuration['strategy_id'] = copy.deepcopy(strategy_id)
            mp_global.configurations.append(configuration)
        strategy_id += 1

    main_strategies = [
        TPE, simulated_annealing, evolution, exhaustive, forward_selection,
        backward_selection, forward_floating_selection,
        backward_floating_selection, recursive_feature_elimination,
        fullfeatures
    ]

    # run main strategies

    for strategy in main_strategies:
        for run in range(number_of_runs):
            configuration = {}
            configuration['ranking_functions'] = []
            configuration['run_id'] = copy.deepcopy(run)
            configuration['main_strategy'] = copy.deepcopy(strategy)
            configuration['strategy_id'] = copy.deepcopy(strategy_id)
            mp_global.configurations.append(configuration)
        strategy_id += 1

    # run all configurations in parallel
    with ProcessPool(max_workers=17) as pool:
        future = pool.map(my_function,
                          range(len(mp_global.configurations)),
                          timeout=max_search_time)

        iterator = future.result()
        while True:
            try:
                result = next(iterator)
            except StopIteration:
                break
            except TimeoutError as error:
                print("function took longer than %d seconds" % error.args[1])
            except ProcessExpired as error:
                print("%s. Exit code: %d" % (error, error.exitcode))
            except Exception as error:
                print("function raised %s" % error)
                print(error.traceback)  # Python's traceback of remote process

    # check which strategies were successful

    mappnames = {
        1: 'TPE(Variance)',
        2: r'TPE($\chi^2$)',
        3: 'TPE(FCBF)',
        4: 'TPE(Fisher)',
        5: 'TPE(MIM)',
        6: 'TPE(MCFS)',
        7: 'TPE(ReliefF)',
        8: 'TPE(NR)',
        9: 'SA(NR)',
        10: 'NSGA-II(NR)',
        11: 'ES(NR)',
        12: 'SFS(NR)',
        13: 'SBS(NR)',
        14: 'SFFS(NR)',
        15: 'SBFS(NR)',
        16: 'RFE(LR)',
        17: 'Complete Set'
    }

    def load_pickle(fname):
        data = []
        with open(fname, "rb") as f:
            while True:
                try:
                    data.append(pickle.load(f))
                except EOFError:
                    break
        return data

    def is_successfull_validation_and_test(exp_results):
        return len(exp_results
                   ) > 0 and 'success_test' in exp_results[-1] and exp_results[
                       -1]['success_test'] == True  # also on test satisfied

    def is_successfull_validation(exp_results):
        return len(exp_results) > 0 and 'Validation_Satisfied' in exp_results[
            -1]  # constraints were satisfied on validation set

    run_strategies_success_test = {}
    run_strategies_times = {}
    run_strategies_success_validation = {}

    rfolder = '/tmp/experiment_uncertainty/run' + str(0) + '/'

    validation_satisfied_by_any_strategy = False

    min_time = np.inf
    best_strategy = 0
    for s in range(1, len(mappnames) + 1):
        exp_results = []
        try:
            exp_results = load_pickle(rfolder + 'strategy' + str(s) +
                                      '.pickle')
        except Exception:
            # this strategy produced no result file
            pass
        if is_successfull_validation_and_test(exp_results):
            runtime = exp_results[-1]['final_time']
            if runtime < min_time:
                min_time = runtime
                best_strategy = s

            run_strategies_success_test[s] = True
            run_strategies_times[s] = runtime
        else:
            run_strategies_success_test[s] = False

        run_strategies_success_validation[s] = is_successfull_validation(
            exp_results)
        if run_strategies_success_validation[s]:
            validation_satisfied_by_any_strategy = True

    strategy_success = np.zeros((1, len(mappnames)))
    for c_i in range(len(mappnames)):
        strategy_success[0, c_i] = run_strategies_success_test[c_i + 1]

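    # return the feature vector of the most uncertain configuration together with
    # the per-strategy success labels on the test set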
    return last_trial['result']['features'], strategy_success
Example #5
min_year = min(hb['Year Of Operation 19'])
max_year = max(hb['Year Of Operation 19'])
min_number = min(hb['No. of Positive axillary nodes'])
max_number = max(hb['No. of Positive axillary nodes'])

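# min_age / max_age are presumably computed from the age column earlier in the
# script (not shown in this excerpt)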
bounds = ([min_age, min_year, min_number], [max_age, max_year, max_number])

nonPrivate_score = GaussianNB()
nonPrivate_score.fit(X_train, y_train)
accuracy_nonPrivate_score = nonPrivate_score.score(X_test, y_test)
print(f"Accuracy without privatization : {accuracy_nonPrivate_score}")
print(f"Acc% without privatization = {accuracy_nonPrivate_score * 100}%")
acc = list()

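# sweep epsilon and record the test accuracy of the differentially private model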
for e in epsilons:
    clf = dp.GaussianNB(epsilon=e, bounds=bounds)
    clf.fit(X_train, y_train)
    acc.append(clf.score(X_test, y_test))

print(f"Max Acc% after privatization = {max(acc) * 100}%")
print(f"Min acc% after privatization = {min(acc) * 100}%")

import matplotlib.pyplot as plt

plt.title('Differential Privacy using Naive Bayes')
plt.xlabel('epsilons')
plt.ylabel('accuracy')
plt.plot(epsilons, acc)
plt.show()

# diff privacy after adding noise
Example #6
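# Note: models.GaussianNB() is constructed without bounds here; diffprivlib will
# infer them from the training data and emit a PrivacyLeakWarning.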
def start(self):
    module = models.GaussianNB()
    module.fit(self.__X_train, self.__y_train)
    self.__save(module)