Example #1
File: SFFS_run.py  Project: BigDaMa/DFS
def execute_feature_combo1(feature_combo, feature_combo_id=0, params=pARAMS):
    mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
    for fc in feature_combo:
        mask[fc] = True

    mp_globalsfs.mask = mask

    hyperparameter_search_scores = []
    for c in params:
        pipeline = Pipeline([('imputation', SimpleImputer()),
                             ('selection', MaskSelection(mask)),
                             get_model(c)])

        mp_globalsfs.parameter = c

        cv_scores = []
        with Pool(processes=multiprocessing.cpu_count()) as p:
            cv_scores = list(
                tqdm.tqdm(p.imap(run_fold,
                                 range(len(mp_globalsfs.data_per_fold))),
                          total=len(mp_globalsfs.data_per_fold)))

        hyperparameter_search_scores.append(np.mean(cv_scores))

    return (feature_combo_id, np.max(hyperparameter_search_scores),
            params[np.argmax(hyperparameter_search_scores)])
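Every example on this page depends on a MaskSelection transformer that is not shown here. A minimal sketch of what such a transformer could look like, assuming it simply keeps the columns whose mask entry is True (the actual class in BigDaMa/DFS may differ):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class MaskSelection(BaseEstimator, TransformerMixin):
    """Illustrative sketch: keep only the columns selected by a boolean mask."""

    def __init__(self, mask):
        self.mask = mask

    def fit(self, X, y=None):
        # Nothing is learned; the mask is fixed at construction time.
        return self

    def transform(self, X):
        # Convert DataFrames to arrays so boolean column indexing always works.
        return np.asarray(X)[:, self.mask]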
Example #2
def f_clf1(mask):
    # Assembling pipeline
    model = Pipeline([('scale', MinMaxScaler()),
                      ('selection', MaskSelection(mask)),
                      ('clf', LogisticRegression())])

    return model
Example #3
def objective(features):
    model = Pipeline([('selection', MaskSelection(features)),
                      ('clf', LogisticRegression())])

    robust_scorer = make_scorer(
        robust_score,
        greater_is_better=True,
        X=X_train,
        y=y_train,
        feature_selector=model.named_steps['selection'])
    robust_scorer_test = make_scorer(
        robust_score_test,
        greater_is_better=True,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        feature_selector=model.named_steps['selection'])

    ncv = 5

    cv_acc = np.mean(
        cross_val_score(model,
                        X_train,
                        pd.DataFrame(y_train),
                        cv=StratifiedKFold(ncv, random_state=42),
                        scoring=auc_scorer))
    cv_fair = 1.0 - np.mean(
        cross_val_score(model,
                        X_train,
                        pd.DataFrame(y_train),
                        cv=StratifiedKFold(ncv, random_state=42),
                        scoring=fair_train))
    cv_robust = 1.0 - np.mean(
        cross_val_score(model,
                        X_train,
                        pd.DataFrame(y_train),
                        cv=StratifiedKFold(ncv, random_state=42),
                        scoring=robust_scorer))
    #cv_robust = 1.0

    print('cv acc: ' + str(cv_acc) + ' cv fair: ' + str(cv_fair) +
          ' cv robust: ' + str(cv_robust))

    model.fit(X_train, pd.DataFrame(y_train))
    test_acc = auc_scorer(model, X_test, pd.DataFrame(y_test))
    test_fair = 1.0 - fair_test(model, X_test, pd.DataFrame(y_test))
    test_robust = 1.0 - robust_scorer_test(model, X_test, pd.DataFrame(y_test))

    simplicity = -1 * np.sum(features)

    my_global_variable.satisfied_constraints.append(
        [min(test_acc, cv_acc), min(test_fair, cv_fair)])
    my_global_variable.times.append(time.time() - start_time)
    my_global_variable.iterations.append(my_global_variable.current_iteration)

    my_global_variable.current_iteration += 1

    return [cv_acc, cv_fair, cv_robust, simplicity]
Example #4
    def f_clf1(hps):
        mask = np.zeros(len(hps), dtype=bool)
        for k, v in hps.items():
            mask[int(k.split('_')[1])] = v

        for mask_i in range(len(mask)):
            hps['f_' + str(mask_i)] = mask[mask_i]

        model = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])

        return model, hps
Example #5
File: SFFS_run.py  Project: BigDaMa/DFS
def run_fold(cv):
    mask = mp_globalsfs.mask
    c = mp_globalsfs.parameter
    pipeline = Pipeline([('imputation', SimpleImputer()),
                         ('selection', MaskSelection(mask)),
                         get_model(c)])

    X_train, y_train, X_test, y_test = mp_globalsfs.data_per_fold[cv]
    pipeline.fit(X_train, pd.DataFrame(y_train))

    return auc_scorer(pipeline, X_test, y_test)
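run_fold receives only the fold index; the mask, the hyperparameter, and the per-fold data are read from the shared module mp_globalsfs, which is populated in the parent process before the worker pool is created. A minimal sketch of that module, limited to the fields these examples use (an assumption, not the actual file):

# mp_globalsfs.py -- module-level state shared with worker processes
mask = None           # boolean feature mask of the candidate currently evaluated
parameter = None      # hyperparameter value currently evaluated
data_per_fold = []    # list of (X_train, y_train, X_test, y_test) tuples, one per fold
feature_combos = []   # candidate feature combinations of the current search step

Because the state is assigned before the pool starts, the pattern relies on the workers being forked; on platforms that spawn fresh interpreters, the module would be re-imported empty in each worker.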
Example #6
def get_test_auc(feature_combo, X_train_transformed, y_train,
                 X_test_transformed, y_test, hyperparam):

    mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
    for fc in feature_combo:
        mask[fc] = True

    pipeline = Pipeline([('selection', MaskSelection(mask)),
                         get_model_test(hyperparam)])

    pipeline.fit(X_train_transformed, y_train)

    return auc_scorer(pipeline, X_test_transformed, y_test)
Example #7
	def f_to_min1():
		mask = np.array([True for f_i in range(X_train.shape[1])])

		pipeline = Pipeline([
			('selection', MaskSelection(mask)),
			('clf', clf)
		])

		grid_result = run_grid_search(pipeline, X_train, y_train, X_validation, y_validation,
									  accuracy_scorer, sensitive_ids,
									  min_fairness, min_accuracy, min_robustness, max_number_features,
									  model_hyperparameters=model_hyperparameters, start_time=start_time)

		return grid_result
Example #8
def get_test_auc_and_coeff(feature_combo, X_train_transformed, y_train, X_test_transformed, y_test, hyperparam):

	mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
	for fc in feature_combo:
		mask[fc] = True

	pipeline = Pipeline([
		('imputation', SimpleImputer()),
		('selection', MaskSelection(mask)),
		get_model(hyperparam)
	])

	pipeline.fit(X_train_transformed, y_train)

	return auc_scorer(pipeline, X_test_transformed, y_test), pipeline.named_steps['clf'].coef_[0]
Example #9
def get_test_auc(feature_combo, X_train_transformed, y_train,
                 X_test_transformed, y_test, hyperparam):

    mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
    for fc in feature_combo:
        mask[fc] = True

    pipeline = Pipeline([('imputation', SimpleImputer()),
                         ('selection', MaskSelection(mask)),
                         get_model(hyperparam)])

    pipeline.fit(X_train_transformed, y_train)

    # save model
    pickle.dump(pipeline, open("/tmp/pipeline.p", "wb"))

    return auc_scorer(pipeline, X_test_transformed, y_test)
Example #10
    def f_to_min1():
        mask = np.array([True for f_i in range(X_train.shape[1])])

        pipeline = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])

        pipeline.fit(X_train, y_train)

        validation_number_features = 1.0
        validation_acc = auc_scorer(pipeline, X_validation,
                                    pd.DataFrame(y_validation))

        validation_fair = 0.0
        if type(sensitive_ids) != type(None) and min_fairness > 0.0:
            validation_fair = 1.0 - fair_validation(pipeline, X_validation,
                                                    pd.DataFrame(y_validation))
        validation_robust = 0.0
        if min_robustness > 0.0:
            validation_robust = 1.0 - robust_score_test(
                eps=0.1,
                X_test=X_validation,
                y_test=y_validation,
                model=pipeline.named_steps['clf'],
                feature_selector=pipeline.named_steps['selection'],
                scorer=auc_scorer)

        loss = 0.0
        if min_fairness > 0.0 and validation_fair < min_fairness:
            loss += (min_fairness - validation_fair)**2
        if min_accuracy > 0.0 and validation_acc < min_accuracy:
            loss += (min_accuracy - validation_acc)**2
        if min_robustness > 0.0 and validation_robust < min_robustness:
            loss += (min_robustness - validation_robust)**2
        print(loss)

        current_time = time.time() - start_time

        return {
            'loss': loss,
            'model': pipeline,
            'cv_fair': validation_fair,
            'cv_acc': validation_acc,
            'cv_robust': validation_robust,
            'cv_number_features': validation_number_features,
            'time': current_time
        }
Example #11
    def f_clf1(hps):
        mask = np.zeros(len(hps), dtype=bool)
        for k, v in hps.items():
            mask[int(k.split('_')[1])] = v

        # repair the number of features if necessary
        max_k = max(int(max_number_features * X_train.shape[1]), 1)
        if np.sum(mask) > max_k:
            id_features_used = np.nonzero(mask)[0]  # indices where features are used
            np.random.shuffle(id_features_used)  # shuffle ids
            ids_tb_deactived = id_features_used[max_k:]  # features to deactivate
            for item_to_remove in ids_tb_deactived:
                mask[item_to_remove] = False

        for mask_i in range(len(mask)):
            hps['f_' + str(mask_i)] = mask[mask_i]

        pipeline = Pipeline([('selection', MaskSelection(mask)), ('clf', clf)])

        return pipeline, hps
Example #12
File: SFFS_size.py  Project: BigDaMa/DFS
def execute_feature_combo1(feature_combo, feature_combo_id=0, params=pARAMS):
    mask = np.zeros(mp_globalsfs.data_per_fold[0][0].shape[1], dtype=bool)
    for fc in feature_combo:
        mask[fc] = True

    hyperparameter_search_scores = []
    for c in params:
        pipeline = Pipeline([('imputation', SimpleImputer()),
                             ('selection', MaskSelection(mask)),
                             get_model(c)])

        cv_scores = []
        for cv in range(len(mp_globalsfs.data_per_fold)):
            X_train, y_train, X_test, y_test = mp_globalsfs.data_per_fold[cv]
            pipeline.fit(X_train, pd.DataFrame(y_train))

            cv_scores.append(auc_scorer(pipeline, X_test, y_test))
        hyperparameter_search_scores.append(np.mean(cv_scores))

    return (feature_combo_id, np.max(hyperparameter_search_scores),
            params[np.argmax(hyperparameter_search_scores)])
Example #13
def objective(features):
	model = Pipeline([
		('selection', MaskSelection(features)),
		('clf', LogisticRegression())
	])

	robust_scorer = make_scorer(robust_score, greater_is_better=True, X=X_train, y=y_train,
								feature_selector=model.named_steps['selection'])
	robust_scorer_test = make_scorer(robust_score_test, greater_is_better=True, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
								feature_selector=model.named_steps['selection'])

	ncv = 5

	cv_acc = np.mean(
		cross_val_score(model, X_train, pd.DataFrame(y_train), cv=StratifiedKFold(ncv, random_state=42), scoring=auc_scorer))
	cv_fair = 1.0 - np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train), cv=StratifiedKFold(ncv, random_state=42), scoring=fair_train))
	cv_robust = 1.0 - np.mean(cross_val_score(model, X_train, pd.DataFrame(y_train), cv=StratifiedKFold(ncv, random_state=42), scoring=robust_scorer))
	#cv_robust = 1.0

	print('cv acc: ' + str(cv_acc) + ' cv fair: ' + str(cv_fair) + ' cv robust: ' + str(cv_robust))

	if cv_acc > min_accuracy and cv_fair > min_fairness and cv_robust > min_robustness:
		model.fit(X_train, pd.DataFrame(y_train))
		test_acc = auc_scorer(model, X_test, pd.DataFrame(y_test))
		test_fair = 1.0 - fair_test(model, X_test, pd.DataFrame(y_test))
		test_robust = 1.0 - robust_scorer_test(model, X_test, pd.DataFrame(y_test))

		print('acc: ' + str(test_acc) + ' fair: ' + str(test_fair) + ' robust: ' + str(test_robust))

		if test_acc > min_accuracy and test_fair > min_fairness and test_robust > min_robustness:
			my_global_variable.global_check = True

			print("selected features: " + str(np.array(names)[features]))

	simplicity = -1 * np.sum(features)

	#change objectives
	#cv_acc = 1.0
	return [cv_acc, cv_fair, cv_robust, simplicity]
Example #14
def objective(hps):
    mask = np.zeros(len(hps), dtype=bool)
    for k, v in hps.items():
        mask[int(k.split('_')[1])] = v

    model = Pipeline([('selection', MaskSelection(mask)),
                      ('clf', LogisticRegression())])

    robust_scorer = make_scorer(
        robust_score,
        greater_is_better=True,
        X=X_train,
        y=y_train,
        feature_selector=model.named_steps['selection'])
    robust_scorer_test = make_scorer(
        robust_score_test,
        greater_is_better=True,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        feature_selector=model.named_steps['selection'])

    ncv = 5

    cv_acc = np.mean(
        cross_val_score(model,
                        X_train,
                        pd.DataFrame(y_train),
                        cv=StratifiedKFold(ncv, random_state=42),
                        scoring=auc_scorer))
    cv_fair = 1.0 - np.mean(
        cross_val_score(model,
                        X_train,
                        pd.DataFrame(y_train),
                        cv=StratifiedKFold(ncv, random_state=42),
                        scoring=fair_train))
    cv_robust = 1.0 - np.mean(
        cross_val_score(model,
                        X_train,
                        pd.DataFrame(y_train),
                        cv=StratifiedKFold(ncv, random_state=42),
                        scoring=robust_scorer))
    #cv_robust = 1.0

    print('cv acc: ' + str(cv_acc) + ' cv fair: ' + str(cv_fair) +
          ' cv robust: ' + str(cv_robust))

    cv_model_gen = 1.0

    loss = 0.0
    if cv_acc >= min_accuracy and cv_fair >= min_fairness and cv_robust >= min_robustness and cv_model_gen >= min_avg_model_accuracy:
        loss = (min_accuracy - cv_acc) + (min_fairness - cv_fair) + (
            min_robustness - cv_robust) + (min_avg_model_accuracy -
                                           cv_model_gen)
    else:
        if cv_fair < min_fairness:
            loss += (min_fairness - cv_fair)**2
        if cv_acc < min_accuracy:
            loss += (min_accuracy - cv_acc)**2
        if cv_robust < min_robustness:
            loss += (min_robustness - cv_robust)**2
        if cv_model_gen < min_avg_model_accuracy:
            loss += (min_avg_model_accuracy - cv_model_gen)**2

    return {
        'loss': loss,
        'status': STATUS_OK,
        'model': model,
        'cv_fair': cv_fair,
        'cv_acc': cv_acc
    }
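The hps dictionary of per-feature booleans ('f_0', 'f_1', ...) and the returned dictionary with 'loss' and STATUS_OK follow the hyperopt objective protocol. A hedged sketch of how such an objective might be driven; the per-feature naming is taken from the code above, while the TPE algorithm and the evaluation budget are arbitrary illustration choices:

from hyperopt import fmin, tpe, hp, Trials

# one boolean flag per column of X_train, named f_0, f_1, ...
space = {'f_' + str(i): hp.choice('f_' + str(i), [False, True])
         for i in range(X_train.shape[1])}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=100, trials=trials)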
Example #15
def parallel(X_train,
             y_train,
             X_test=None,
             y_test=None,
             floating=True,
             max_number_features=10,
             feature_generator=None,
             folds=3,
             number_cvs=1):

    pipeline_train = copy.deepcopy(feature_generator.pipeline_)
    X_train_transformed = pipeline_train.fit_transform(X_train)
    X_test_transformed = pipeline_train.transform(X_test)

    best_score = -1
    best_feature_combination = None

    featurenames = []
    for myf in feature_generator.numeric_features:
        featurenames.append(str(myf))

    mp_globalsfs.data_per_fold = []
    for ncvs in range(number_cvs):
        for train, test in StratifiedKFold(n_splits=folds,
                                           random_state=42 + ncvs).split(
                                               X_train, y_train):
            pipeline_fold = copy.deepcopy(feature_generator.pipeline_)
            X_train_fold = pipeline_fold.fit_transform(X_train[train])
            y_train_fold = y_train[train]

            X_test_fold = pipeline_fold.transform(X_train[test])
            y_test_fold = y_train[test]

            mp_globalsfs.data_per_fold.append(
                (X_train_fold, y_train_fold, X_test_fold, y_test_fold))

    current_feature_set = []
    remaining_features = list(range(mp_globalsfs.data_per_fold[0][0].shape[1]))

    history = {}
    while (len(current_feature_set) <= max_number_features):

        new_feature_combos = []
        for new_feature in remaining_features:
            feature_combo = [new_feature]
            feature_combo.extend(current_feature_set)

            # book-keeping to avoid infinite loops
            if frozenset(feature_combo) in history or len(feature_combo) == 0:
                continue
            new_feature_combos.append(feature_combo)

        mp_globalsfs.feature_combos = new_feature_combos

        print("Adding")
        # select best feature
        best_feature_id = -1
        best_accuracy = -1
        best_hyperparam = None
        with ProcessPool(max_workers=multiprocessing.cpu_count()) as pool:
            future = pool.map(execute_feature_combo,
                              range(len(new_feature_combos)))

            iterator = future.result()
            while True:
                try:
                    feature_combo_id, cv_score, hyperparam = next(iterator)
                    feature_combo = mp_globalsfs.feature_combos[
                        feature_combo_id]
                    history[frozenset(feature_combo)] = cv_score
                    #print(f2str(feature_combo, featurenames) + ': ' + str(cv_score))

                    if best_accuracy < cv_score:
                        best_feature_id = feature_combo[0]
                        best_accuracy = cv_score
                        best_hyperparam = hyperparam

                    if cv_score > best_score:
                        best_score = cv_score
                        best_feature_combination = feature_combo

                except StopIteration:
                    break
                except TimeoutError as error:
                    print("function took longer than %d seconds" %
                          error.args[1])
                except ProcessExpired as error:
                    print("%s. Exit code: %d" % (error, error.exitcode))

        if best_feature_id == -1:
            break

        current_feature_set.append(best_feature_id)
        remaining_features.remove(best_feature_id)

        test_auc = get_test_auc(current_feature_set, X_train_transformed,
                                y_train, X_test_transformed, y_test,
                                best_hyperparam)
        print(
            f2str(current_feature_set, featurenames) + ' cv auc: ' +
            str(history[frozenset(current_feature_set)]) + ' test auc: ' +
            str(test_auc))

        if floating:
            print("Floating")
            # select worst feature
            while True:

                new_feature_combos = []
                features_removed = []
                for i in range(len(current_feature_set) - 1, 0, -1):
                    new_feature = current_feature_set[i]
                    feature_combo = copy.deepcopy(current_feature_set)
                    feature_combo.remove(new_feature)

                    # book-keeping to avoid infinite loops
                    if frozenset(feature_combo) in history or len(
                            feature_combo) == 0:
                        continue
                    new_feature_combos.append(feature_combo)
                    features_removed.append(new_feature)

                mp_globalsfs.feature_combos = new_feature_combos

                best_feature_id = -1
                best_accuracy_new = -1
                best_hyperparam_new = None
                with ProcessPool(
                        max_workers=multiprocessing.cpu_count()) as pool:
                    future = pool.map(execute_feature_combo,
                                      range(len(new_feature_combos)))

                    iterator = future.result()
                    while True:
                        try:
                            feature_combo_id, cv_score, hyperparam = next(
                                iterator)
                            feature_combo = mp_globalsfs.feature_combos[
                                feature_combo_id]
                            history[frozenset(feature_combo)] = cv_score
                            #print(f2str(feature_combo, featurenames) + ': ' + str(cv_score))

                            if cv_score > best_accuracy_new:
                                best_feature_id = features_removed[
                                    feature_combo_id]
                                best_accuracy_new = cv_score
                                best_hyperparam_new = hyperparam

                            if cv_score > best_score:
                                best_score = cv_score
                                best_feature_combination = feature_combo

                        except StopIteration:
                            break
                        except TimeoutError as error:
                            print("function took longer than %d seconds" %
                                  error.args[1])
                        except ProcessExpired as error:
                            print("%s. Exit code: %d" %
                                  (error, error.exitcode))
                        except Exception as error:
                            print("function raised %s" % error)

                if best_accuracy_new < best_accuracy or best_feature_id == -1:
                    break
                else:
                    best_accuracy = best_accuracy_new

                    current_feature_set.remove(best_feature_id)
                    remaining_features.append(best_feature_id)

                    test_auc = get_test_auc(current_feature_set,
                                            X_train_transformed, y_train,
                                            X_test_transformed, y_test,
                                            best_hyperparam_new)
                    print(
                        f2str(current_feature_set, featurenames) +
                        ' cv auc: ' +
                        str(history[frozenset(current_feature_set)]) +
                        ' test auc: ' + str(test_auc))

    mask = np.zeros(X_train_transformed.shape[1], dtype=bool)
    for fc in best_feature_combination:
        mask[fc] = True

    mask_selection = MaskSelection(mask)
    X_train_new = mask_selection.fit_transform(X_train_transformed)
    X_test_new = mask_selection.transform(X_test_transformed)

    return X_train_new, X_test_new
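A hedged usage sketch for parallel; feature_generator is assumed to be an object exposing a fitted pipeline_ and a numeric_features list, as the function body above requires, and all inputs are assumed to be numpy arrays:

# X_train, y_train, X_test, y_test and feature_generator come from the caller
X_train_sel, X_test_sel = parallel(X_train, y_train,
                                   X_test=X_test, y_test=y_test,
                                   floating=True,
                                   max_number_features=10,
                                   feature_generator=feature_generator,
                                   folds=3, number_cvs=1)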
Example #16
File: evolution.py  Project: BigDaMa/DFS
	def f_clf1(mask):
		model = Pipeline([
			('selection', MaskSelection(mask)),
			('clf', clf)
		])
		return model
Example #17
def f_clf1(mask):
    model = Pipeline([('selection', MaskSelection(mask)),
                      ('clf', LogisticRegression())])
    return model
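Finally, a small end-to-end sketch of the pattern shared by these examples: build a boolean mask, wrap it in the pipeline from Example #17, and cross-validate it. The data is synthetic and the selected columns are arbitrary; MaskSelection is assumed to behave like the sketch after Example #1.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=200, n_features=10, random_state=0)

mask = np.zeros(X.shape[1], dtype=bool)
mask[[0, 2, 5]] = True            # keep three arbitrary features

model = f_clf1(mask)              # MaskSelection followed by LogisticRegression
scores = cross_val_score(model, X, y, cv=5)
print(scores.mean())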