def test_feature_selection_ga(model, features, target):
    fsga = FeatureSelectionGA(model,
                              features,
                              target,
                              ff_obj=FitnessFunction(2))
    result = fsga.generate(5)
    assert 5 == len(result)
Example #2
def ga(target, firma, control):
    print("Ejecutando Genetic Algorithm...")
    # Grupo control 2014 ###################################
    x = firma
    y = control.loc[:, target]
    model = linear_model.LinearRegression()
    ff = CustomFitnessFunctionClass(n_total_features=x.shape[1],
                                    n_splits=3,
                                    alpha=0.05)
    fsga = FeatureSelectionGA(model, x, y, ff_obj=ff)
    pop = fsga.generate(5000)
    print(pop)

    return
Example #3
        skf = StratifiedKFold(n_splits=self.n_splits)
        for train_index, test_index in skf.split(x, y):
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if x_train.shape[0] != y_train.shape[0]:
                raise Exception("x_train and y_train have mismatched sample counts")
            model.fit(x_train, y_train)
            predicted_y = model.predict(x_test)
            cv_set[test_index] = predicted_y

        # Weighted combination of the misclassification term (1 - P)
        # and the fraction of features currently selected.
        P = accuracy_score(y, cv_set)
        fitness = (alpha * (1.0 - P) + (1.0 - alpha) *
                   (1.0 - (x.shape[1]) / total_features))
        return fitness


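The snippet above is the body of a custom fitness function's calculate_fitness method. A minimal sketch of the wrapper class it implies is shown below; the constructor signature mirrors how CustomFitnessFunctionClass is instantiated in these examples, and the attribute and method names are assumptions rather than the library's own definition.

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


class CustomFitnessFunctionClass:
    # Hypothetical wrapper around the cross-validated fitness calculation shown above.
    def __init__(self, n_total_features, n_splits=3, alpha=0.05):
        self.n_total_features = n_total_features
        self.n_splits = n_splits
        self.alpha = alpha

    def calculate_fitness(self, model, x, y):
        # Out-of-fold predictions for the currently selected feature subset x.
        cv_set = np.repeat(0.0, x.shape[0])
        skf = StratifiedKFold(n_splits=self.n_splits)
        for train_index, test_index in skf.split(x, y):
            model.fit(x[train_index], y[train_index])
            cv_set[test_index] = model.predict(x[test_index])
        P = accuracy_score(y, cv_set)
        # Same trade-off as above: misclassification term plus feature-count term.
        return (self.alpha * (1.0 - P) +
                (1.0 - self.alpha) * (1.0 - x.shape[1] / self.n_total_features))
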
X, y = make_classification(n_samples=100,
                           n_features=15,
                           n_classes=3,
                           n_informative=4,
                           n_redundant=1,
                           n_repeated=2,
                           random_state=1)

model = linear_model.LogisticRegression(solver='lbfgs', multi_class='auto')
ff = CustomFitnessFunctionClass(n_total_features=X.shape[1],
                                n_splits=3,
                                alpha=0.05)
fsga = FeatureSelectionGA(model, X, y, ff_obj=ff)
pop = fsga.generate(1000)
Example #4
import numpy as np
from sklearn.linear_model import LogisticRegression
from feature_selection_ga import FeatureSelectionGA
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_validate

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target
x_train, x_test, y_train, y_test = train_test_split(X_digits,
                                                    y_digits,
                                                    test_size=0.2)
model = LogisticRegression(multi_class="multinomial", solver="newton-cg")
fsga = FeatureSelectionGA(model, x_train, y_train, verbose=1)
pop = fsga.generate(10)

# compare best individual with random individual
cross_validate(model,
               X_digits[:, np.array(fsga.best_ind) == 1],
               y_digits,
               scoring="balanced_accuracy",
               cv=5,
               return_train_score=True)

rand = np.random.choice(X_digits.shape[1],
                        size=sum(fsga.best_ind),
                        replace=False)
cross_validate(model,
               X_digits[:, rand],
               y_digits,
               scoring="balanced_accuracy",
               cv=5,
               return_train_score=True)
Example #5
from sklearn.datasets import make_classification
from sklearn import linear_model
from feature_selection_ga import FeatureSelectionGA
import fitness_function as ff
X, y = make_classification(n_samples=100,
                           n_features=15,
                           n_classes=3,
                           n_informative=4,
                           n_redundant=1,
                           n_repeated=2,
                           random_state=1)

model = linear_model.LogisticRegression(solver='lbfgs', multi_class='auto')
fsga = FeatureSelectionGA(model, X, y, ff_obj=ff.FitnessFunction())
pop = fsga.generate(100)

#print(pop)
Example #6
print('The number of unqualified samples after balancing:', b)
print('~~~~~')

# The eighth step: classification
model = AdaBoostClassifier()



# Setting the hyper-parameters
GA_population_number = 10
GA_generation = 10
GA_crossover_probability = 0.8
GA_mutate_probability = 0.4

# The ninth step: GA
fsga = FeatureSelectionGA(model,
                          x=X_train,
                          y=y_train,
                          x_test=X_test,
                          y_test=y_test,
                          x_development=X_development,
                          y_development=y_development)
pop = fsga.generate(n_pop=GA_population_number,
                    cxpb=GA_crossover_probability,
                    mutxpb=GA_mutate_probability,
                    ngen=GA_generation)

end_time = time.time()

print(fsga.best_ind)
list_tezheng_selected = []  # "tezheng" is pinyin for "feature"
for i in range(len(fsga.best_ind)):
    if fsga.best_ind[i] == 1:
        list_tezheng_selected.append(feature[i])

print(list_tezheng_selected)
print("---lasted %s seconds ---" % str(time.time() - start_time))


scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(X)
X_imputed_train = scaler.transform(X)
#X_imputed_test = scaler.transform(X)
X = pd.DataFrame(X_imputed_train, columns=X.columns)

cv = StratifiedKFold(10)
svm = SVC(kernel='linear',
          C=1.0,
          class_weight='balanced',
          verbose=True,
          probability=True,
          random_state=42)
viz1 = RFECV(svm, cv=cv, scoring='f1', verbose=True)
viz1.fit(X, Y)
X = X[X.columns[viz1.support_]]
print("Optimal number of features : %d" % viz1.n_features_)
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(viz1.grid_scores_) + 1), viz1.grid_scores_)
plt.show()

model = SVC(kernel='linear', C=1.0, class_weight='balanced')
fsga = FeatureSelectionGA(model, X.values, Y.values, verbose=1, cv_split=10)
pop = fsga.generate(n_pop=870, mutxpb=0.01, cxpb=0.4, ngen=10)
bestfeatsind = np.asarray(fsga.best_ind, dtype=bool)
X_GA = X[X.columns[bestfeatsind]]

# Combine the GA-selected and reliefF-selected feature columns
X_final = pd.concat([X_GA, X_reliefF], axis=1)
Example #8
np.save('./tuh_processedfeature/tuh_parameter_results/rf_fpr_no_selection.npy',
        fpr)
np.save('./tuh_processedfeature/tuh_parameter_results/rf_tpr_no_selection.npy',
        tpr)
np.save('./tuh_processedfeature/tuh_parameter_results/rf_auc_no_selection.npy',
        auc)

#%%
print(X_train.shape)
clf_lr = LogisticRegression(C=1.0, penalty='l2')
clf_svm = SVC(kernel='rbf', gamma=1e-3, C=3.0)
clf_dt = RandomForestClassifier()

clf = clf_lr  #choose one clf
fsga = FeatureSelectionGA(clf, X_train, y_train)
res = fsga.generate(20, cxpb=0.5, mutxpb=0.2, ngen=10)
res = np.array(res)
#%%
np.save(
    './processedfeature_Dog_3/parameter_results_ga/svm_selection_res_Dog_3.npy',
    res)

#%%

#with selection

X_train_sel = X_train[:, res[0].astype(bool)]
X_test_sel = X_test[:, res[0].astype(bool)]

X_train_sel_sca = X_train_sel
Example #9
shuffled_labels = y[permutation]

num_train = 941
X_train = shuffled_dataset[0:num_train, :]
y_train = shuffled_labels[0:num_train]
num_test = 313
X_test = shuffled_dataset[num_train:num_train + num_test, :]
y_test = shuffled_labels[num_train:num_train + num_test]
X_development = shuffled_dataset[num_train + num_test:1567, :]
y_development = shuffled_labels[num_train + num_test:1567]

print(X_train.shape)
print("The number of training imbsamples:%d" % X_train.shape[0])
X_train, y_train = balance_data(X_train, y_train, methond=imb_methond)
print(X_train.shape)
print("The number of balanced training samples:%d" % X_train.shape[0])

fsga = FeatureSelectionGA(Classifier_model,
                          x=X_train,
                          y=y_train,
                          x_test=X_test,
                          y_test=y_test,
                          x_development=X_development,
                          y_development=y_development)
pop = fsga.generate(n_pop=GA_pop_num,
                    cxpb=GA_cross_prob,
                    mutxpb=GA_mutate_prob,
                    ngen=GA_pop_generation)
# Select the best individual from the final population and fit the initialized model
gene = fsga.best_ind
print_last_feature(gene)
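
The comment above mentions fitting the initialized model on the features chosen by the best individual; a minimal sketch of that follow-up step, assuming Classifier_model is a scikit-learn style estimator and gene is the 0/1 feature mask stored in fsga.best_ind:

import numpy as np

# Hypothetical follow-up: keep only the columns the best individual switched on.
mask = np.asarray(gene, dtype=bool)
Classifier_model.fit(X_train[:, mask], y_train)
print("Development accuracy with selected features:",
      Classifier_model.score(X_development[:, mask], y_development))
print("Test accuracy with selected features:",
      Classifier_model.score(X_test[:, mask], y_test))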