from feature_selection_ga import FeatureSelectionGA, FitnessFunction


def test_feature_selection_ga(model, features, target):
    # Evolve a population of 5 individuals with a 2-split fitness function
    # and check that the returned population has the requested size.
    fsga = FeatureSelectionGA(model, features, target, ff_obj=FitnessFunction(2))
    result = fsga.generate(5)
    assert 5 == len(result)
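# Hypothetical driver for the test above, assuming the packaged FitnessFunction
# and a small synthetic dataset; not part of the original test module.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=60, n_features=10, random_state=0)
test_feature_selection_ga(LogisticRegression(solver="lbfgs"), X, y)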
def ga(target, firma, control):
    print("Running Genetic Algorithm...")
    # Control group 2014 ###################################
    x = firma
    y = control.loc[:, target]
    model = linear_model.LinearRegression()
    ff = CustomFitnessFunctionClass(n_total_features=x.shape[1], n_splits=3, alpha=0.05)
    fsga = FeatureSelectionGA(model, x, y, ff_obj=ff)
    pop = fsga.generate(5000)
    print(pop)
    return
import numpy as np
from sklearn import linear_model
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from feature_selection_ga import FeatureSelectionGA


class CustomFitnessFunctionClass:
    def __init__(self, n_total_features, n_splits=5, alpha=0.01, *args, **kwargs):
        self.n_total_features = n_total_features
        self.n_splits = n_splits
        self.alpha = alpha

    def calculate_fitness(self, model, x, y):
        alpha = self.alpha
        total_features = self.n_total_features
        # Collect out-of-fold predictions over a stratified K-fold split.
        cv_set = np.repeat(-1.0, x.shape[0])
        skf = StratifiedKFold(n_splits=self.n_splits)
        for train_index, test_index in skf.split(x, y):
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if x_train.shape[0] != y_train.shape[0]:
                raise Exception()
            model.fit(x_train, y_train)
            predicted_y = model.predict(x_test)
            cv_set[test_index] = predicted_y
        P = accuracy_score(y, cv_set)
        # Trade out-of-fold accuracy off against the size of the feature subset.
        fitness = (alpha * (1.0 - P)
                   + (1.0 - alpha) * (1.0 - x.shape[1] / total_features))
        return fitness


X, y = make_classification(n_samples=100, n_features=15, n_classes=3,
                           n_informative=4, n_redundant=1, n_repeated=2,
                           random_state=1)
model = linear_model.LogisticRegression(solver='lbfgs', multi_class='auto')
ff = CustomFitnessFunctionClass(n_total_features=X.shape[1], n_splits=3, alpha=0.05)
fsga = FeatureSelectionGA(model, X, y, ff_obj=ff)
pop = fsga.generate(1000)
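# Quick sanity check of the fitness formula with assumed values (alpha = 0.05,
# out-of-fold accuracy P = 0.8, 5 of 15 features selected):
alpha, P = 0.05, 0.8
n_selected, n_total = 5, 15
fitness = alpha * (1.0 - P) + (1.0 - alpha) * (1.0 - n_selected / n_total)
print(fitness)  # 0.05 * 0.2 + 0.95 * (2 / 3) ≈ 0.643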
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate
from feature_selection_ga import FeatureSelectionGA

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target
x_train, x_test, y_train, y_test = train_test_split(X_digits, y_digits, test_size=0.2)

model = LogisticRegression(multi_class="multinomial", solver="newton-cg")
fsga = FeatureSelectionGA(model, x_train, y_train, verbose=1)
pop = fsga.generate(10)

# Compare the best individual with a random individual of the same size.
cv_best = cross_validate(model, X_digits[:, np.array(fsga.best_ind) == 1], y_digits,
                         scoring="balanced_accuracy", cv=5, return_train_score=True)
rand = np.random.choice(X_digits.shape[1], size=sum(fsga.best_ind), replace=False)
cv_rand = cross_validate(model, X_digits[:, rand], y_digits,
                         scoring="balanced_accuracy", cv=5, return_train_score=True)
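# Hedged follow-up sketch (cv_best / cv_rand are names introduced above, not in
# the original snippet): report the mean balanced accuracy of the GA-selected
# mask versus the random mask of the same size.
print("GA-selected features:", cv_best["test_score"].mean())
print("random features:     ", cv_rand["test_score"].mean())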
from sklearn.datasets import make_classification
from sklearn import linear_model
from feature_selection_ga import FeatureSelectionGA
import fitness_function as ff

X, y = make_classification(n_samples=100, n_features=15, n_classes=3,
                           n_informative=4, n_redundant=1, n_repeated=2,
                           random_state=1)
model = linear_model.LogisticRegression(solver='lbfgs', multi_class='auto')
fsga = FeatureSelectionGA(model, X, y, ff_obj=ff.FitnessFunction())
pop = fsga.generate(100)
# print(pop)
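# A minimal follow-up sketch, assuming fsga.best_ind is the 0/1 feature mask
# used in the other snippets here: refit the model on the selected columns.
import numpy as np

mask = np.asarray(fsga.best_ind, dtype=bool)
model.fit(X[:, mask], y)
print("selected feature indices:", np.flatnonzero(mask))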
print('the number of unqualified after balanced:', b)
print('~~~~~')

# The eighth step: classification
model = AdaBoostClassifier()

# Setting the hyper-parameters
GA_population_number = 10
GA_generation = 10
GA_crossover_probability = 0.8
GA_mutate_probability = 0.4

# The ninth step: GA
fsga = FeatureSelectionGA(model, x=X_train, y=y_train, x_test=X_test, y_test=y_test,
                          x_development=X_development, y_development=y_development)
pop = fsga.generate(n_pop=GA_population_number, cxpb=GA_crossover_probability,
                    mutxpb=GA_mutate_probability, ngen=GA_generation)
end_time = time.time()

print(fsga.best_ind)
list_tezheng_selected = []
for i in range(100):
    if fsga.best_ind[i] == 1:
        list_tezheng_selected.append(feature[i])
print(list_tezheng_selected)
print("---lasted %s seconds ---" % str(time.time() - start_time))
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(X)
X_imputed_train = scaler.transform(X)
# X_imputed_test = scaler.transform(X)
X = pd.DataFrame(X_imputed_train, columns=X.columns)

cv = StratifiedKFold(10)
svm = SVC(kernel='linear', C=1.0, class_weight='balanced', verbose=True,
          probability=True, random_state=42)
viz1 = RFECV(svm, cv=cv, scoring='f1', verbose=True)
viz1.fit(X, Y)
X = X[X.columns[viz1.support_]]
print("Optimal number of features : %d" % viz1.n_features_)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(viz1.grid_scores_) + 1), viz1.grid_scores_)
plt.show()

model = SVC(kernel='linear', C=1.0, class_weight='balanced')
fsga = FeatureSelectionGA(model, X.values, Y.values, verbose=1, cv_split=10)
pop = fsga.generate(n_pop=870, mutxpb=0.01, cxpb=0.4, ngen=10)
bestfeatsind = np.asarray(fsga.best_ind, dtype=bool)
X_GA = X[X.columns[bestfeatsind]]
# pd.concat takes a list of objects; pass both frames in a list.
X_final = pd.concat([X_GA, X_reliefF], axis=0, ignore_index=True)
np.save('./tuh_processedfeature/tuh_parameter_results/rf_fpr_no_selection.npy', fpr)
np.save('./tuh_processedfeature/tuh_parameter_results/rf_tpr_no_selection.npy', tpr)
np.save('./tuh_processedfeature/tuh_parameter_results/rf_auc_no_selection.npy', auc)

#%%
print(X_train.shape)
clf_lr = LogisticRegression(C=1.0, penalty='l2')
clf_svm = SVC(kernel='rbf', gamma=1e-3, C=3.0)
clf_dt = RandomForestClassifier()
clf = clf_lr  # choose one clf

fsga = FeatureSelectionGA(clf, X_train, y_train)
res = fsga.generate(20, cxpb=0.5, mutxpb=0.2, ngen=10)
res = np.array(res)

#%%
np.save('./processedfeature_Dog_3/parameter_results_ga/svm_selection_res_Dog_3.npy', res)

#%%
# With selection
X_train_sel = X_train[:, res[0].astype(bool)]
X_test_sel = X_test[:, res[0].astype(bool)]
X_train_sel_sca = X_train_sel
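# Hedged evaluation sketch (not part of the original cell): fit the chosen
# classifier on the selected training features and score it on the held-out
# test features, using the standard scikit-learn fit/score API.
clf.fit(X_train_sel_sca, y_train)
print("test accuracy with GA-selected features:", clf.score(X_test_sel, y_test))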
shuffled_labels = y[permutation]

num_train = 941
X_train = shuffled_dataset[0:num_train, :]
y_train = shuffled_labels[0:num_train]
num_test = 313
X_test = shuffled_dataset[num_train:num_train + num_test, :]
y_test = shuffled_labels[num_train:num_train + num_test]
X_development = shuffled_dataset[num_train + num_test:1567, :]
y_development = shuffled_labels[num_train + num_test:1567]

print(X_train.shape)
print("The number of imbalanced training samples: %d" % X_train.shape[0])
X_train, y_train = balance_data(X_train, y_train, methond=imb_methond)
print(X_train.shape)
print("The number of balanced training samples: %d" % X_train.shape[0])

fsga = FeatureSelectionGA(Classifier_model, x=X_train, y=y_train, x_test=X_test,
                          y_test=y_test, x_development=X_development,
                          y_development=y_development)
pop = fsga.generate(n_pop=GA_pop_num, cxpb=GA_cross_prob, mutxpb=GA_mutate_prob,
                    ngen=GA_pop_generation)

# Select the best individual from the final population and fit the initialized model
gene = fsga.best_ind
print_last_feature(gene)
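# Hedged sketch of the refit step the comment above describes (assumes gene is
# a 0/1 mask over the feature columns, as in the other snippets here):
import numpy as np

mask = np.asarray(gene, dtype=bool)
Classifier_model.fit(X_train[:, mask], y_train)
print("development accuracy:",
      Classifier_model.score(X_development[:, mask], y_development))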