def main():
    X, y = loadDataSet()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

    vec = DictVectorizer()
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))
    print('dimension: ', len(vec.feature_names_))

    dt = DecisionTreeClassifier(criterion='entropy')
    dt.fit(X_train, y_train)
    print('No feature selection: ', dt.score(X_test, y_test))

    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
    X_train_fs = fs.fit_transform(X_train, y_train)
    dt.fit(X_train_fs, y_train)
    X_test_fs = fs.transform(X_test)
    print('20% feature selection: ', dt.score(X_test_fs, y_test))

    # Cross-validate each fixed percentile of selected features and plot the results
    percentiles = range(1, 100, 2)
    results = []
    for i in percentiles:
        fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
        results = np.append(results, scores.mean())
    print('Result: \n', results)
    opt = np.where(results == results.max())[0][0]
    print(opt)
    print('Optimal percentile of features: %d' % percentiles[opt])

    # Build and evaluate the model with the best-performing feature subset
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=7)
    X_train_fs = fs.fit_transform(X_train, y_train)
    dt.fit(X_train_fs, y_train)
    X_test_fs = fs.transform(X_test)
    s = dt.score(X_test_fs, y_test)
    print('The best selected: ', s)

    plt.plot(percentiles, results)
    plt.xlabel('percentiles of feature')
    plt.ylabel('accuracy')
    plt.show()
def trainClassifier(year):
    # Use optimal parameters from train.py (vary around the optimal values for randomness);
    # draw scalars (no size argument) so the hyperparameters are plain ints/floats
    estimators = math.floor(np.random.uniform(70, 110))
    max_depth = math.floor(np.random.uniform(4, 5))
    max_features = math.floor(np.random.uniform(10, 30))
    learning_rate = np.random.uniform(5, 20) / 100.0
    params = {
        "n_estimators": estimators,
        "max_depth": max_depth,
        "max_features": max_features,
        "learning_rate": learning_rate
    }

    # Create a gradient boosting regressor from these parameters
    clf = GradientBoostingRegressor(**params)

    # Train on all seasons except the current year
    seasons = [2015, 2016, 2017, 2018, 2019]
    seasons.remove(year)

    # Build team vectors and format training data
    data = buildTeamVectors(seasons=seasons)
    X_train, y_train = formatTrainingData(data, seasons=seasons)

    # Normalize X_train
    X_train = preprocessing.normalize(X_train)

    # Remove columns with low correlation to the label outcome
    selector = feature_selection.SelectPercentile(
        feature_selection.mutual_info_classif, percentile=50).fit(X_train, y_train)
    X_train = selector.transform(X_train)

    # Train the model on the training data
    clf.fit(X_train, y_train)

    # Return the fitted model together with the fitted selector
    return clf, selector
def _train(self):
    x = self._train_features
    y = self._train_outputs
    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(
            columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
        )),
        ('scale', preprocessing.StandardScaler(
            with_mean=True, with_std=True
        )),
        # Pipeline step names must be unique, so the two selection steps get distinct names
        ('select_pct', feature_selection.SelectPercentile(
            percentile=59,
            score_func=feature_selection.mutual_info_classif
        )),
        ('select_k', feature_selection.SelectKBest(
            k=101,
            score_func=feature_selection.f_classif
        )),
        # The estimator class is needed here, not the locally_linear_embedding function
        ('embed', manifold.LocallyLinearEmbedding(
            n_neighbors=6,
            n_components=101,
            eigen_solver='auto',
            method='standard'
        )),
    ])
    # Every step is a transformer; LocallyLinearEmbedding has no predict method,
    # so the fitted transform is exposed instead of pipe.predict
    pipe.fit_transform(x, y)
    self._model = pipe.transform
def _train(self):
    x = self._train_features
    y = self._train_outputs
    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(
            columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
        )),
        ('scale', preprocessing.StandardScaler(
            with_mean=True, with_std=False
        )),
        ('reduce', decomposition.FastICA(
            n_components=40, fun='exp', random_state=1742,
        )),
        ('select', feature_selection.SelectPercentile(
            percentile=57,
            score_func=feature_selection.mutual_info_classif,
        )),
        ('estim', naive_bayes.GaussianNB()),
    ])
    pipe.fit(x, y)
    self._model = pipe.predict
def _train(self):
    x = self._train_features
    y = self._train_outputs
    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(
            columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
        )),
        ('scale', preprocessing.StandardScaler(
            with_mean=True, with_std=False
        )),
        ('select', feature_selection.SelectPercentile(
            percentile=73,
            score_func=feature_selection.f_classif
        )),
        ('estim', neighbors.KNeighborsClassifier(
            n_neighbors=16, weights='distance', metric='euclidean', n_jobs=-1
        ))
    ])
    pipe.fit(x, y)
    self._model = pipe.predict
def select_percentile(X_feature, y, percentile):
    selector = fs.SelectPercentile(percentile=percentile, score_func=fs.f_classif)
    selector.fit(X_feature, y)
    # Report the univariate scores as -log10 p-values (larger = more significant)
    results = -np.log10(selector.pvalues_)
    # The selector is already fitted, so a plain transform is enough (refitting would be redundant)
    X_transformed = selector.transform(X_feature).copy()
    return X_transformed, results
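# Hedged usage sketch for select_percentile above, assuming `fs` aliases sklearn.feature_selection
# and numpy is imported as np in this module; the iris data and variable names are illustrative
# additions, not taken from the original code.
from sklearn import feature_selection as fs
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()
X_half, neg_log_pvalues = select_percentile(iris.data, iris.target, percentile=50)
print(X_half.shape)        # (150, 2): half of the four iris columns are kept
print(neg_log_pvalues)     # -log10 p-values; larger means stronger univariate association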
def _train(self):
    x = self._train_features
    y = self._train_outputs
    pipe = pipeline.Pipeline([
        # ('kselect', feature_selection.SelectKBest(feature_selection.f_regression, k=115)),
        ('drop', transformers.ColumnDropper(
            columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
        )),
        ('scale', preprocessing.StandardScaler(
            with_mean=True, with_std=True
        )),
        ('select', feature_selection.SelectPercentile(
            percentile=85,
            score_func=feature_selection.mutual_info_classif
        )),
        ('estim', svm.NuSVC(
            nu=0.0525, kernel='rbf', gamma='auto', shrinking=True,
            class_weight=None, random_state=1742
        )),
    ])
    pipe.fit(x, y)
    self._model = pipe.predict
def _train(self):
    x = self._train_features
    y = self._train_outputs
    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(columns=(6, 7, 8, 11, 12, 13, 14))),
        ('scale', preprocessing.StandardScaler(
            with_mean=True, with_std=False  # this is not a typo!
        )),
        # ('scale', preprocessing.RobustScaler(
        #     with_centering=True, with_scaling=False, quantile_range=(1.0, 99.0)
        # )),
        ('expand', preprocessing.PolynomialFeatures(degree=2, interaction_only=False,
                                                    include_bias=False)),
        ('select', feature_selection.SelectPercentile(
            percentile=98, score_func=feature_selection.f_classif)),
        ('estim', discriminant_analysis.QuadraticDiscriminantAnalysis(
            reg_param=0.0043))
    ])
    pipe.fit(x, y)
    self._model = pipe.predict
def crossValidate(clf, X, y, k, percent=50):
    # Keep track of the performance of the model on each fold in the scores array
    scores = []
    # Create the object to split the data
    skf = StratifiedKFold(n_splits=k)
    count = 1
    # Iterate through the training and testing data from each of the k-fold splits
    for train_index, test_index in skf.split(X, y):
        # Get our training and testing data to use from the split function
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Remove columns with low correlation to the label outcome
        selector = feature_selection.SelectPercentile(
            feature_selection.mutual_info_classif, percentile=percent).fit(X_train, y_train)
        X_train = selector.transform(X_train)
        # Note the columns we remove must be dropped from X_test as well here
        X_test = selector.transform(X_test)
        # Normalize data
        X_train = preprocessing.normalize(X_train)
        X_test = preprocessing.normalize(X_test)
        # Fit on the normalized training data
        clf.fit(X_train, y_train)
        # Update the scores array with the performance on the testing data
        y_pred = clf.predict(X_test)
        # Threshold the continuous predictions into binary classifications
        y_pred[y_pred >= 0.5] = 1
        y_pred[y_pred < 0.5] = 0
        # The prediction function will vary depending on the metric
        accuracy = metrics.accuracy_score(y_test, y_pred)
        print(accuracy)
        scores.append(accuracy)
        count += 1
    # Return the average performance across all fold splits.
    return np.array(scores).mean()
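# Hedged usage sketch for crossValidate: the regressor choice and the synthetic data below
# are illustrative assumptions, not from the original code; note the helper itself thresholds
# the regressor's continuous predictions at 0.5 before scoring accuracy.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, n_features=25, random_state=0)
mean_acc = crossValidate(GradientBoostingRegressor(), X_demo, y_demo, k=5, percent=50)
print('mean CV accuracy:', mean_acc)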
def process_my_christine(Xtrain, ytrain, Xval, Xtest, params):
    print('ITS A MY CHRISTINE TIME !!!')
    t0 = time.time()
    modelrf = pipeline.Pipeline([
        ('feature_selection', feature_selection.SelectPercentile(
            percentile=30, score_func=feature_selection.f_classif)),
        ('classification', RandomForestClassifier(n_estimators=200, random_state=1,
                                                  n_jobs=params['n_jobs']))
    ])
    modelrf.fit(Xtrain, ytrain)
    print('RF DONE')
    print((time.time() - t0) / 60.)
    ytestrf = modelrf.predict_proba(Xtest)[:, 1]
    yvalrf = modelrf.predict_proba(Xval)[:, 1]
    ytestfinal = ytestrf
    yvalfinal = yvalrf
    return yvalfinal, ytestfinal
def tuning(X_train, y_train):
    param = {
        'n_estimators': range(30, 50, 2),
        'max_depth': range(2, 7, 1)
    }
    if mn == 'gbr':
        X_train.fillna(0, inplace=True)
        params = {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.01, 'loss': 'ls'}
        model = GradientBoostingRegressor(**params)
    else:
        model = xgb.XGBRegressor(learning_rate=0.01, n_estimators=500, max_depth=4,
                                 silent=True, objective='reg:gamma')
    # clf = GridSearchCV(estimator=model, param_grid=param, scoring='r2', cv=10)
    # clf.fit(X_train, y_train)
    # print(clf.grid_scores_, clf.best_params_, clf.best_score_)

    percentiles = range(1, 100, 2)
    results = []
    X_train.fillna(0, inplace=True)
    for i in percentiles:
        fs = feature_selection.SelectPercentile(feature_selection.f_regression, percentile=i)
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_val_score(model, X_train_fs, y_train, cv=5)
        results = np.append(results, scores.mean())
    print(results)
    # Take the first index at which the mean CV score is maximal
    opt = np.where(results == results.max())[0][0]
    print(percentiles[opt])
def _train(self):
    x = self._train_features
    y = self._train_outputs
    pipe = pipeline.Pipeline([
        ('drop', transformers.ColumnDropper(
            columns=(0, 3, 5, 14, 26, 35, 40, 65, 72, 95, 99, 104, 124)
        )),
        ('scale', preprocessing.StandardScaler(
            with_mean=True, with_std=False
        )),
        ('select', feature_selection.SelectPercentile(
            percentile=54,
            score_func=feature_selection.mutual_info_classif
        )),
        ('estim', semi_supervised.LabelPropagation(
            kernel='rbf', alpha=0.65, n_neighbors=4, n_jobs=-1
        )),
    ])
    pipe.fit(x, y)
    self._transduction = pipe.named_steps['estim'].transduction_
    self._model = pipe.predict
def univariate_feature_selection(option, opt, value, parser):
    n_samples = len(y)
    x = np.reshape(X, (n_samples, -1))
    x = np.hstack((x, 2 * np.random.random((n_samples, 400))))

    transform = feature_selection.SelectPercentile(feature_selection.f_classif)
    clf = Pipeline([('anova', transform), ('svc', svm.SVC(C=1.0))])

    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)
    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        # Compute cross-validation score using 1 CPU
        this_scores = cross_val_score(clf, x, y, n_jobs=1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Performance of the SVM-Anova varying the percentile of features selected')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction rate')
    plt.axis('tight')
    plt.show()
def comput_coefs(self, X, y, size):
    cv = KFold(2)  # cross-validation generator for model selection
    ridge = BayesianRidge()
    cachedir = tempfile.mkdtemp()
    mem = Memory(cachedir=cachedir, verbose=1)

    # Ward agglomeration followed by BayesianRidge
    connectivity = grid_to_graph(n_x=size, n_y=size)
    ward = FeatureAgglomeration(n_clusters=10, connectivity=connectivity, memory=mem)
    clf = Pipeline([('ward', ward), ('ridge', ridge)])
    # Select the optimal number of parcels with grid search
    clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
    clf.fit(X, y)  # set the best parameters
    coef_ = clf.best_estimator_.steps[-1][1].coef_
    coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
    coef_agglomeration_ = coef_.reshape(size, size)

    # Anova univariate feature selection followed by BayesianRidge
    f_regression = mem.cache(feature_selection.f_regression)  # caching function
    anova = feature_selection.SelectPercentile(f_regression)
    clf = Pipeline([('anova', anova), ('ridge', ridge)])
    # Select the optimal percentage of features with grid search
    clf = GridSearchCV(clf, {'anova__percentile': [5, 10, 20]}, cv=cv)
    clf.fit(X, y)  # set the best parameters
    coef_ = clf.best_estimator_.steps[-1][1].coef_
    coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_.reshape(1, -1))
    coef_selection_ = coef_.reshape(size, size)

    return dict(
        coef_selection_=coef_selection_,
        coef_agglomeration_=coef_agglomeration_,
        cachedir=cachedir
    )
def svm_classifier(X, y, is_default=True):
    from sklearn.svm import SVC
    if is_default:
        model = SVC(probability=True)
        model.fit(X, y)
        return model
    else:
        # Note: the kernel grid must be a list/tuple; a bare ('rbf') is just a string
        param_grid = {
            'kernel': ['rbf'],
            'C': [1e-2, 1e-1, 1, 10],
            'gamma': [1e-4, 1e-3, 1e-2]
        }
        fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
        x_train_fs = fs.fit_transform(X, y)
        model = SVC(probability=True)
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy',
                                   verbose=1, n_jobs=-1)
        grid_search.fit(x_train_fs, y)
        best_parameters = grid_search.best_estimator_.get_params()
        # model with the best parameters
        model = SVC(kernel=best_parameters['kernel'], C=best_parameters['C'],
                    gamma=best_parameters['gamma'], probability=True)
        model.fit(x_train_fs, y)
        return model
def feature_select_per(features_train, target_train, features_test, target_test,
                       est, lr, depth, subsample, colsamplebt):
    # Now that the model has been tuned, use percentile-based feature selection
    from sklearn import feature_selection
    acc_list_train = []
    acc_list_test = []
    per_list = []
    percentile = range(1, 101)  # range(10, 100)
    # percentile = [22]
    # identify the percentile that will produce the best results
    for per in percentile:
        # initialize SelectPercentile with the current percentile
        fs = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=per)
        feature_model = fs.fit(features_train, target_train)
        features_train_new = feature_model.transform(features_train)
        features_test_new = feature_model.transform(features_test)

        xgb = xgboost.XGBClassifier(n_estimators=est, learning_rate=lr, gamma=0,
                                    subsample=subsample, colsample_bytree=colsamplebt,
                                    max_depth=depth)
        xgb.fit(features_train_new, target_train)
        pred_test = xgb.predict(features_test_new)
        pred_train = xgb.predict(features_train_new)
        predictions_train = [round(value) for value in pred_train]
        predictions_test = [round(value) for value in pred_test]
        train_accuracy = accuracy_score(target_train, predictions_train)
        test_accuracy = accuracy_score(target_test, predictions_test)
        print(per)
        print(train_accuracy)
        print(test_accuracy)
        per_list.append(per)
        acc_list_train.append(train_accuracy)
        acc_list_test.append(test_accuracy)

    per_results = pd.DataFrame({
        'per': per_list,
        'acc_train': acc_list_train,
        'acc_test': acc_list_test
    })
    per_results.to_csv('per_results.csv')
    return per_results
def rf_classifier(X, y, is_default=True):
    from sklearn.ensemble import RandomForestClassifier
    if is_default:
        # RandomForestClassifier has no `probability` argument; predict_proba is always available
        model = RandomForestClassifier()
        model.fit(X, y)
        return model
    else:
        param_grid = {
            'n_estimators': range(10, 100, 10),
            'max_features': np.linspace(0.5, 0.9, num=5).tolist(),
            'max_depth': [10, 50, None],
        }
        fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
        x_train_fs = fs.fit_transform(X, y)
        model = RandomForestClassifier()
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy',
                                   verbose=1, n_jobs=-1)
        grid_search.fit(x_train_fs, y)
        best_parameters = grid_search.best_estimator_.get_params()
        # model with the best parameters
        model = RandomForestClassifier(
            n_estimators=best_parameters['n_estimators'],
            max_features=best_parameters['max_features'],
            max_depth=best_parameters['max_depth'])
        model.fit(x_train_fs, y)
        return model
def run_select_percentile(file_path):
    """
    Returns a list of selected feature names.

    :param file_path: Path for the training matrix of extracted features to select from
    :return: List of selected feature names
    """
    # Setting up dataset
    data = pd.read_csv(file_path)
    x_train = data.drop('Label', axis=1)
    y_train = data['Label']

    # Select features according to a percentile of the highest scores.
    feature_selector = fs.SelectPercentile(score_func=fs.f_classif, percentile=10)
    feature_selector.fit_transform(x_train, y_train)
    mask = feature_selector.get_support()

    # List of all feature names, taken from the feature frame so names align with the mask
    feature_names = list(x_train.columns.values)

    # List of selected feature names
    new_feature_names = []
    for feature_is_selected, feature in zip(mask, feature_names):
        if feature_is_selected:
            new_feature_names.append(feature)
    return new_feature_names
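# Hedged usage sketch for run_select_percentile: the CSV path is hypothetical and the file is
# assumed to contain a 'Label' column as the helper's docstring implies; `pd` and `fs` are
# assumed to be pandas and sklearn.feature_selection in this module.
if __name__ == '__main__':
    selected = run_select_percentile('train_features.csv')  # hypothetical path
    print('Kept %d features in the top 10%%:' % len(selected))
    print(selected)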
def main():
    X, Y = loadData()
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=33)

    vec = DictVectorizer()
    X_train = vec.fit_transform(X_train.to_dict(orient='records'))
    X_test = vec.transform(X_test.to_dict(orient='records'))
    print(len(vec.feature_names_))

    dt = DecisionTreeClassifier(criterion='entropy')
    dt.fit(X_train, Y_train)
    print(dt.score(X_test, Y_test))

    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
    X_train_fs = fs.fit_transform(X_train, Y_train)
    dt = DecisionTreeClassifier(criterion='entropy')
    dt.fit(X_train_fs, Y_train)
    X_test_fs = fs.transform(X_test)
    print(dt.score(X_test_fs, Y_test))

    percentiles = range(1, 100, 2)
    results = []
    for i in percentiles:
        fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
        X_train_fs = fs.fit_transform(X_train, Y_train)
        scores = cross_val_score(dt, X_train_fs, Y_train, cv=5)
        results = np.append(results, scores.mean())
    print(results)

    opt = np.where(results == results.max())[0]
    print(opt)
    print('Optimal percentile of features %d' % percentiles[opt[0]])

    pl.plot(percentiles, results)
    pl.xlabel('percent of features')
    pl.ylabel('accuracy')
    pl.show()

    fs = feature_selection.SelectPercentile(feature_selection.chi2,
                                            percentile=percentiles[opt[0]])
    X_train_fs = fs.fit_transform(X_train, Y_train)
    # dt = DecisionTreeClassifier(criterion='entropy')
    dt.fit(X_train_fs, Y_train)
    X_test_fs = fs.transform(X_test)
    print(dt.score(X_test_fs, Y_test))
def main():
    train = read_bagofwords_dat(file_train, num_train)
    test = read_bagofwords_dat(file_test, num_test)

    train_target = []
    for i in range(0, num_train):
        if i < num_train / 2:
            train_target.append(0)  # notspam
        else:
            train_target.append(1)  # spam
    test_target = []
    for i in range(0, num_test):
        if i < num_test / 2:
            test_target.append(0)  # notspam
        else:
            test_target.append(1)  # spam

    if select_features:
        selector = feature_selection.SelectPercentile(feature_selection.f_classif,
                                                      percentile=percentile)
        # selector = feature_selection.SelectKBest(feature_selection.f_classif, k=10)
        train = selector.fit_transform(train, train_target)
        test = selector.transform(test)
        # mask = selector.get_support()
        # print_features(mask)
        print("Finished doing %d percentile feature selection" % (percentile))

    classifiers = [
        # (svm.LinearSVC(), "SVML"),
        # (GaussianNB(), "Gaussian"),
        # (MultinomialNB(1.0, False, class_prior), "Multinomial"),
        # (BernoulliNB(1.0, freq_cutoff, False, class_prior), "Bernoulli"),
        # (tree.DecisionTreeClassifier(), "Decision Tree"),
        (AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=3),
                            n_estimators=rounds),
         "Adaboost with %d rounds and max-depth 3 decision tree" % (rounds))
    ]
    for (classifier, name) in classifiers:
        model = classifier.fit(train, train_target)
        # y_pred = model.predict(test)
        if name == "SVML":
            y_scores = model.decision_function(test)
        else:
            y_scores = model.predict_proba(test)[:, 1]
        # FP = 0
        # FN = 0
        # TP = 0
        # for i in range(0, num_test):
        #     if y_pred[i] == "spam" and test_target[i] == "notspam":
        #         FP += 1
        #     if y_pred[i] == "notspam" and test_target[i] == "spam":
        #         FN += 1
        #     if y_pred[i] == "spam" and test_target[i] == "spam":
        #         TP += 1
        # print("%s: FP %d, FN %d, TP %d " % (name, FP, FN, TP))
        print("%s: AUC %f" % (name, roc_auc_score(test_target, y_scores)))
def simon_pipeline(simon_transformer, percentile):
    return Pipeline([
        ('simon', simon_transformer),
        ('scale', MinMaxScaler(feature_range=(-1, 1))),
        ('percent', feature_selection.SelectPercentile(feature_selection.f_classif,
                                                       percentile=percentile)),
    ])
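# Hedged sketch of how simon_pipeline might be composed into a full classifier. Since its last
# step (SelectPercentile) is a transformer, the returned pipeline can be nested as a feature
# stage; the LogisticRegression estimator and the function name simon_classifier are
# illustrative assumptions, not from the original code.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

def simon_classifier(simon_transformer, percentile=50):
    return Pipeline([
        ('features', simon_pipeline(simon_transformer, percentile)),
        ('clf', LogisticRegression(max_iter=1000)),
    ])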
def get_percentile_columns(X, y, percentile=2, score_func=None):
    """
    Method to fetch columns based on SelectPercentile feature selection
    """
    selector = feature_selection.SelectPercentile(score_func=score_func,
                                                  percentile=percentile).fit(X, y)
    return X.columns[selector.get_support()]
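# Hedged usage sketch for get_percentile_columns: it assumes X is a pandas DataFrame (so that
# X.columns exists); the breast-cancer dataset and chi2 score function are illustrative choices,
# not taken from the original code.
import pandas as pd
from sklearn import feature_selection
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
X_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
top_cols = get_percentile_columns(X_df, cancer.target, percentile=10,
                                  score_func=feature_selection.chi2)
print(list(top_cols))  # names of the highest-scoring ~10% of columns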
def optimal_features_select_from_data(X_train, X_test, Y_train, Y_test):
    optimal_percentile, results, percentiles = optimal_percentile_find(
        X_train, X_test, Y_train, Y_test)
    fs = feature_selection.SelectPercentile(
        feature_selection.chi2, percentile=percentiles[optimal_percentile])
    X_train_fs = fs.fit_transform(X_train, Y_train)  # fit the selector on the training data and transform it
    X_test_fs = fs.transform(X_test)  # only transform the test data with the already-fitted selector
    return X_train_fs, X_test_fs
def percentile_filter(X, y, percentile=20):
    selector = fs.SelectPercentile(fs.chi2, percentile=percentile)
    selector.fit(X, y)
    # features = selected_features(selector, feature_names)
    # log('Percentile', len(features))
    # log('X', xt.shape)
    # return pd.DataFrame(xt, columns=features, index=X.index), selector
    return selector
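# Hedged usage sketch for percentile_filter: the fitted selector it returns is applied to both
# splits so train and test keep exactly the same columns. The digits dataset and the split are
# illustrative assumptions; `fs` is assumed to be sklearn.feature_selection in this module.
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X_tr, X_te, y_tr, y_te = train_test_split(digits.data, digits.target, random_state=0)
selector = percentile_filter(X_tr, y_tr, percentile=20)
X_tr_sel = selector.transform(X_tr)
X_te_sel = selector.transform(X_te)
print(X_tr_sel.shape, X_te_sel.shape)  # both keep the same 20% of columns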
def train_pair(train_set, resize_img=300, num_id=5, num_img_id=10, min_sample=False, params=None):
    train_labels = []
    train_hists = []
    for item in train_set:
        print(item)
        train_labels.append(item[0])
        image1_p = item[1][0]
        image2_p = item[1][1]
        image1 = cv.imread(image1_p, 0)
        image2 = cv.imread(image2_p, 0)
        image1_hist = face_descriptors.get_orb_histograms(image1, resize_img)
        image2_hist = face_descriptors.get_orb_histograms(image2, resize_img)
        train_hists.append(np.concatenate((image1_hist, image2_hist)))

    anova_filter = feature_selection.SelectPercentile(feature_selection.f_classif)

    if params is not None:
        c, gamma = params
        models = []
        for c_value in c:
            for g_value in gamma:
                # Initialize SVM
                print("Training SVM: c ", c_value, ",gamma ", g_value)
                clf = SVC(kernel='rbf', decision_function_shape='ovr', C=c_value,
                          gamma=g_value, class_weight='balanced', probability=True,
                          verbose=5)
                clf = make_pipeline(anova_filter, clf)
                # fit the model
                clf.fit(train_hists, train_labels)
                models.append([clf, c_value, g_value])
        return models
    else:
        clf = SVC(kernel='linear', decision_function_shape='ovr', C=1,
                  class_weight='balanced', probability=True, verbose=5)
        clf = make_pipeline(anova_filter, clf)
        # fit the model
        clf.fit(train_hists, train_labels)
        return clf
def plot_BestKFeatures(X_train, y_train):
    '''
    http://nbviewer.ipython.org/github/gmonce/scikit-learn-book/blob/master/Chapter%204%20-%20Advanced%20Features%20-%20Feature%20Engineering%20and%20Selection.ipynb
    Find the best percentile of features to use, using cross-validation on the
    training set, and get the K best features.
    '''
    from sklearn.model_selection import cross_val_score  # sklearn.cross_validation is removed in modern versions
    from sklearn import feature_selection
    from sklearn import tree
    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt = RandomForestClassifier(n_jobs=2, bootstrap=True, n_estimators=250,
                                criterion='gini')  # overrides the decision tree above
    dt = dt.fit(X_train, y_train)

    percentiles = range(1, 95, 5)
    results = []
    for i in percentiles:
        fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)  # original
        # fs = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=i)  # alternative
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_val_score(dt, X_train_fs, y_train, cv=4)
        # print(i, scores.mean())
        results = np.append(results, scores.mean())

    optimal_percentil = np.where(results == results.max())[0][0]
    print("Optimal percentile of features: {0}".format(percentiles[optimal_percentil]), "\n")

    # Plot percentile of features selected VS. cross-validation scores
    import matplotlib.pylab as pl
    pl.figure()
    pl.xlabel("Percentile of features selected")
    pl.ylabel("Cross-validation accuracy")
    pl.plot(percentiles, results)
    print("Mean scores:", results)
    return
def get_fs_model(model, method, train, target=None, cv=None):
    """Connects given model with specified feature selection method and trains
    the final structure.
    """
    if method == "RFE":
        model = fs_scikit.RFE(model, 2, step=5)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    if method == "RFECV":
        model = fs_scikit.RFECV(model, 3, cv=cv)
        if target is not None:
            return model.fit(train, target)
        else:
            return model.fit(train)
    elif method == "linearSVC":
        sel = SelectFromModel(LinearSVC(penalty='l1', dual=False))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "fromModel":
        fm = fs_scikit.SelectFromModel(model)
        if target is not None:
            fm.fit(train, target)
        else:
            fm.fit(train)
        model = Pipeline([('feature_selection', fm), ('data_mining', model)])
    # elif method == "Anova":
    #     # ANOVA SVM-C
    #     anova_filter = fs_scikit.SelectKBest(f_regression, k=5)
    #     model = Pipeline([
    #         ('feature_selection', anova_filter),
    #         ('data_mining', model)
    #     ])
    elif method == "VarianceThreshold":
        sel = fs_scikit.VarianceThreshold(threshold=(.8 * (1 - .8)))
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectPercentile":
        sel = fs_scikit.SelectPercentile(fs_scikit.f_classif, percentile=30)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFpr":
        sel = fs_scikit.SelectFpr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFdr":
        sel = fs_scikit.SelectFdr(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "SelectFwe":
        sel = fs_scikit.SelectFwe(alpha=0.2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    elif method == "ch2":
        sel = fs_scikit.SelectKBest(fs_scikit.chi2, k=2)
        model = Pipeline([('feature_selection', sel), ('data_mining', model)])
    else:
        print("Feature selection method was not found: " + method)
        sys.exit(1)
    return model
def test_features(my_dataset, features_list):
    ### Extract features and labels from dataset for local testing
    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    from sklearn import feature_selection
    # chi2 requires non-negative inputs, hence the abs()
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=20)
    X_train_fs = fs.fit_transform(list(map(abs, features)), labels)
    print([features_list[i] for i in np.argsort(fs.scores_)[::-1]])
    test_code(features, labels)
def percentile_k_features(data, k=20):
    # Split the frame into features and the 'SalePrice' label
    X = data.iloc[:, :-1]
    y = data['SalePrice']
    # Keep the top k percent of features ranked by univariate f_regression scores
    selector = feature_selection.SelectPercentile(f_regression, percentile=k)
    selector.fit(X, y)
    selected = list(X.columns[selector.get_support()])
    # For the Ames housing data this typically includes columns such as:
    # ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath']
    return selected
def train_imbalance(
    descr_series: Series,
    classes_codes: Series,
    TFIDF_,
    IMB_,
    FS_,
    req_percentage: int,
    CLF_,
    model_name: str,
) -> tuple:
    """Trains models using the given settings and saves them as .sav objects.

    Parameters:
    ----------
    descr_series: description series.
    classes_codes: series with classes' codes.
    TFIDF_: vectorizer.
    IMB_: SMOTE instance.
    FS_: term-ranking (feature selection) method.
    req_percentage: percentage to be taken from the ranked list.
    CLF_: classifier.
    model_name: model's name.

    Returns:
    ----------
    Trained model in byte representation associated to its model name.
    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([("tfidf", TFIDF_), ("imba", IMB_), ("fs", transformer), ("clf", CLF_)])

    best_params = get_best_params(clf_model, descr_series, classes_codes)
    print(f"{model_name}:{best_params}")

    clf_model.set_params(
        fs__percentile=req_percentage,
        clf__C=best_params["clf__C"],
        clf__gamma=best_params["clf__gamma"],
    ).fit(descr_series, classes_codes)

    return {model_name: clf_model}, {model_name: best_params}