def recursiveFeatureSelectorCV(classifier_model,train_data,train_labels,test_data,number_of_features):
    """Reduce train and test data to the features chosen by RFECV.

    The selector is fitted on the training data only; the same fitted
    selection is then applied to both the training and the test data.

    :param classifier_model: estimator handed to ``RFECV`` as its first argument.
    :param train_data: training samples used to fit the selector.
    :param train_labels: targets belonging to ``train_data``.
    :param test_data: samples reduced with the selection learned on the training data.
    :param number_of_features: forwarded as the second positional argument of ``RFECV``.
    :return: tuple ``(transformed_train_data, transformed_test_data)``.
    """
    # NOTE(review): in scikit-learn the second positional parameter of RFECV is
    # ``step`` (features removed per iteration), not a target feature count --
    # confirm this is what callers expect from ``number_of_features``.
    selector = RFECV(classifier_model, number_of_features)
    selector.fit(train_data, train_labels)
    return selector.transform(train_data), selector.transform(test_data)
def lr_with_fs():
    """
    Submission: lr_with_fs_0703_01.csv
    E_val:
    E_in:
    E_out:

    Logistic regression on RFECV-selected features.

    Steps: standardise the raw training data, run (or load from cache) an
    RFECV selector built on a logistic regression scored by ROC AUC, plot the
    cross-validation score per feature count, re-standardise the pruned
    features, fit a ``LogisticRegressionCV`` and dump the whole chain as a
    submission pipeline.
    """
    from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    import pylab as pl

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    X_scaled = raw_scaler.fit(X).transform(X)

    # The RFECV fit is expensive, so it is cached on disk between runs.
    cache_file = Path.of_cache('lr_with_fs.RFECV.pkl')
    rfe = IO.fetch_cache(cache_file)
    if rfe is None:
        base_estimator = LogisticRegression(class_weight='auto')
        folds = StratifiedKFold(y, 5)
        rfe = RFECV(estimator=base_estimator, cv=folds, scoring='roc_auc')
        rfe.fit(X_scaled, y)
        IO.cache(rfe, cache_file)

    print("Optimal number of features : %d" % rfe.n_features_)

    # Plot number of features VS. cross-validation scores
    cv_scores = rfe.grid_scores_
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation score (AUC)")
    pl.plot(range(1, len(cv_scores) + 1), cv_scores)
    pl.savefig('lr_with_fs.refcv')

    # Scale again after pruning so the final model sees standardised inputs.
    X_pruned = rfe.transform(X_scaled)
    new_scaler = StandardScaler()
    X_new = new_scaler.fit(X_pruned).transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print('CV scores: %s' % clf.scores_)
    print('Ein: %f' % Util.auc_score(clf, X_new, y))

    submission_steps = [
        ('scale_raw', raw_scaler),
        ('rfe', rfe),
        ('scale_new', new_scaler),
        ('lr', clf),
    ]
    IO.dump_submission(Pipeline(submission_steps), 'lr_with_fs_0703_01')
def rfecv_selection(self, x_train, y_train, cvfolds=5, feature_names=None, svm_kernel='linear',
                    rf_n_esimators=20, estimator='svm', log_root=None, log_file=None):
    """Run recursive feature elimination with cross-validation (RFECV).

    :param x_train: training samples.
    :param y_train: training targets; also used to build the stratified folds.
    :param cvfolds: number of folds passed to ``StratifiedKFold``.
    :param feature_names: optional sequence of names, indexed like the columns
        of ``x_train``. When given, the selected and dropped names are
        reported and returned as well.
    :param svm_kernel: kernel used when ``estimator == 'svm'``.
    :param rf_n_esimators: number of trees used when ``estimator == 'rf'``.
    :param estimator: one of ``'svm'``, ``'rf'`` or ``'mnb'``.
    :param log_root: only checked for truthiness; logging to ``log_file``
        happens when both ``log_root`` and ``log_file`` are truthy.
    :param log_file: path of the file that log text is appended to.
    :return: ``(rfecv, x_train_rerf, n_features)`` when ``feature_names`` is
        ``None``; otherwise
        ``(rfecv, x_train_rerf, n_features, selected_feats, dropped_feats)``.
    :raises ValueError: if ``estimator`` is not a supported name.
    """
    if estimator == 'svm':
        clf = SVC(kernel=svm_kernel)
    elif estimator == 'rf':
        clf = RandomForestClassifierWithCoef(n_estimators=rf_n_esimators)
    elif estimator == 'mnb':
        clf = MultinomialNB()
    else:
        # Previously fell through and failed later with an unbound ``clf``.
        raise ValueError(
            "Unknown estimator %r; expected 'svm', 'rf' or 'mnb'" % (estimator,))

    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_train, cvfolds), scoring='accuracy')
    rfecv.fit(x_train, y_train)

    log_to_file = bool(log_root and log_file)
    if log_to_file:
        with open(log_file, 'a') as log:
            log.write('\nOptimal number of features: ' + str(rfecv.n_features_))
    else:
        print("\nOptimal number of features : %d" % rfecv.n_features_)

    x_train_rerf = rfecv.transform(x_train)

    if feature_names is None:
        return rfecv, x_train_rerf, rfecv.n_features_

    # The code treats rank 1 as "selected"; every other rank counts as dropped.
    ranked_feats = rfecv.ranking_
    selected_feats = [feature_names[ix] for ix, rank in enumerate(ranked_feats) if rank == 1]
    dropped_feats = [feature_names[ix] for ix, rank in enumerate(ranked_feats) if rank != 1]

    if log_to_file:
        # One open instead of four; the bytes written are unchanged.
        with open(log_file, 'a') as log:
            log.write('\n\nSelected Features: \n')
            log.writelines(item + ', ' for item in selected_feats)
            log.write('\n\nDropped Features: \n')
            log.writelines(item + ', ' for item in dropped_feats)
    else:
        print('\nSelected Features: {0}\n'.format(selected_feats))
        print('\nDropped Features: {0}\n'.format(dropped_feats))

    return rfecv, x_train_rerf, rfecv.n_features_, selected_feats, dropped_feats
def recursive_feature_elimination(self, x: np.ndarray, y: np.ndarray, clf=None) -> np.ndarray:
    """Prune the columns of ``x`` with cross-validated recursive feature elimination.

    Fits an ``RFECV`` selector (one feature removed per step, accuracy
    scoring), shows a plot of the cross-validation score per feature count,
    prints the entries of ``self.features`` that survive, and returns the
    reduced data.

    :param x: samples to select features from.
    :param y: targets for ``x``; also handed to ``StratifiedKFold``.
    :param clf: estimator driving the elimination.
    :return: ``x`` restricted to the selected features.
    """
    folds = StratifiedKFold(y)
    selector = RFECV(estimator=clf, step=1, cv=folds, scoring='accuracy', verbose=True)
    print("begin eliminate")
    selector.fit(x, y)
    print("Optimal number of features : %d" % selector.n_features_)

    # Plot number of features VS. cross-validation scores
    cv_scores = selector.grid_scores_
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(cv_scores) + 1), cv_scores)
    plt.show()

    keep_mask = selector.get_support()
    selected_features = self.features[keep_mask]
    print(selected_features)

    return selector.transform(x)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline

# Train (minus the target column) and test are stacked so that both go through
# the same preprocessing and scaling; idx_split marks where the test rows begin.
# NOTE(review): the scaler is therefore fitted on train and test rows together.
full_df = pd.concat([train_df.drop(['SalePrice'], axis=1), test_df])
idx_split = train_df.shape[0]
full_df = StandardScaler().fit_transform(preprocessing(full_df))

# Feature selection is fitted on the training rows only, then applied to all rows.
estimator = GradientBoostingRegressor()
model = RFECV(estimator, step=1, cv=5)
model.fit(full_df[:idx_split], y_train)
full_df = model.transform(full_df)

# Alternative kept from the original as a note:
# clf = Pipeline([
#     ('feature_selection', RFECV(ExtraTreesRegressor())),
#     ('classification', ExtraTreesRegressor())
# ])

X_train = full_df[:idx_split]
print(X_train.shape)

s = find_seed(X_train, y_train, 64)
print(score(X_train, y_train, seed=s, estim=64))

clf = GradientBoostingRegressor(n_estimators=64, random_state=s)
clf.fit(X_train, y_train)
y_test = clf.predict(full_df[idx_split:])
print(y_test)
def _grid_search_and_report(models, X_tr, y_tr, X_te, y_te, header):
    """Grid-search each (estimator, param grid) pair and print its precision scores."""
    from sklearn import metrics
    from sklearn.model_selection import GridSearchCV  # Grid search (parameters optimization)

    for model, params in models:  # Loop over the classifiers
        clf = GridSearchCV(model, params, cv=5, scoring='precision')
        clf.fit(X_tr, y_tr)
        print(header)
        print(clf.best_estimator_)
        print('')
        print("Cross-validation: searching for the best parameters...\n")
        print("Best fit parameters: ", clf.best_params_)
        # best_score_ is the mean cross-validated precision on the training data.
        print("Precision score obtained on the training set: %.2f" % clf.best_score_)
        prediction = clf.predict(X_te)
        # precision_score expects (y_true, y_pred); the reverse order reports recall.
        print("Precision score obtained on the test set: %.2f"
              % metrics.precision_score(y_te, prediction))
        print('')
        print("--------------------------------------------------------")


def main():
    """Build and evaluate diagnosis classifiers on three feature sets.

    Reads the module-level ``raw_data`` and ``features``. After optional
    correlation displays, the selected features are standardised and split
    70/30, and then:

    * Step 1 grid-searches a logistic regression and a decision tree on all
      selected features.
    * Step 2 repeats the search on a regex-filtered subset of columns.
    * Step 3 (only when ``features == 'all'``) selects features with RFECV for
      the logistic regression and with a fixed drop list for the tree.

    All results are printed; nothing is returned.
    """
    from sklearn import linear_model
    from sklearn import metrics
    from sklearn import tree
    from sklearn.model_selection import train_test_split

    separator = "--------------------------------------------------------"

    X = select_features(raw_data, features)
    y = raw_data['diagnosis'].copy()

    if features != 'all':
        print('')
        print("--------------------------------------------------------")
        print("Displaying correlation matrix for the selected features")
        print("--------------------------------------------------------")
        corr_mtrx(X)
        print('')
        print("-------------------------------------------------------")
        print("Displaying dispersion matrix for the selected features")
        print("-------------------------------------------------------")
        X_plot = X.copy()
        X_plot['diagnosis'] = raw_data['diagnosis'].copy()
        corr_plot(X_plot)
        print('')

    # Standardise every feature to zero mean and unit standard deviation.
    # Done by hand rather than with StandardScaler() so the result stays a
    # pandas DataFrame and attributes like .columns remain available.
    # NOTE(review): mean/std are computed before the train/test split.
    X_scaled = (X - X.mean()) / (X.std())

    # Transform the 'diagnosis' column so the values are numerical (1=Malignant, 0=Benign)
    y = y.map({'M': 1, 'B': 0})

    ## ==================== 4. RANDOMLY SPLITTING THE DATA INTO TRAINING AND TEST SETS ==================== ##
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.30, random_state=0)

    ## ==================== 5. CONSTRUCTING THE MODELS ==================== ##
    print("==============================================================")
    print("= STEP 1: CONSTRUCTING MODELS WITH ALL THE SELECTED FEATURES =")
    print("==============================================================")

    # Logistic regression
    logreg = linear_model.LogisticRegression(random_state=0)
    logreg_params = {'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300]}

    # Decision tree
    dtree = tree.DecisionTreeClassifier(criterion='entropy', max_features='sqrt', random_state=0)
    dtree_params = {'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                    'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10]}

    models = [(logreg, logreg_params), (dtree, dtree_params)]

    _grid_search_and_report(models, X_train, y_train, X_test, y_test, "Classifier:")
    input("Program paused, press Enter to continue...\n")

    ## ==================== 6. FEATURES SELECTION ==================== ##
    print("=================================================================")
    print("= STEP 2: CONSTRUCTING MODELS WITH NON-CORRELATED FEATURES ONLY =")
    print("=================================================================")

    # Same computations as in Step 1 with a reduced number of features,
    # eliminating features that are correlated to other ones.
    X_filt = X_scaled.filter(
        regex=r'^(radius_|concave points_|texture_|smoothness_|symmetry_)\D*$', axis=1)
    X_train2, X_test2, y_train2, y_test2 = train_test_split(
        X_filt, y, test_size=0.30, random_state=0)

    print("The features retained for this step are: ")
    print(X_filt.columns)
    print('')
    print(separator)

    _grid_search_and_report(models, X_train2, y_train2, X_test2, y_test2, "Classifier: ")
    input("Program paused, press Enter to continue...\n")

    # Step 3 only works on the whole dataset for now
    if features != 'all':
        return

    print("====================================================================")
    print("= STEP 3: CONSTRUCTING MODELS WITH THE MOST 'SIGNIFICANT' FEATURES =")
    print("====================================================================")

    # Applying RFE method with cross validation to i) classify features from best to worst and
    # ii) find the optimal number of features to use
    from sklearn.feature_selection import RFECV

    clf1 = linear_model.LogisticRegression(random_state=0)  # New regressor with default parameters
    rfecv = RFECV(estimator=clf1, step=1, cv=5, scoring='precision')
    rfecv = rfecv.fit(X_train, y_train)

    print("Classifier:")
    print(rfecv.estimator_)
    print('')
    print("Optimal number of features : %d" % rfecv.n_features_)
    # With C=1.0, the optimal number of features is 20

    # NOTE(review): grid_scores_ is indexed by number of features kept, not by
    # feature, so pairing it with the selected column names may not be a
    # per-feature score -- confirm the intent.
    top_feat1 = pd.Series(rfecv.grid_scores_[:rfecv.n_features_],
                          index=X_train.columns[rfecv.support_]).sort_values(ascending=False)
    print("Best feature rankings and precision scores:")
    print(top_feat1)
    print('')

    X_train_best = rfecv.transform(X_train)  # Keep only the selected features (training set)
    X_test_best = rfecv.transform(X_test)    # Keep only the selected features (test set)

    clf1.fit(X_train_best, y_train)
    predictrain1 = clf1.predict(X_train_best)  # Predicted values on the training set
    predictest1 = clf1.predict(X_test_best)    # Predicted values on the test set

    print("Precision score obtained on the training set: %.2f"
          % metrics.precision_score(y_train, predictrain1))
    print("Precision score obtained on the test set: %.2f"
          % metrics.precision_score(y_test, predictest1))
    print('')
    print(separator)

    # The decision tree classifier has inherently an attribute estimating
    # feature importances; we shall try it
    clf2 = tree.DecisionTreeClassifier(max_features='sqrt', random_state=0)
    clf2.fit(X_train, y_train)
    top_feat2 = pd.Series(clf2.feature_importances_,
                          index=X_train.columns).sort_values(ascending=False)

    print("Classifier:")
    print(clf2)  # previously printed the logistic-regression estimator by mistake
    print('')
    print("Feature rankings and Gini scores:")
    print(top_feat2)

    # List of all the features having a Gini score of 0.0
    droplist = ['area_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_mean',
                'fractal_dimension_worst', 'fractal_dimension_mean', 'radius_se',
                'perimeter_se', 'symmetry_worst', 'compactness_se', 'concave points_se',
                'smoothness_worst', 'smoothness_se']
    print('')
    print("Computing a new decision tree without the following features:")
    print(droplist)
    print('')

    X_filtrain = X_train.drop(droplist, axis=1)  # Training set without the droplist features
    X_filtest = X_test.drop(droplist, axis=1)    # Test set without the droplist features

    clf2.fit(X_filtrain, y_train)
    predictrain2 = clf2.predict(X_filtrain)  # Predicted values on the training set
    predictest2 = clf2.predict(X_filtest)    # Predicted values on the test set

    print("Precision score obtained on the training set: %.2f"
          % metrics.precision_score(y_train, predictrain2))
    print("Precision score obtained on the test set: %.2f"
          % metrics.precision_score(y_test, predictest2))
from __future__ import division import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.feature_selection import RFECV X = np.load("../feats/train_formatted.npy") y = np.load("../feats/train_y.npy") X_test = np.load("../feats/test_formatted.npy") y_test = np.load("../feats/test_y.npy") clf = LogisticRegression() selector = RFECV(clf) selector.fit(X, y) X = selector.transform(X) X_test = selector.transform(X_test) scores = selector.ranking_ print 'Index : score' sortedIdx = [i[0] for i in sorted(enumerate(scores), key=lambda x: x[1])] top = 384 for i in range(top): print str(sortedIdx[i]) + ' : ' + str(scores[sortedIdx[i]]) clf.fit(X, y) pred = clf.predict(X_test) accuracy = sum(pred == y_test) / y_test.size print 'Logistic Regression Accuracy: ' + str(accuracy)
else: classifier = RandomForestClassifier(n_estimators=200) if FEATURE_SELECTION: print("Before FS:", X.shape[1]) tprs = [] aucs = [] mean_fpr = np.linspace(0, 1, 100) i = 0 for train, test in cv.split(X, y): if FEATURE_SELECTION: selector = RFECV(estimator, step=1, cv=3, scoring='roc_auc') selector = selector.fit(X[train], y[train]) X_r = selector.transform(X) print("After FS" + str(i + 1) + ":", X_r.shape[1]) else: X_r = X # Fit classifier classifier.fit(X_r[train], y[train]) # Grid search output if GRID_SEARCH: print("Grid scores on development set:") means = classifier.cv_results_['mean_test_score'] stds = classifier.cv_results_['std_test_score'] for mean, std, params in zip(means, stds, classifier.cv_results_['params']): print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
__author__ = 'jeronicarandellsaladich'

# Recursive Feature Elimination demo: fit RFECV with a linear SVR on a small
# synthetic regression problem and print what the selector kept.
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR

X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
# Single-argument print(...) calls give the same output on Python 2 and 3.
print(X)

estimator = SVR(kernel="linear")
selector = RFECV(estimator, step=5)
selector = selector.fit(X, y)

print(selector.support_)
print(selector.ranking_)
print(selector.transform(X))
import itertools

from sklearn.feature_selection import RFECV
from contemppoetry import *

# For every (penalty, C) combination: select features with RFECV on a
# logistic regression, print the kept feature names, then evaluate a scaled
# logistic-regression grid search on the reduced data.
print('******RFECV-LogisticRegression')
for penalty, C in itertools.product(['l1', 'l2'], PARAM_RANGE):
    base_lr = LogisticRegression(penalty=penalty, C=C)
    rfe = RFECV(estimator=base_lr, scoring='accuracy', cv=5)
    rfe.fit(X, y)

    # list selected features by rank
    kept_names = [feature_names[i] for i in np.argsort(rfe.ranking_) if rfe.support_[i]]
    print(kept_names)

    pipe = Pipeline([('scl', StandardScaler()),
                     ('clf', LogisticRegression())])
    param_grid = {'clf__penalty': ['l1', 'l2'],
                  'clf__C': PARAM_RANGE}
    gs = GridSearchCV(estimator=pipe,
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=5)
    cv_result = my_cross_val_score(gs, X=rfe.transform(X), y=y, gs=False)
    my_print(*cv_result)
def lin_model(labelled_data, unlabelled_data):
    """Train an income regressor on labelled data and predict the unlabelled data.

    Pipeline, in order:

    1. drop rows with NaN from the labelled data; restrict the unlabelled
       data to ``all_columns`` and forward-fill its NaN values,
    2. constrain column values with ``constrain_col_vals``,
    3. split the labelled data into train and test parts with ``split_data``,
    4. target-encode, then standard-scale (both fitted on the train part only),
    5. select features with ``RFECV`` around a Lasso (``cv=5``),
    6. fit a ``KNeighborsRegressor`` (11 neighbours, distance weighted),
    7. print RMSE and explained variance on the test part,
    8. predict the unlabelled rows.

    Parameters:
        labelled_data: training dataframe.
        unlabelled_data: dataframe to predict; must contain an "Instance" column.

    Returns:
        results dataframe with columns (Instance, Income).
    """
    print("cleaning data...")
    clean_labelled = labelled_data.dropna()
    clean_unlabelled = unlabelled_data[all_columns]
    # not ideal but fillna the mean freezes for some reason
    # .ffill() replaces the deprecated fillna(method="ffill") spelling.
    clean_unlabelled = clean_unlabelled.ffill()

    clean_labelled = constrain_col_vals(clean_labelled)
    clean_unlabelled = constrain_col_vals(clean_unlabelled)
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split
    split = split_data(clean_labelled)
    train_data, train_target, test_data, test_target = split

    print("target encoding data...")
    # Target encoding -- fitted on the train part, applied to the others
    tar_encode = TargetEncoder()
    train_data = tar_encode.fit_transform(train_data, train_target)
    test_data = tar_encode.transform(test_data)
    unknown_data = tar_encode.transform(unknown_data)

    print("scaling values...")
    # scaling values
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("selecting features...")
    # feature selection
    lasso = lm.Lasso()
    selector = RFECV(lasso, cv=5)
    train_data = selector.fit_transform(train_data, train_target)
    test_data = selector.transform(test_data)
    unknown_data = selector.transform(unknown_data)

    print("fitting model...")
    # fit model
    neigh = KNeighborsRegressor(
        n_neighbors=11,
        weights="distance"
    )
    neigh.fit(train_data, train_target)

    print("analysing test results...")
    # validate test
    test_result = neigh.predict(test_data)
    error = np.sqrt(mean_squared_error(test_target, test_result))
    variance = explained_variance_score(test_target, test_result)
    print("Root mean squared error of test data: ", error)
    print("Variance: ", variance)

    print("predicting unknown data...")
    # predict and format
    values = neigh.predict(unknown_data)
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Income": values.flatten()
    })
    print("Finished.")
    return results
print("特征筛选已开始") clf = svm.SVC(kernel='linear') rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(8), scoring='accuracy') rfecv.fit(train_data, train_label) print("最佳特征数目为 : %d" % rfecv.n_features_) x_label = range(1, len(rfecv.grid_scores_) + 1) y_label = rfecv.grid_scores_ support = rfecv.support_ plt.figure() plt.xlabel(u"所选特征数量") plt.ylabel(u"交叉验证得分(分类精度)") plt.plot(x_label, y_label) plt.show() #获取有效特征 train_data1 = rfecv.transform(train_data) #准备需要验证的分类器 from sklearn.linear_model import SGDClassifier, LogisticRegression from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.naive_bayes import MultinomialNB, GaussianNB from sklearn.neural_network import MLPClassifier from sklearn.linear_model import SGDClassifier, LogisticRegression classifiers = { 'logistic re ': LogisticRegression(C=1.1, penalty='l1', tol=0.01), # 'SVC ':
def select_features_rfecv(X, y):
    """Return ``X`` reduced to the features chosen by RFECV.

    The selector wraps a ``LinearSVC`` and uses ``step=10``. It is fitted on
    ``(X, y)`` and then applied to the same ``X``.
    """
    selector = RFECV(LinearSVC(), step=10)
    return selector.fit(X, y).transform(X)
def stratShuffleSplitRFECVRandomForestClassification(
    nEstimators,
    iterator1,
    minSamplesSplit,
    maxFeatures,
    maxDepth,
    nFolds,
    targetDataMatrix,
    trainingData,
    trainingDataMatrix,
    SEED,
):
    """
    Repeated stratified hold-out evaluation of a random forest with RFECV feature selection.

    For each of ``iterator1`` iterations the data is split 90/10 with a stratified shuffle split,
    RFECV (accuracy scoring, stratified k-fold) is fitted on the 90% part, the forest is refitted on
    the selected features and scored on the 10% hold-out. Aggregated results are printed and written
    as CSV files under ``./outputFiles/``. Nothing is returned.

    :param nEstimators: This is the number of trees in the forest (typically 500-1000 or so)
    :param iterator1: This is the number of model iterations (outer loop count).
    :param minSamplesSplit: this is the minimum number of samples to split. 2 is a bit small...less is
        typically more.
    :param maxFeatures: passed to the forest as ``max_features``.
    :param maxDepth: passed to the forest as ``max_depth``.
    :param nFolds: number of folds for the StratifiedKFold used inside RFECV.
    :param targetDataMatrix: targets; indexed with the split indices (presumably a numpy array --
        TODO confirm).
    :param trainingData: object with a ``columns`` attribute supplying the feature names.
    :param trainingDataMatrix: samples; indexed with the split indices.
    :param SEED: base random seed; multiplied by the iteration index for each split.
    :return: None
    """
    import multiprocessing
    import numpy as np

    multiprocessing.cpu_count()  # result is discarded
    # from helperFunctions import *
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import metrics
    from sklearn import cross_validation
    from sklearn.feature_selection import RFECV
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.cross_validation import StratifiedShuffleSplit

    # rfecv pre-allocation tables, seeding
    X_train = []
    X_holdout = []
    y_train = []
    y_holdout = []
    rfecvGridScoresAll = []
    optimumLengthAll = []
    # feature_names = []
    a = []
    rfc_all_f1 = []
    # Accumulators: one appended entry per outer iteration. Several of these rebind names that were
    # initialised as lists just above.
    nameListAll = pd.DataFrame()
    optimumLengthAll = pd.DataFrame()
    classScoreAll = pd.DataFrame()
    classScoreAll2 = pd.DataFrame()
    classScoreAll3 = pd.DataFrame()
    featureImportancesAll = pd.DataFrame()
    rfecvGridScoresAll = pd.DataFrame()

    # Re-definition of the RFC to employ feature importance as a proxy for weighting to employ RFECV.
    class RandomForestClassifierWithCoef(RandomForestClassifier):
        # NOTE(review): this fit() returns None, whereas scikit-learn estimators conventionally
        # return self -- confirm nothing relies on the return value.
        def fit(self, *args, **kwargs):
            super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
            self.coef_ = self.feature_importances_

    ## Re-creation of the RFC object with ranking proxy coefficients
    rfc = RandomForestClassifierWithCoef(
        n_estimators=nEstimators,
        min_samples_split=minSamplesSplit,
        bootstrap=True,
        n_jobs=-1,
        max_features=maxFeatures,
        oob_score=True,
        max_depth=maxDepth,
    )

    ## Employ Recursive feature elimination with automatic tuning of the number of features selected with CV (RFECV)
    #
    for kk in range(0, iterator1):
        print "iteration no: ", kk + 1

        # Shuffle and split the dataset using a stratified approach to minimize the influence of class imbalance.
        # random_state is SEED * kk, so the first iteration (kk == 0) always uses random_state 0.
        SSS = StratifiedShuffleSplit(targetDataMatrix, n_iter=1, test_size=0.10, random_state=SEED * kk)
        for train_index, test_index in SSS:
            X_train, X_holdout = trainingDataMatrix[train_index], trainingDataMatrix[test_index]
            y_train, y_holdout = targetDataMatrix[train_index], targetDataMatrix[test_index]

        # Call the RFECV function. Additional splitting is done by stratification shuffling and splitting. 5 folds. 5 times,
        # with a random seed controlling the split.
        rfecv = RFECV(
            estimator=rfc,
            step=1,
            cv=StratifiedKFold(y_train, n_folds=nFolds, shuffle=True, random_state=SEED * kk),
            scoring="accuracy",
        )  # Can use 'accuracy' or 'f1' f1_weighted, f1_macro, f1_samples

        # First, the recursive feature elimination model is trained. This fits to the optimum model and begins recursion.
        rfecv = rfecv.fit(X_train, y_train)

        # Second, the cross-validation scores are calculated such that grid_scores_[i] corresponds to the CV score
        # of the i-th subset of features. In other words, from all the features to a single feature, the cross validation
        # score is recorded.
        rfecvGridScoresAll = rfecvGridScoresAll.append([rfecv.grid_scores_])

        # Third, the .support_ attribute reports whether the feature remains after RFECV or not. The possible parameters are
        # inspected by their ranking. Low ranking features are removed.
        supPort = (
            rfecv.support_
        )  # True/False values, where true is a parameter of importance identified by recursive alg.
        possParams = rfecv.ranking_  # not used again in this function
        min_feature_params = rfecv.get_params(deep=True)  # not used again in this function
        optimumLengthAll = optimumLengthAll.append([rfecv.n_features_])
        featureSetIDs = list(supPort)
        featureSetIDs = list(featureSetIDs)
        # print feature_names
        feature_names = list(trainingData.columns.values)
        namedFeatures = list(trainingData.columns.values)
        namedFeatures = np.array(namedFeatures)  # not used again in this function

        # Loop over each item in the list of true/false values, if true, pull out the corresponding feature name and store
        # it in the appended namelist. This namelist is rewritten each time, but the information is retained.
        nameList = []  # Initialize a blank array to accept the list of names for features identified as 'True',
        # or important.
        # print featureSetIDs
        # print len(featureSetIDs)
        for i in range(0, len(featureSetIDs)):
            if featureSetIDs[i]:
                nameList.append(feature_names[i])
            else:
                a = 1
                # print("didn't make it")
                # print(feature_names[i])
        nameList = pd.DataFrame(nameList)
        nameListAll = nameListAll.append(nameList)  # append the name list
        # NOTE(review): list() over a DataFrame yields its column labels, not the stored names;
        # the two rebinding lines below are not read again in this function.
        nameList = list(nameList)
        nameList = np.array(nameList)

        # Fourth, the training process begins anew, with the objective to trim to the optimum feature and retrain the model
        # without cross validation i.e., test the holdout set. The new training test set size for the holdout validation
        # should be the entire 90% of the training set (X_trimTrainSet). The holdout test set also needs to be
        # trimmed. The same transformation is performed on the holdout set (X_trimHoldoutSet).
        X_trimTrainSet = rfecv.transform(X_train)
        X_trimHoldoutSet = rfecv.transform(X_holdout)

        # Fifth, no recursive feature elimination is needed (it has already been done and the poor features removed).
        # Here the model is trained against the trimmed training set X's and corresponding Y's.
        rfc.fit(X_trimTrainSet, y_train)

        # Holdout test results are generated here.
        preds = rfc.predict(
            X_trimHoldoutSet
        )  # Predict the class from the holdout dataset. Previous call: rfecv.predict(X_holdout)
        print preds
        print y_holdout

        rfc_all_f1 = metrics.f1_score(y_holdout, preds, average="weighted")  # determine the F1
        rfc_all_f2 = metrics.r2_score(y_holdout, preds)  # determine the R^2 Score
        rfc_all_f3 = metrics.mean_absolute_error(
            y_holdout, preds
        )  # determine the MAE - Do this because we want to determine sign.

        # append the previous scores for aggregated analysis
        classScoreAll = classScoreAll.append([rfc_all_f1])  # append the previous scores for aggregated analysis.
        classScoreAll2 = classScoreAll2.append([rfc_all_f2])
        classScoreAll3 = classScoreAll3.append([rfc_all_f3])
        refinedFeatureImportances = (
            rfc.feature_importances_
        )  # determine the feature importances for aggregated analysis.
        featureImportancesAll = featureImportancesAll.append([refinedFeatureImportances])

    # Output file creation
    print ("List of Important Features Identified by Recursive Selection Method:")
    print (nameListAll)
    nameListAll.to_csv("./outputFiles/class_IFIRS.csv")
    nameListAll.count()  # result is discarded

    print ("f1 weighted score for all runs:")
    print (classScoreAll)
    classScoreAll.to_csv("./outputFiles/f1_score_all.csv")

    print ("R^2 score for all runs:")
    print (classScoreAll2)
    classScoreAll2.to_csv("./outputFiles/class_Rsq_score_all.csv")

    print ("MAE score for all runs:")
    print (classScoreAll3)
    classScoreAll3.to_csv("./outputFiles/class_MAE_score_all.csv")

    print ("Optimal number of features:")
    print (optimumLengthAll)
    optimumLengthAll.to_csv("./outputFiles/class_optimum_length.csv")

    print ("Selected Feature Importances:")
    print (featureImportancesAll)
    featureImportancesAll.to_csv("./outputFiles/class_sel_feature_importances.csv")

    # NOTE(review): the label says mean_squared_error, but RFECV above is scored with "accuracy".
    print ("mean_squared_error Grid Score for Increasing Features")
    print (rfecvGridScoresAll)
    rfecvGridScoresAll.to_csv("./outputFiles/class_rfecv_grid_scores.csv")
""" Y = df_temp["cascadeSize"].between(step[i+1],10e6).tolist() #Y = df_temp["step"+str(step[i+1])].tolist() X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.2) logreg = linear_model.LogisticRegression(C=1e5,max_iter=1e3) logreg.fit(X_train,y_train) print "step"+str(step[i+1]) print X.shape print logreg.score(X_test,y_test) # perform recursive feature selection(backward selectoin) rfecv = RFECV(estimator=logreg, step=1, cv=StratifiedKFold(y_train, 4),scoring='accuracy') rfecv.fit(X_train, y_train) print("Optimal number of features : %d" % rfecv.n_features_) X_train_new = rfecv.transform(X_train) print("best features: ") print X_train_new # Plot number of features VS. cross-validation scores #plt.figure() #plt.xlabel("Number of features selected") #plt.ylabel("Cross validation score (nb of correct classifications)") #plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) #plt.show()
# Univariate selection: keep the top 10% of features by ANOVA F-test and turn
# the p-values into scores normalised to a maximum of 1.
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(train_feature, train_target)
scores = -np.log10(selector.pvalues_)
scores = scores / scores.max()

# Linear SVM on all features: squared coefficients summed over the first
# axis, normalised to a maximum of 1.
clf = svm.SVC(kernel='linear')
clf.fit(train_feature, train_target)
svm_weights = np.square(clf.coef_).sum(axis=0)
svm_weights = svm_weights / svm_weights.max()

# Same weighting for a linear SVM trained on the selected features only.
train_feature_selected = selector.transform(train_feature)
clf_selected = svm.SVC(kernel='linear')
clf_selected.fit(train_feature_selected, train_target)
svm_weights_selected = np.square(clf_selected.coef_).sum(axis=0)
svm_weights_selected = svm_weights_selected / svm_weights_selected.max()

# tree method
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier

forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
def predictAndPlot(data, header, features, name):
    """Pick the best-scoring model for one target, predict with it and plot.

    Every model type in the module-level ``list_of_models`` is built with
    ``define_model`` and scored on the same data: "RF" by its out-of-bag
    score after threshold-based feature reduction, the linear family through
    RFECV. The winner is refitted, its predictions are passed to
    ``plot_predictionVsActual`` and a feature ranking is derived.

    Parameters:
        data      2-D array; column 0 holds the dates.
        header    Array of column names, indexable by ``features``.
        features  Column indices to analyse; the last one is the target "y".
        name      Label of the analysis. If it contains "retention", the
                  trailing rows for the named window (1d/3d/7d/14d/28d) are
                  dropped.

    Returns:
        (prediction, y, dates, importances_list). ``importances_list`` is a
        zero array of shape (10, 2) when no importances could be derived.
    """
    print("\n%s" % name)

    # First reduce the data to relevant features (plus the date column).
    features_plus_date = np.hstack((0, features))
    analyzed_data = data[:, features_plus_date]

    # Remove rows with missing data.
    for i in range(analyzed_data.shape[1]):
        analyzed_data = analyzed_data[analyzed_data[:, i] != '']

    # If it is a retention feature, skip the last X entries.
    if "retention" in name:
        skip_by_window = (("1d", 3), ("3d", 7), ("7d", 15), ("14d", 29), ("28d", 57))
        retention_feature_linesSkipped = next(
            (skipped for tag, skipped in skip_by_window if tag in name), 0)
        # Guard the slice: [:-0] would select nothing instead of everything.
        if retention_feature_linesSkipped:
            analyzed_data = analyzed_data[:-retention_feature_linesSkipped, :]

    # The date is kept only for plotting by the caller.
    dates = analyzed_data[:, 0]

    # Training data: ignore dates (first column) and "y" (last column).
    X = analyzed_data[:, 1:-1].astype(float)
    y = analyzed_data[:, -1].astype(float)

    linear_models = ("ElasticCV", "Elastic", "linear", "LassoCV")
    thresholds = ("10.*median", "3.*median", "1*median",
                  "0.3*median", "0.1*median", "0.03*median")

    best_model = ""
    best_score = -100
    to_be_used_threshold = "median"  # Default value; overwritten by the RF search.

    # Iterate through all models to find the best one.
    for model_type in list_of_models:
        model = define_model(model_type)

        if model_type == "RF":
            # Random Forest is treated separately: it is scored by its
            # out-of-bag score for several importance thresholds.
            # NOTE(review): relies on the estimator exposing
            # transform(X, threshold=...) -- confirm define_model provides it.
            to_be_used_threshold = "median"
            score = -100.
            for model_threshold in thresholds:
                try:
                    model.fit(X, y)
                    X_new = model.transform(X, threshold=model_threshold)
                    header_new = model.transform(header[features][:-1],
                                                 threshold=model_threshold)
                    model.fit(X_new, y)
                    rf_score = model.oob_score_
                    # Keep the feature count small: a new threshold must be
                    # 2% better, and better overall (scores may be negative).
                    if (rf_score > score * 1.02) and (rf_score > score):
                        score = rf_score
                        to_be_used_threshold = model_threshold
                except Exception:
                    print("There was an error at model threshold: %s" % model_threshold)
            print("Score is %2.3f with threshold: %s" % (score, to_be_used_threshold))
        elif model_type in linear_models:
            selector = RFECV(model)
            selector = selector.fit(X, y)
            header_new = header[features][:-1]
            score = selector.score(X, y)
            print("Score is %2.3f with model: %s" % (score, model_type))
        else:
            print("Something went wrong!")
            # No score exists for an unknown model type; do not compare a
            # stale (or undefined) score against the best one.
            continue

        if score > best_score:
            best_score = score
            best_model = model_type

    print("Best score is %2.3f with model: %s" % (best_score, best_model))

    # Predict using the best model, parameters and features obtained above.
    model_type = best_model
    model = define_model(model_type)
    if model_type == "RF":
        try:
            model.fit(X, y)
            X_new = model.transform(X, threshold=to_be_used_threshold)
            header_new = model.transform(header[features][:-1],
                                         threshold=to_be_used_threshold)
            model.fit(X_new, y)
            prediction = model.predict(X_new)
            score = model.oob_score_
        except Exception:
            # In rare cases all features are discarded; fall back to the
            # full feature set, which should always work.
            print("Fitting the model didn't work! The prediction might be sub-optimal. \nThreshold: %s"
                  % to_be_used_threshold)
            model.fit(X, y)
            prediction = model.predict(X)
            # The fallback model uses every feature, so the names must too.
            header_new = header[features][:-1]
            score = 0
    elif model_type in linear_models:
        selector = RFECV(model)
        selector = selector.fit(X, y)
        header_new = header[features][:-1]
        prediction = selector.predict(X)
        score = selector.score(X, y)
    else:
        # NOTE(review): no prediction exists on this path, so the plotting
        # call below fails with NameError; confirm the intended handling.
        print("lol!")

    # Now derive the importances respectively feature coefficients.
    try:
        # This only works with "RF".
        importances = model.feature_importances_
        importances_list = np.vstack((importances, header_new))
        importances_list = np.transpose(importances_list)
        importances_list = importances_list[
            importances_list[:, 0].astype(float).argsort()][::-1]
    except Exception:
        # This should work with all other models.
        try:
            X_new = selector.transform(X)
            # Select the surviving names with the support mask; the header
            # is 1-D and is not a sample matrix.
            header_new = np.asarray(header_new)[selector.support_]
            model.fit(X_new, y)
            # Scale coefficients by a typical feature value: the median,
            # or the mean where the median is zero.
            med_value = np.median(X_new, axis=0)
            zero_median = med_value == 0
            med_value[zero_median] = np.mean(X_new, axis=0)[zero_median]
            importances = model.coef_ * med_value
            importances_list = np.vstack((importances, header_new))
            importances_list = np.transpose(importances_list)
            # NOTE(review): sorted ascending here, descending in the RF
            # branch above -- confirm this is intended.
            importances_list = importances_list[
                importances_list[:, 0].astype(float).argsort()]
        except Exception:
            # If the above doesn't work, just give a blank output.
            importances_list = np.zeros((10, 2))

    title = "%s, %s\nOOB Score = %2.2f" % (name, model_type, score)
    plot_predictionVsActual(prediction, y, title)
    return prediction, y, dates, importances_list
'Male' ]] y = ad_data['Clicked on Ad'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) l1 = LogisticRegression() l1.fit(X_train, y_train) p1 = l1.predict(X_test) from sklearn.model_selection import StratifiedKFold from sklearn.feature_selection import RFECV l2 = LogisticRegression() rfecv = RFECV(estimator=l2, step=1, cv=StratifiedKFold(2), scoring='accuracy') rfecv.fit(X_train, y_train) print(rfecv.transform(X_train)[:1, :]) print(X_train.head(1)) print('By comparing the two we find the feature not selected') print('Number of best suited features using RFFECV') print(rfecv.n_features_) p2 = rfecv.predict(X_test) from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit(X_train) scaled_data = scaler.transform(X_train) from sklearn.decomposition import PCA pca = PCA(n_components=1) pca.fit(scaled_data) xtrain_pca = pca.transform(scaled_data) xtest_pca = pca.transform(scaler.transform(X_test)) l3 = LogisticRegression()
for clf_label, clf in classifiers.items():
    # Print message to user
    print(f"Now working on {clf_label}.")

    # Cross validation split method, scoring metric, total variance to keep
    # for PCA, and parameter grid for optimization.
    split = TimeSeriesSplit(n_splits=10)
    score = 'roc_auc'
    totalVariance = 0.99
    param_grid = parameters[clf_label]

    # 1. Feature Selection: RFECV with clf as the base estimator
    selector = RFECV(estimator=clf, step=1, cv=split, scoring=score, n_jobs=-1)
    selector.fit(X_train_corr, y_train.values.ravel())
    X_train_RFECV = selector.transform(X_train_corr)
    X_test_RFECV = selector.transform(X_test_corr)

    # 2. Dimension Reduction: PCA fitted on the training split only
    pca = PCA(totalVariance, svd_solver='full').fit(X_train_RFECV)
    X_train_PCA = pca.transform(X_train_RFECV)
    X_test_PCA = pca.transform(X_test_RFECV)
    # Single .loc assignment instead of chained indexing, which may write to
    # a temporary copy and leave df_results unchanged.
    # NOTE(review): assumes df_results is a DataFrame indexed by classifier
    # label -- confirm.
    df_results.loc[clf_label, 'Num_Features'] = pca.n_components_

    # 3. Hyper-parameter Optimization
    GSCV = GridSearchCV(clf, param_grid, cv=split, n_jobs=-1, scoring=score)

    # 4. Fit Model
from __future__ import division import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.feature_selection import RFECV X = np.load("../feats/train_formatted.npy") y = np.load("../feats/train_y.npy") X_test = np.load("../feats/test_formatted.npy") y_test = np.load("../feats/test_y.npy") clf = LogisticRegression() selector = RFECV(clf) selector.fit(X, y) X = selector.transform(X) X_test = selector.transform(X_test) scores = selector.ranking_ print 'Index : score' sortedIdx = [i[0] for i in sorted(enumerate(scores), key=lambda x:x[1])] top = 384 for i in range(top): print str(sortedIdx[i]) + ' : ' + str(scores[sortedIdx[i]]) clf.fit(X, y) pred = clf.predict(X_test) accuracy = sum(pred == y_test)/y_test.size print 'Logistic Regression Accuracy: ' + str(accuracy)
def Randomforest(features, classes):
    """Evaluate a random forest over 50 random 90/10 train-test splits.

    For each split: keep the features whose Welch t-test between the two
    classes of the training set has p < 0.05, standardise, run RFECV with a
    linear SVC to tally selected features, then train a 100-tree random
    forest and record its test accuracy and ROC AUC. The per-split scores
    are printed and written to 'name_file.csv'.

    Parameters:
        features  2-D numpy array of samples x features.
        classes   Array of labels; the t-test compares label 1 to label 0.

    Returns:
        None.
    """
    import sys

    import numpy as np
    import pandas as pd
    from scipy import stats
    from sklearn import preprocessing
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    # Results per iteration.
    acc = []
    AUC = []

    # RFE bookkeeping: number of features kept per iteration and how often
    # each original column was selected. These tallies are collected but not
    # written to the output file.
    n_features = []
    selected_features = np.zeros(features.shape[1])

    k = 50  # number of iterations (random splits)
    for i in range(k):
        print([i],)  # make progress visible while running
        sys.stdout.flush()  # print immediately instead of buffering

        # random_state=i gives every iteration its own test subset.
        X_train, X_test, y_train, y_test = train_test_split(
            features, classes, test_size=0.1, random_state=i)

        # For unbalanced classes the training split can be resampled here,
        # e.g. with imblearn's SMOTE, and the result used for training.

        # Feature selection: Welch t-test between the two classes, p < 0.05.
        class_1 = X_train[y_train == 1]
        class_2 = X_train[y_train == 0]
        h, p = stats.ttest_ind(class_1, class_2, equal_var=False, nan_policy='omit')
        mask = p < 0.05
        X_train = X_train[:, mask]
        X_test = X_test[:, mask]

        # Standardise both splits with the TRAINING mean and deviation, so
        # the test data goes through the same transform the model saw.
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Recursive feature elimination with a linear SVC, one feature per
        # step, 10-fold stratified CV. Only the tallies are kept; the forest
        # below is trained on all t-test-selected features.
        svc = SVC(kernel="linear")
        selector = RFECV(estimator=svc, step=1, cv=StratifiedKFold(10))
        selector = selector.fit(X_train_scaled, y_train)
        n_features.append(selector.n_features_)
        selected_features[mask] += selector.support_

        # Random forest classification.
        t = 100  # number of trees
        clf = RandomForestClassifier(n_estimators=t)
        clf = clf.fit(X_train_scaled, y_train)
        acc.append(clf.score(X_test_scaled, y_test))

        # Area under the ROC curve from the class-1 probabilities.
        score_AUC = clf.predict_proba(X_test_scaled)[:, 1]
        AUC.append(roc_auc_score(y_test, score_AUC))

    # Print statements
    print('accuracies: \n', acc)
    print('accuracies by a k fold CV Random Forest: ' + str(np.mean(acc))
          + ' ( std : ' + str(np.std(acc)) + ' )')
    print('AUCs: \n', AUC)
    print('AUC by a k fold CV Random Forest: ' + str(np.mean(AUC))
          + ' ( std : ' + str(np.std(AUC)) + ' )')

    # Export the performance scores to a csv file.
    df = pd.DataFrame({'acc_coarse': acc, 'AUC_coarse': AUC})
    df.to_csv('name_file.csv', encoding='utf-8', index=False)
proba1 = [i for index, i in enumerate(knn.predict_proba(testX))] print (pd.Series(proba1))''' #Testing test = pd.read_csv('data/test.csv', encoding='utf-8') '''sk = SelectKBest(f_regression, k=60) sk.fit(X,y) X = sk.transform(X) test = sk.transform(test)''' rfe = RFECV(LinearSVC(), step = 1) rfe.fit(X, y) X = rfe.transform(X) test = rfe.transform(test) knn = KNeighborsClassifier(n_neighbors=90, leaf_size=10, p=2) knn.fit(X, y) pred = np.array(knn.predict(test)) proba = [i for index, i in enumerate(knn.predict_proba(test))] print (pd.Series(proba)) probadf = pd.DataFrame(proba, columns=['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']) nrow = probadf.shape[0]+1 ids = pd.Series(np.arange(nrow)) ids = ids.drop(0) result = pd.concat([ids, probadf], axis=1) result.to_csv('Submission.csv', header=True, index=None) import scipy as sp
# Select features using RFECV on the training data only.
rfe = RFECV(estimator=classifier, cv=5, step=2, scoring='f1')
print("going to select optimal features")
rfe.fit(normalized_matrix_train, y_all[train])

# Rank 1 marks the selected features.
ranked_features = rfe.ranking_.tolist()
index = [i for i, rank in enumerate(ranked_features) if rank == 1]
print("index is" + str(index))

# transform() returns a NEW reduced matrix and leaves its argument
# untouched, so the result must be kept and used for training and scoring.
selected_matrix_train = rfe.transform(normalized_matrix_train)
classifier.fit(selected_matrix_train, y_all[train])

selected_matrix_test = rfe.transform(normalised_matrix_test)
probas_ = classifier.predict_proba(selected_matrix_test)

########## ADDING VARIABLES FOR CLASSIFICATION REPORT HERE ####################
y_proba_report.extend(probas_)
y_predicted2 = classifier.predict(selected_matrix_test)
print("f1-score for this set of features is: " + str(f1_score(y_all[test], y_predicted2)))
clf_score = classifier.score(selected_matrix_test, y_all[test])
print("score for this set of features is: " + str(clf_score))
y_predicted_report.extend(y_predicted2)
class trainModel(object):
    ''' model training - a-site prediction

    Trains a classifier that predicts the "asite" column from the remaining
    columns of ``<asiteFn>.txt``, applies it to ``<cdsFn>.txt`` and converts
    the predictions into per-position ribosome counts.
    '''

    def __init__(self, asiteFn=None, cdsFn=None, cdsIdxFn=None, classifier="rf", RelE=None):
        """Store the file name prefixes and options; no I/O happens here.

        asiteFn   Prefix of the training table (``.txt`` is appended).
        cdsFn     Prefix of the table to predict on (``.txt`` is appended).
        cdsIdxFn  Path of the position index table read by recoverAsite.
        classifier "rf" selects the random forest in rocCurve; any other
                  value selects the SVM.
        RelE      Truthy switches class labels and coordinate arithmetic to
                  the alternative branch in rocCurve / recoverAsite.
        """
        self.asiteFn = asiteFn
        self.cdsFn = cdsFn
        self.cdsIdxFn = cdsIdxFn
        self.classifier = classifier
        self.RelE = RelE

    def rfFit(self):
        """Fit a random forest, select features with RFECV and refit on them."""
        self.traning = pd.read_table(self.asiteFn + ".txt", header=0)
        # column names
        self.colNames = list(self.traning.columns.values)
        self.colNames.remove("asite")
        dummies = pd.get_dummies(self.traning[self.colNames])
        # Remember the one-hot column layout so prediction data can be
        # aligned to it later.
        self.featureNames = dummies.columns
        self.X = np.array(dummies)
        self.y = np.array(self.traning["asite"])
        ## feature selection
        self.clf = RandomForestClassifier(max_features=None, n_jobs=-1)
        self.clf = self.clf.fit(self.X, self.y)
        self.importances = self.clf.feature_importances_
        self.selector = RFECV(self.clf, step=1, cv=5)
        self.selector = self.selector.fit(self.X, self.y)
        self.sltX = self.selector.transform(self.X)
        print("[result]\tOptimal number of features by recursive selection: %d"
              % self.selector.n_features_, flush=True)
        ## define a new classifier for reduced features
        self.reducedClf = RandomForestClassifier(max_features=None, n_jobs=-1)
        self.reducedClf = self.reducedClf.fit(self.sltX, self.y)
        ## cross validation
        scores = cross_val_score(self.reducedClf, self.sltX, self.y, cv=10)
        print("[result]\tAccuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2),
              flush=True)

    def rfImportance(self):
        """Plot the feature importances of the full random forest to a PDF."""
        ## compute the std and index for the feature importance
        std = np.std([tree.feature_importances_ for tree in self.clf.estimators_], axis=0)
        idx = np.argsort(self.importances)[::-1]
        featureNames = pd.get_dummies(self.traning[self.colNames]).columns.values
        importantFeatures = featureNames[idx]
        ## Plot the feature importances of the classifier
        nFeatures = self.X.shape[1]
        plt.figure()
        plt.title("Feature importances")
        plt.bar(range(nFeatures), self.importances[idx],
                color=sns.xkcd_rgb["denim blue"], yerr=std[idx], align="center")
        plt.xticks(range(nFeatures), importantFeatures, rotation='vertical')
        plt.xlim([-1, 10])
        plt.ylim([0, 1])
        plt.savefig(self.asiteFn + ".feature_importances.pdf", facecolor="white")

    def rfPredict(self):
        """Predict the a-site for every row of the cds table."""
        ## create df for cds
        self.cds = pd.read_table(self.cdsFn + ".txt", header=0)
        # Align the one-hot columns to the training layout: categories that
        # are absent here become all-zero columns, unseen ones are dropped.
        # Without this the selector receives a differently shaped matrix.
        cdsDummies = pd.get_dummies(self.cds[self.colNames])
        cdsDummies = cdsDummies.reindex(columns=self.featureNames, fill_value=0)
        cdsX = np.array(cdsDummies)
        ## selected a subset of features and predict a-site
        sltcdsX = self.selector.transform(cdsX)
        self.cds["asite"] = self.reducedClf.predict(sltcdsX)

    def svmFit(self):
        """Grid-search the SVM C parameter and cross-validate the best value.

        Reads self.X / self.y, which are set by rfFit.
        """
        ## grid search
        self.clf = svm.SVC()
        paramGrid = [{'C': [0.01, 0.1, 1, 10, 100, 1000, 10000]}]
        self.clfGs = GridSearchCV(estimator=self.clf, param_grid=paramGrid, n_jobs=-1)
        self.clfGs.fit(self.X, self.y)
        print("[result]\t best estimator parameters: c=", self.clfGs.best_estimator_.C,
              flush=True)
        ## model fitting and cross validation
        self.clf = svm.SVC(C=self.clfGs.best_estimator_.C)
        scores = cross_val_score(self.clf, self.X, self.y, cv=10)
        print("[result]\tAccuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2),
              flush=True)

    def rocCurve(self):
        ''' plotting multi-class roc curve '''
        clf = self.reducedClf if self.classifier == "rf" else self.clf
        self.OvrClf = OneVsRestClassifier(clf)
        classes = list(range(9, 19)) if not self.RelE else list(range(1, 9))
        # Binarise into a local: overwriting self.y would corrupt the labels
        # for every later fit and for a second call of this method.
        yBin = label_binarize(self.y, classes=classes)
        nClasses = yBin.shape[1]
        # shuffle and split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, yBin, test_size=.5, random_state=0)
        fitted = self.OvrClf.fit(X_train, y_train)
        if self.classifier == "rf":
            y_score = fitted.predict_proba(X_test)
        else:
            y_score = fitted.decision_function(X_test)
        # Compute ROC curve and ROC area for each class
        fpr, tpr, roc_auc = {}, {}, {}
        for i in range(nClasses):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])
        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
        # Plot ROC curve
        sns.reset_orig()
        plt.clf()
        plt.figure()
        plt.plot(fpr["micro"], tpr["micro"], '--', linewidth=3,
                 label='micro-average (area = {0:0.2f})'.format(roc_auc["micro"]))
        for i, pos in enumerate(classes[:nClasses]):
            plt.plot(fpr[i], tpr[i],
                     label='A-site @ {0} (area = {1:0.2f})'.format(pos, roc_auc[i]))
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate', fontsize=18)
        plt.ylabel('True Positive Rate', fontsize=18)
        plt.tick_params(axis='both', which='major', labelsize=18)
        plt.legend(loc="lower right", fontsize=12)
        plt.savefig(self.asiteFn + ".roc.pdf")

    def recoverAsite(self):
        """Turn predicted a-sites into ribosome counts per indexed position.

        Writes ``<cdsFn>.model_input.txt`` (tab separated). Positions of the
        index table without any read get a count of 0.
        """
        cds = self.cds
        plus = cds['gene_strand'] == '+'
        minus = cds['gene_strand'] == '-'
        ## adjust by the a-site location and calculate the a-site location in
        ## nt space, -1 is the missing value
        if not self.RelE:
            cds['a_start'] = np.where(plus, cds['start'] + cds['asite'], -1).astype(int)
            cds['a_end'] = np.where(plus, cds['a_start'] + 3,
                                    cds['end'] - cds['asite']).astype(int)
        else:
            cds['a_start'] = np.where(plus, cds['end'] - cds['asite'], -1).astype(int)
            cds['a_end'] = np.where(plus, cds['a_start'] + 3,
                                    cds['start'] + cds['asite']).astype(int)
        # On the minus strand the start is derived from the end.
        cds['a_start'] = np.where(minus, cds['a_end'] - 3, cds['a_start']).astype(int)
        # remove start/end for reads
        cds.drop(['start', 'end'], axis=1, inplace=True)
        ## group by a-site position to retrieve ribosome coverage
        cnt = cds.groupby(["chrom", "a_start", "a_end", "strand"])
        cnt = cnt.size().reset_index(name="ribosome_count")
        ## left outer join the index table with the counts to get ribosome
        ## counts at each position
        cdsIdx = pd.read_table(self.cdsIdxFn, header=0)
        riboCnt = pd.merge(cdsIdx, cnt, how="left",
                           left_on=["chrom", "start", "end", "gene_strand"],
                           right_on=["chrom", "a_start", "a_end", "strand"])
        riboCnt.drop(['a_start', 'a_end', 'strand'], axis=1, inplace=True)
        # Assign the filled column back: an in-place fillna on a selected
        # column may act on a temporary copy and leave the NaNs in place.
        riboCnt["ribosome_count"] = riboCnt["ribosome_count"].fillna(0).astype(int)
        riboCnt = riboCnt.sort_values(by=["chrom", "start", "end"])
        riboCnt.to_csv(path_or_buf=self.cdsFn + '.model_input.txt', sep='\t',
                       header=True, index=False)
y_true, y_pred = y_test, clf.predict(X_test) print(classification_report(y_true, y_pred)) print() print(confusion_matrix(y_true, y_pred)) print(best_score ,clf.best_score_) if i == 1: break else: best_score = clf.best_score_ # remove some features rfecv = RFECV(estimator=clf.best_estimator_, step=1, cv=2, scoring='accuracy') rfecv.fit(X_train, y_train) print("Optimal number of features : %d" % rfecv.n_features_) X_train = rfecv.transform(X_train) X_test = rfecv.transform(X_test) for j in range(5): print(j) magDict = {} with hdf.File('./truth/truth'+str(j).zfill(2)+'_Oii.hdf5', 'r') as f: dset = f['truth%s_Oii' % (str(j).zfill(2))] magDict['u'] = dset['OMAG'][:,0] # u band magDict['g'] = dset['OMAG'][:,1] # g band magDict['r'] = dset['OMAG'][:,2] # r band magDict['i'] = dset['OMAG'][:,3] # i band magDict['z'] = dset['OMAG'][:,4] # z band # we only want the g mag < 22 galaxies mask = np.where(magDict['g'] < 22)[0]
print("Reduced number of features:", features_kbest.shape[1])

# Keep the top 75% of features by ANOVA F-value.
fvalue_selector = SelectPercentile(f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

# Recursively eliminating features
warnings.filterwarnings(action="ignore", module="scipy",
                        message="^internal gelsd")
# Synthetic regression problem: 100 features, only 2 of them informative.
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)
ols = linear_model.LinearRegression()
# Drop one feature per step, scored by negative mean squared error.
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)
print(rfecv.transform(features))
print(rfecv.n_features_)
print(rfecv.support_)
print(rfecv.ranking_)
elif classifier == "RandomForest": estimator = RandomForestClassifier(max_depth=2) #clf1 = RandomForestClassifier() elif classifier == "Adaboost": estimator = AdaBoostClassifier() #clf1 = AdaBoostClassifier() else: estimator = LogisticRegression() #clf1 = LogisticRegression() rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(2)) rfecv.fit(x_data, y_data) print('number of features selected:', rfecv.n_features_) x_new = rfecv.transform(x_data) #### Extract the important features ########### #selected_inds = rfecv.get_support(indices=True) #feat_coefs = rfecv.estimator_.coef_ #print(feat_coefs) #selected_feats = [training_head[ind] for ind in selected_inds] #selected_vals = list(zip(selected_feats, feat_coefs[0])) #feature_frame_selected = pd.DataFrame(selected_vals, columns=['selected_features','Coefficients']) #feature_frame_selected = feature_frame_selected.sort_values(["ranking"],ascending=False) #feat_file = 'selected_features_rfecv_finalresults_1_19_19/withmetrics/final_feature_set_'+filename+"_"+cur_model+"_"+device+".csv" #feature_frame_selected.to_csv(feat_file)
class FeatureVectorsClassifier(Model):
    """
    This class represents a task 1, subtask A model that trains pattern
    recognition classifiers on feature vectors extracted from the individual
    images.

    Pipeline: SelectKBest(chi2) preselection -> StandardScaler -> RFECV
    feature selection (skipped for SVC) -> classifier, optionally tuned with
    GridSearchCV when a parameter grid exists for its class.
    """

    def __init__(self, classifier, kwargs):
        """Constructs a supervised task1, subtask A model based on pattern
        recognition classifiers.

        Parameters:
            classifier      The provided supervised classifier.
            kwargs          The parameters for feature generation.
        """
        assert classifier in CLASSIFIERS
        self.classifier = classifier
        self.features = Features(**kwargs)
        self.scaler = StandardScaler()
        self.feature_preselector = SelectKBest(chi2, k=NUM_FEATURES)
        self.feature_selector = RFECV(
            self.classifier, scoring=SCORING, cv=self._make_cv(), n_jobs=-1)

    @staticmethod
    def _make_cv():
        """Returns the cross-validation splitter shared by selection and tuning.

        No random_state is passed: the folds are not shuffled, so a seed has
        no effect, and scikit-learn rejects that combination with ValueError.
        """
        return StratifiedKFold(NUM_FOLDS)

    def _uses_feature_selector(self):
        """Tells whether RFECV selection applies (it is skipped for SVC)."""
        return not isinstance(self.classifier, SVC)

    def fit(self, videos):
        """Builds (page, screen) training samples from the videos and fits
        the preselector, scaler, feature selector and classifier.

        Parameters:
            videos          The training videos; each provides `screens` and
                            `pages`, and each screen its `matching_pages`.
        """
        LOGGER.debug("Preparing training samples for %s ...", self)
        X = []
        y = []
        for video_num, video in enumerate(videos):
            LOGGER.debug("Processing video number %d / %d ...", video_num + 1, len(videos))
            for screen in video.screens:
                for page in video.pages:
                    LOGGER.debug("Processing (%s, %s) ...", page, screen)
                    X.append(self.features.get_pairwise_features(page, screen))
                    y.append(1 if page in screen.matching_pages else 0)
                    LOGGER.debug("Done processing (%s, %s).", page, screen)
            LOGGER.debug("Done processing video number %d / %d.", video_num + 1, len(videos))
        LOGGER.debug("Done preparing training samples for %s.", self)

        # Guard the log arguments so empty input reaches the estimator's own
        # validation instead of failing here with IndexError.
        LOGGER.debug("Fitting the feature preselector (%d samples, %d features) ...",
                     len(X), len(X[0]) if X else 0)
        self.feature_preselector.fit(X, y)
        X = self.feature_preselector.transform(X)
        LOGGER.debug("Done fitting the feature preselector (%d features).", X.shape[1])

        LOGGER.debug("Fitting the feature scaler ...")
        self.scaler.fit(X)
        X = self.scaler.transform(X)
        LOGGER.debug("Done fitting the feature scaler.")

        use_selector = self._uses_feature_selector()
        if use_selector:
            LOGGER.debug("Fitting the feature selector (%d samples, %d features) ...",
                         *X.shape)
            self.feature_selector.fit(X, y)
            X = self.feature_selector.transform(X)
            LOGGER.debug("Done fitting the feature selector. (%d features)", X.shape[1])

        if use_selector and type(self.classifier) in PARAM_GRIDS:
            LOGGER.debug(
                "Optimizing the classifier parameters and fitting the classifier ...")
            param_grid = PARAM_GRIDS[type(self.classifier)]
            optimizer = GridSearchCV(self.classifier, param_grid, scoring=SCORING,
                                     refit=True, cv=self._make_cv())
            optimizer.fit(X, y)
            self.classifier = optimizer.best_estimator_
            LOGGER.debug(
                "Done optimizing the classifier parameters and fitting the classifier.")
        else:
            LOGGER.debug("Fitting the classifier ...")
            self.classifier.fit(X, y)
            LOGGER.debug("Done fitting the classifier.")

    def predict(self, observations):
        """Produces one confidence ranking of the pages per screen.

        Parameters:
            observations    Pairs of (screen_video, page_video).

        Returns:
            A list with one array of page confidences per screen, in the
            order the screens are visited.
        """
        rankings = []
        for observation_num, (screen_video, page_video) in enumerate(observations):
            LOGGER.debug("Processing observation number %d / %d ...",
                         observation_num + 1, len(observations))
            screens = screen_video.screens
            pages = page_video.pages
            for screen in screens:
                LOGGER.debug("Processing %s ...", screen)
                X = []
                for page in pages:
                    LOGGER.debug("Processing %s ...", page)
                    X.append(self.features.get_pairwise_features(page, screen))
                    LOGGER.debug("Done processing %s.", page)
                # Same transformation chain as in fit().
                X = self.feature_preselector.transform(X)
                X = self.scaler.transform(X)
                if self._uses_feature_selector():
                    X = self.feature_selector.transform(X)
                ranking = self._predict_confidence(X)
                rankings.append(ranking)
                LOGGER.debug("Done processing %s.", screen)
            LOGGER.debug("Done processing observation number %d / %d.",
                         observation_num + 1, len(observations))
        return rankings

    def _predict_confidence(self, X):
        """Produces confidence scores for class 1 for each of the provided
        feature vectors.

        Parameters:
            X               The list of provided feature vectors.
        """
        # hasattr (unlike a name lookup in dir()) only reports methods the
        # estimator can actually provide in its current configuration.
        has_decision_function = hasattr(self.classifier, "decision_function")
        assert has_decision_function or hasattr(self.classifier, "predict_proba")
        if has_decision_function:
            confidence = self.classifier.decision_function(X)
        else:
            confidence = self.classifier.predict_proba(X)[:, 1]
        return confidence

    def _filename(self):
        return "%s.%s-%s-%s" % (__name__, self.__class__.__name__,
                                repr(self.features),
                                self.classifier.__class__.__name__)

    def __repr__(self):
        return "Feature vectors classifier (%s, %s)" % (
            self.features, self.classifier.__class__.__name__)
plt.show()

print(
    "\n--------------------------Recursive feature elimination-------------------------\n"
)
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 10-fold CV around a logistic regression.
rfe = RFECV(estimator=LogisticRegression(random_state=rs), cv=10)
rfe.fit(X_train_log, y_train_log)  # run the RFECV

# Compare how many variables there are before and after.
print("Original feature set", X_train_log.shape[1])
print("Number of features after elimination", rfe.n_features_)

# Reduce both splits to the selected features.
X_train_sel_log = rfe.transform(X_train_log)
X_test_sel_log = rfe.transform(X_test_log)

print("Features sorted by their rank:")
rounded_ranks = (round(rank, 4) for rank in rfe.ranking_)
print(sorted(zip(rounded_ranks, feature_names)))

# Run the grid search CV on the transformed dataset.
cv.fit(X_train_sel_log, y_train_log)

print(
    "\n--------------------------Test the best model-------------------------\n"
)
# Test the best model.
print("Train accuracy:", cv.score(X_train_sel_log, y_train_log))
print("Test accuracy:", cv.score(X_test_sel_log, y_test_log))
import warnings

warnings.filterwarnings("ignore")

# Recursive feature elimination with 10-fold CV around a decision tree.
tree_estimator = DecisionTreeClassifier(
    random_state=rs, max_depth=8, min_samples_leaf=30)
rfe2 = RFECV(estimator=tree_estimator, cv=10)
rfe2.fit(X_train, y_train)
print("Original feature set", X_train.shape[1])
print("number of features after elimination", rfe2.n_features_)
#Before: 100, after: 12

# In[790]:

# The dataset reduced by the RFE fitted with the logistic regression model.
X_train_sel = rfe.transform(X_train)
X_test_sel = rfe.transform(X_test)

# In[791]:

# The dataset reduced by the RFE fitted with the decision tree model.
X_train_nn = rfe2.transform(X_train)
X_test_nn = rfe2.transform(X_test)

# In[843]:

print(rfe2.support_)
# Train/test errors of the previous model (target Y2): MSE, MAE, R^2.
print(mean_squared_error(y_Train2, y_hat))
print(mean_squared_error(y_Test2, y_hatTest))
print(mean_absolute_error(y_Train2, y_hat))
print(mean_absolute_error(y_Test2, y_hatTest))
print(r2_score(y_Train2, y_hat))
print(r2_score(y_Test2, y_hatTest))

# In[98]:

# RandomForest Regressor y1: recursive feature elimination with 3-fold CV,
# one feature per step, scored by negative mean squared error.
rfrmodel = RFECV(
    RandomForestRegressor(),
    cv=3,
    scoring='neg_mean_squared_error',
    step=1,
)
rfrmodel.fit(x_Train, y_Train1)

# Show the shape before and after selection and the selection mask.
x_rfr = rfrmodel.transform(x_Train)
print(x_Train.shape)
print(x_rfr.shape)
print(rfrmodel.support_)

# Predict with the fitted selector on both splits.
y_hatRFR = rfrmodel.predict(x_Train)
y_hatRFRTest = rfrmodel.predict(x_Test)

print('Random Forrest Regression Results (Y1)')
print(mean_squared_error(y_Train1, y_hatRFR))
print(mean_squared_error(y_Test1, y_hatRFRTest))
print(mean_absolute_error(y_Train1, y_hatRFR))
print(mean_absolute_error(y_Test1, y_hatRFRTest))
print(r2_score(y_Train1, y_hatRFR))
print(r2_score(y_Test1, y_hatRFRTest))

# In[99]:
#%% =============================Feature engineering================================
# Dimensionality reduction: keep enough principal components for 95% of the
# variance; the PCA is fitted on the training split only.
st = time.time()
pca = PCA(n_components=0.95, random_state=666)
feature_train_ = pca.fit_transform(feature_train_)
feature_validation_ = pca.transform(feature_validation_)
feature_test_ = pca.transform(feature_test_)
et = time.time()
print(f"Running time of pca is {et-st:.3f}")

# Select features by recursive feature elimination, removing 20% of the
# features per step; the selector is fitted on the training split only.
st = time.time()
selector = RFECV(LinearSVC(random_state=666), step=0.2, cv=5, n_jobs=3)
selector = selector.fit(feature_train_, label_train)
feature_train_ = selector.transform(feature_train_)
feature_validation_ = selector.transform(feature_validation_)
feature_test_ = selector.transform(feature_test_)
et = time.time()
print(f"Running time of RFECV is {et-st:.3f}")

#%% =============================Train the model================================
# Train a single model
model = LinearSVC(C=1, random_state=666)
model.fit(feature_train_, label_train)

# # Model ensembling
# clf1 = LogisticRegression(random_state=666)
# clf2 = RidgeClassifier(random_state=666)
# clf3 = LinearSVC(C=1, random_state=666)
# clf4 = SVC(C=1, kernel="sigmoid")
def test_rfecv():
    """Check RFECV on iris data padded with six random-noise columns.

    Covers dense and sparse input, a custom loss scorer, a named scorer, a
    constant scorer (score ties) and integer as well as fractional ``step``
    values. In every selection check the transformed matrix must equal the
    original iris columns.
    """
    rng = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, rng.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported
    n_columns = X.shape[1]

    def linear_rfecv(**options):
        # Every case wraps the same linear-kernel SVC.
        return RFECV(estimator=SVC(kernel="linear"), **options)

    def assert_sparse_selection(step):
        # Fit on a CSR copy of X and expect exactly the iris columns back.
        rfecv_sparse = linear_rfecv(step=step)
        X_sparse = sparse.csr_matrix(X)
        rfecv_sparse.fit(X_sparse, y)
        X_r_sparse = rfecv_sparse.transform(X_sparse)
        assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using the score function
    rfecv = linear_rfecv(step=1)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert len(rfecv.grid_scores_) == n_columns
    assert len(rfecv.ranking_) == n_columns
    # All the noisy variables were filtered out
    assert_array_equal(rfecv.transform(X), iris.data)

    # same in sparse
    assert_sparse_selection(step=1)

    # Test using a customized loss function
    loss_scorer = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = linear_rfecv(step=1, scoring=loss_scorer)
    ignore_warnings(rfecv.fit)(X, y)
    assert_array_equal(rfecv.transform(X), iris.data)

    # Test using a scorer
    rfecv = linear_rfecv(step=1, scoring=get_scorer('accuracy'))
    rfecv.fit(X, y)
    assert_array_equal(rfecv.transform(X), iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = linear_rfecv(step=1, scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))
    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert rfecv.n_features_ == 1

    # Same as the first two tests, but with step=2
    rfecv = linear_rfecv(step=2)
    rfecv.fit(X, y)
    assert len(rfecv.grid_scores_) == 6
    assert len(rfecv.ranking_) == n_columns
    assert_array_equal(rfecv.transform(X), iris.data)

    assert_sparse_selection(step=2)

    # Verifying that steps < 1 don't blow up.
    assert_sparse_selection(step=.2)
# plt.figure()
# plt.plot(lr_coef_mean.T, 'b', linewidth=1)
# plt.plot(lr_coef_mean.T + lr_coef_sem.T, 'b--', linewidth=1)
# plt.plot(lr_coef_mean.T - lr_coef_sem.T, 'b--', linewidth=1)
# plt.xticks(np.arange(0, 168, 1), labels, rotation='vertical')
# plt.margins(0.4)
# # Tweak spacing to prevent clipping of tick-labels
# plt.subplots_adjust(bottom=0.15)

# Select features with cross-validated RFE scored by ROC AUC on 9 stratified folds.
rfecv = RFECV(
    estimator=lr_mean,
    step=1,
    cv=StratifiedKFold(9),
    scoring='roc_auc',
)
X_rfecv = rfecv.fit(X, y).transform(X)

# Cross-validated AUC of the base model on the reduced feature matrix.
rfecv_scores = cross_val_score(
    lr_mean,
    X_rfecv,
    y,
    scoring="roc_auc",
    cv=StratifiedKFold(9),
)

# Permutation test (2000 label shuffles) for the same model and features.
_permutation_result = permutation_test_score(
    lr_mean,
    X_rfecv,
    y,
    scoring="roc_auc",
    cv=StratifiedKFold(9),
    n_permutations=2000,
    n_jobs=2,
)
score_rfecv, perm_scores_rfecv, pvalue_rfecv = _permutation_result
def rfe(self, n=None, rfe_model=None):
    """Prune the columns of ``self.X`` with cross-validated recursive feature elimination.

    Parameters
    ----------
    n : int or str
        An ``int`` is forwarded to ``RFECV`` as ``min_features_to_select``.
        A string is stripped of whitespace, lower-cased and cut to three
        characters; ``'opt'`` triggers a search over candidate minimums,
        any other string leaves ``self.X`` untouched. The default ``None``
        is neither an int nor a string and raises ``AttributeError``.
    rfe_model : estimator, optional
        Estimator handed to ``RFECV``. When omitted a
        ``LogisticRegression(solver='lbfgs')`` is used. In ``'opt'`` mode a
        custom model is not supported yet and the call is a no-op.

    Returns
    -------
    None
        ``self.X`` is replaced in place by its selected columns.
    """

    def fit_selector(model, min_features, X, y):
        # One shuffled, seeded 5-fold RFECV fit; shared by every branch below.
        kfold = StratifiedKFold(n_splits=5, shuffle=True,
                                random_state=self.random_state)
        return RFECV(estimator=model, min_features_to_select=min_features,
                     cv=kfold, n_jobs=-1).fit(X, y)

    if isinstance(n, int):
        if rfe_model is None:
            from sklearn.linear_model import LogisticRegression
            rfe_model = LogisticRegression(solver='lbfgs')
        selector = fit_selector(rfe_model, n, self.X, self.y)
        # The custom-model path used to index `self.iloc`, which does not exist.
        self.X = self.X.iloc[:, selector.get_support(indices=True)]
        return

    n = ''.join(n.split()).lower()[:3]
    if n != 'opt':
        return
    if rfe_model is not None:
        # allow for other model support
        return

    # Imported before the loop so it is also bound when the loop body never runs.
    from sklearn.linear_model import LogisticRegression

    n_cols = self.X.shape[1]
    if n_cols > 1000:
        steps = int((round(floor(n_cols), -3) / 1000) * 8)
    else:
        steps = 1
    nof_list = np.arange(n_cols - 1, 1, step=-steps)
    # Progress is reported roughly every 10% of the columns; the step is
    # clamped to 1 because a step of 0 (fewer than 10 columns) makes
    # np.arange raise.
    check_point = np.arange(1, n_cols - 1, step=max(1, floor(0.1 * n_cols)))

    high_score = 0
    n_best = 0  # minimum-feature setting with the best hold-out score so far
    X_train, X_test, y_train, y_test = train_test_split(
        self.X, self.y, test_size=0.2, random_state=self.random_state)
    print("Optimizing...")
    for i in nof_list:
        num_col = X_train.shape[1]
        selector = fit_selector(LogisticRegression(solver='lbfgs'), i,
                                X_train, y_train)
        X_train_rfe = selector.transform(X_train)
        X_test_rfe = selector.transform(X_test)
        cols_kept = selector.get_support(indices=True)
        drop_cols = np.setdiff1d(np.arange(num_col), cols_kept)

        model = LogisticRegression(solver='lbfgs')
        model.fit(X_train_rfe, y_train)
        preds = model.predict(X_test_rfe)
        score = roc_auc_score(y_test, preds)
        if score > high_score:
            high_score = score
            n_best = i
        if i in check_point:
            print(" Best n so far: {} \n Score: {} \n".format(n_best, high_score))

        # Carry only the surviving columns into the next, smaller candidate.
        X_train = X_train.drop(X_train.columns[drop_cols], axis=1)
        X_test = X_test.drop(X_test.columns[drop_cols], axis=1)

    print("Optimal n: {} \n Score: {} \n".format(n_best, high_score))
    # Final selection on the full data with the best minimum found.
    selector = fit_selector(LogisticRegression(solver='lbfgs'), n_best,
                            self.X, self.y)
    self.X = self.X.iloc[:, selector.get_support(indices=True)]
def main():
    """Interactive train-and-predict workflow driven by console prompts.

    Reads a training and a test CSV from ``./data``, asks for the target
    column, the id column and columns to drop, one-hot encodes and imputes
    the training features, selects features with ``RFECV`` over a random
    forest, scores candidate models through ``learn_and_score``, loads the
    model chosen by the user from ``./model`` and writes its predictions for
    the test data to ``./pred``.
    """
    # Adjust the file names as needed before use.
    print('学習データは [./data]配下に格納してください')
    print('学習データのファイル名(csv)を入力してください')
    train_file = input('>> ')
    print('検証データは [./data]配下に格納してください')
    print('検証データのファイル名(csv)を入力してください')
    test_file = input('>> ')

    # `sep` is keyword-only in pandas 2.x, so it must not be passed positionally.
    df = pd.read_csv('./data/' + train_file, sep=',')
    df_test = pd.read_csv('./data/' + test_file, sep=',')
    print(df.nunique(dropna=False))

    print('正解データの列名を入力してください')
    print(df.columns.values)
    ans_col = [input('>> ')]
    # Target column(s), kept as a one-column DataFrame.
    y_train = df.loc[:, ans_col]

    print('検証データのid名を入力してください')
    print(df_test.columns.values)
    test_id = input('>> ')
    y_id = pd.DataFrame(df_test.loc[:, test_id])

    print(df.dtypes)
    print('不要な列名を入力してください(複数ある場合は「,」で区切って入力)')
    print(df.columns.values)
    drop_col = input('>> ')
    # Always a list: empty input means nothing to drop.
    drop_columns = drop_col.split(',') if drop_col else []

    # Remove the unwanted columns from both data sets.
    if drop_columns:
        df = df.drop(drop_columns, axis=1)
        df_test = df_test.drop(drop_columns, axis=1)

    # Remove the target from the training features.
    df = df.drop(ans_col, axis=1)

    # Categorical variables: every object-dtype column.
    list_category = [col for col in df.columns if df[col].dtypes == object]
    print('カテゴリ変数')
    print(list_category)

    #####################
    # ----- model ----- #
    #####################
    print('モデル用の前処理開始')
    df_ohe = one_hot_encoding(df, list_category)
    print('ワンホットエンコーディング後サイズ:' + str(df_ohe.shape))

    imp = SimpleImputer()
    imp.fit(df_ohe)
    df_ohe = pd.DataFrame(imp.transform(df_ohe), columns=df_ohe.columns.values)

    rf = RandomForestClassifier(random_state=1)
    rf.fit(df_ohe, y_train)

    # Feature selection
    #select = RFECV(RandomForestClassifier(n_estimators=100, random_state=1), min_features_to_select=10,step=0.05)
    select = RFECV(estimator=rf)
    select.fit(df_ohe, y_train)

    # Size after feature selection
    X_train = pd.DataFrame(select.transform(df_ohe),
                           columns=df_ohe.columns.values[select.support_])
    print('前処理完了後サイズ:' + str(X_train.shape))

    # Importances come from the forest fitted on all features above.
    importances = pd.DataFrame({
        "features": df_ohe.columns,
        "importances": rf.feature_importances_,
        "select": select.support_
    })
    print(importances)

    #####################
    # ---- scoring ---- #
    #####################
    print('スコア用の前処理開始')
    df_test_ohe = one_hot_encoding(df_test, list_category)
    print('ワンホットエンコーディング後サイズ:' + str(df_test_ohe.shape))

    # Align the test columns with the model's columns.
    X_test = check_columns(X_train, df_test_ohe)
    # NOTE(review): the imputer is re-fitted on the test matrix here, so test
    # data is imputed with its own statistics rather than the training ones.
    imp.fit(X_test)
    X_test = pd.DataFrame(imp.transform(X_test), columns=X_test.columns.values)
    print('前処理完了後サイズ:' + str(X_test.shape))

    #select_score = 'f1'
    select_score = 'roc_auc'
    scores = learn_and_score(X_train, y_train, select_score, train_file[:-4])
    print('選択評価指標:' + select_score)
    print('######## 評価結果 ########')
    print(pd.Series(scores).sort_values(ascending=False))

    print('検証に使用するモデルの略称入力してください')
    for key in scores:
        print(key)
    modelname = input('>>')

    model = load_model('./model/' + train_file[:-4] + '_' + modelname + '_learned.pkl')

    # Predicted probabilities
    #pre = pd.DataFrame(model.predict_proba(X_test), columns=ans_col)
    # Predictions
    pre = pd.DataFrame(model.predict(X_test), columns=ans_col)
    score = y_id.join(pre)

    # Create the output directory if needed (no error when it already exists).
    os.makedirs('./pred', exist_ok=True)
    pred_name = './pred/' + test_file[:-4] + '_' + modelname + '_pred.csv'
    score.to_csv(pred_name, index=False)
    print('検証結果を' + pred_name + 'に保存しました')
# Load libraries
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
import warnings

# Hide the scipy "internal gelsd" warning, which is noisy but harmless.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd",
)

# Synthetic regression problem: 100 features, only 2 of them informative.
X, y = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Ordinary least squares as the base estimator.
ols = linear_model.LinearRegression()

# Cross-validated recursive feature elimination scored by negative MSE,
# fitted straight away (fit returns the selector itself).
rfecv = RFECV(
    estimator=ols,
    step=1,
    scoring='neg_mean_squared_error',
).fit(X, y)

# Reduced feature matrix (shown as cell output, not stored).
rfecv.transform(X)

# Number of features kept.
rfecv.n_features_
# Select features with RFECV, fitted on the training data only.
#rfe = RFE(estimator=classifier, cv=5,n_features_to_select=10,step=2)
rfe = RFECV(estimator=classifier, cv=5, step=2, scoring='f1')
print("going to select optimal features")
rfe.fit(normalized_matrix_train, y_all[train])
ranked_features = (rfe.ranking_).tolist()
#print("shape of train matrix after rfe.fit is: " +str(normalized_matrix_train.shape))

# Rank 1 marks a selected feature; compare by value, not identity.
index = [i for i, rank in enumerate(ranked_features) if rank == 1]
print("index is" + str(index))

# transform() returns a new matrix and leaves its argument untouched, so the
# reduced matrices have to be captured and used for training and scoring.
train_selected = rfe.transform(normalized_matrix_train)
#print("shape of transformed train matrix is: " +str(train_selected.shape))
classifier.fit(train_selected, y_all[train])

test_selected = rfe.transform(normalised_matrix_test)
#print("shape of transformed test matrix is: " +str(test_selected.shape))
probas_ = classifier.predict_proba(test_selected)

########## ADDING VARIABLES FOR CLASSIFICATION REPORT HERE ####################
y_proba_report.extend(probas_)
y_predicted2 = classifier.predict(test_selected)
print("f1-score for this set of features is: " + str(f1_score(y_all[test], y_predicted2)))
clf_score = classifier.score(test_selected, y_all[test])
print("score for this set of features is: " + str(clf_score))
y_predicted_report.extend(y_predicted2)
y_test_report.extend(y_all[test])
def XGB_ModelBuilder(X_train, y_train, X_test, y_test, X_unknown=None):
    """Select features, tune an XGBoost classifier and predict.

    Runs ``RFECV`` over an ``XGBClassifier`` on the training data, tunes the
    classifier with a randomized search on the (possibly reduced) features,
    prints cross-validated ``neg_log_loss`` scores along the way and the log
    loss of the tuned model on the test data.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature tables (``.columns`` and ``pd.concat`` are used on them).
    y_train, y_test :
        Targets; concatenated with ``pd.concat`` when ``X_unknown`` is given.
    X_unknown : optional
        Unlabelled rows to predict after refitting on train + test. ``None``
        or an empty container skips that step.

    Returns
    -------
    tuple
        ``(predictions, predictions_final)``: class probabilities for
        ``X_test`` and class predictions for ``X_unknown`` (``[]`` when the
        final step is skipped).
    """
    # XGB_ModelBuilder.py
    # Created by KAC on 02/12/2020
    import numpy as np
    import pandas as pd
    from sklearn.feature_selection import RFECV
    from sklearn.metrics import log_loss
    from xgboost import XGBClassifier as XGB
    from sklearn.model_selection import cross_val_score, RandomizedSearchCV

    XGB_model = XGB()

    selector = RFECV(estimator=XGB_model, scoring='neg_log_loss', cv=5)
    selector.fit(X_train, y_train)

    # NOTE(review): this cross-validates the whole RFECV wrapper, not the bare
    # model on all features, and the value is a *negative* log loss - confirm
    # that this is what the label below is meant to report.
    CV_score = cross_val_score(selector, X_train, y_train,
                               scoring='neg_log_loss', cv=5)
    scr = np.mean(CV_score)

    print(
        pd.DataFrame({
            'Variable': X_train.columns,
            'Importance': selector.ranking_
        }).sort_values('Importance', ascending=True).head(50))
    print("Optimal number of features: ", selector.n_features_)
    print("Log Loss for All Features: ", scr)

    if selector.n_features_ < len(X_train.columns):
        X_train_transformed = selector.transform(X_train)
        X_test_transformed = selector.transform(X_test)
        # NOTE(review): re-runs RFECV on the already reduced matrix - confirm.
        CV_score = cross_val_score(selector, X_train_transformed, y_train,
                                   scoring='neg_log_loss', cv=5)
        scr = np.mean(CV_score)
        print("Log Loss for Selected Features on Training Data: ", scr)
    else:
        X_train_transformed = X_train
        X_test_transformed = X_test
        print(
            "Not optimal to remove features. Proceeding to parameter tuning.")

    # Current Best: {'subsample': 0.9, 'n_estimators': 250, 'min_child_weight': 2, 'max_depth': 8, 'learning_rate': 0.02, 'colsample_bytree': 0.85}
    parameters = {
        "learning_rate": [0.01, 0.015, 0.02, 0.025, 0.03],  #[0.01, 0.05, 0.1],
        "n_estimators": [250, 500, 600],  #[500, 750, 1000],
        "max_depth": [8, 9, 10, 12],  #[3, 6, 9],
        "min_child_weight": [2, 5, 8],  #[1, 2],
        "colsample_bytree": [0.7, 0.75, 0.8, 0.85],  #[0.5, 0.75, 1],
        "subsample": [0.9, 1]  #[0.5, 0., 1]
    }
    rsearch = RandomizedSearchCV(estimator=XGB_model,
                                 param_distributions=parameters,
                                 scoring='neg_log_loss',
                                 n_iter=250,
                                 cv=5)  #XGB_model
    rsearch.fit(X_train_transformed, y_train)
    print(rsearch.best_params_)

    CV_score = cross_val_score(rsearch, X_train_transformed, y_train,
                               scoring='neg_log_loss', cv=5)
    scr = np.mean(CV_score)
    print(
        "Log Loss for Selected Features and Parameter Tuning on Training Data: ",
        scr)

    predictions = rsearch.predict_proba(X_test_transformed)
    pred_scr = round(log_loss(y_test, predictions), 5)
    print("2019 Score: ", pred_scr)

    # An empty container is treated like None so the default call never
    # reaches the refit step with nothing to predict.
    if X_unknown is not None and len(X_unknown) > 0:
        # Use the fitted selector; calling transform on the RFECV class
        # itself cannot work.
        X_final = selector.transform(pd.concat([X_train, X_test]))
        y_final = pd.concat([y_train, y_test])
        X_unknown = selector.transform(X_unknown)

        rsearch.fit(X_final, y_final)
        predictions_final = rsearch.predict(X_unknown)
    else:
        predictions_final = []

    return predictions, predictions_final
# Baselines before feature selection: logistic regression, then random forest.
for model in (
    LogisticRegression(random_state=0),
    RandomForestClassifier(n_estimators=100, random_state=0),
):
    df = df.append(performanceEvaluation(model, X, y, cv, len(X[0])))

## Feature selection: Recursive Feature Elimination

# Logistic regression
model = LogisticRegression(random_state=0)
rfecv = RFECV(estimator=model, step=1, cv=cv, scoring='accuracy')
X_new = rfecv.fit(X, y).transform(X)
df_rfe = df_rfe.append(
    performanceEvaluation(model, X_new, y, cv, len(X_new[0]))
)

# =============================================================================
# Uncomment to get more insights
# #opt = rfecv.n_features_
# #num = rfecv.support_
# #sc = rfecv.grid_scores_
# #est = rfecv.estimator_
# =============================================================================

# Random forest
model = RandomForestClassifier(n_estimators=100, random_state=0)
rfecv = RFECV(estimator=model, step=1, cv=cv, scoring='accuracy')
# One row per entry of sat_data: reshape its first element to the lats grid,
# transpose, equalize and flatten. The final transpose turns rows into columns.
X = np.array([
    exposure.equalize_adapthist(
        layer[0].reshape(lats.shape).T,
        clip_limit=0.03,
    ).ravel()
    for layer in sat_data
]).T
Y = mask.ravel()
print("Train dataset is formed.")

# If no layers are defined, apply the recursive feature elimination procedure
# to the full set of features/layers; otherwise train on everything as-is.
features_mask = None
if any(SAT_LAYERS):
    print("Training the classifier... ")
    rf_clf.fit(X, Y)
else:
    print("Performing recursive feature ellimination...")
    rfecv_clf = RFECV(
        rf_clf,
        step=1,
        min_features_to_select=2,
        scoring='f1',
        n_jobs=3,
        verbose=1,
    )
    rfecv_clf.fit(X, Y)
    # Continue with the estimator RFECV fitted on the selected features.
    rf_clf = rfecv_clf.estimator_
    print("Selected features are: ", rfecv_clf.support_)
    features_mask = rfecv_clf.support_

# Cross-validate on whichever feature matrix the classifier was fitted on.
_X_eval = X if features_mask is None else rfecv_clf.transform(X)
scores = cross_val_score(rf_clf, _X_eval, Y, cv=5)
print("Score estimations are: ", scores)
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd",
)

# Generate the feature matrix and the target vector.
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Create the linear regression object.
ols = linear_model.LinearRegression()

# Recursively eliminate features (fit returns the selector itself).
rfecv = RFECV(
    estimator=ols,
    step=1,
    scoring="neg_mean_squared_error",
).fit(features, target)
rfecv.transform(features)

# In[33]:

# Number of best features
rfecv.n_features_

# In[34]:

# Which features are the best ones
rfecv.support_
selector.fit(train_set, train_labels) plt.plot(selector.grid_scores_) plt.xlabel("Number of Feature") plt.ylabel("Macro F1 Score") plt.title("Feature Selection Scores") print(selector.n_features_) rankings = pd.DataFrame({ "feature": list(train_set.columns), "rank": list(selector.ranking_) }).sort_values("rank") rankings.head(10) train_selected = selector.transform(train_set) test_selected = selector.transform(test_set) selected_features = train_set.columns[np.where(selector.ranking_ == 1)] train_selected = pd.DataFrame(train_selected, columns=selected_features) test_selected = pd.DataFrame(test_selected, columns=selected_features) model_results = cv_model(train_selected, train_labels, LinearSVC(), "LSVC-SEL", model_results) model_results = cv_model(train_selected, train_labels, GaussianNB(), "GNB-SEL", model_results) model_results = cv_model( train_selected, train_labels, MLPClassifier(hidden_layer_sizes=(32, 64, 128, 64, 32)), "MLP-SEL", model_results) model_results = cv_model(train_selected, train_labels,
from sklearn.metrics import accuracy_score, roc_auc_score

df1 = pd.read_csv('EventDetectionData.csv')
scores = []

# Try growing column windows 1..i of the table as the candidate feature set.
for i in range(150, 200):
    score = []
    X_train, X_test, y_train, y_test = train_test_split(
        df1.iloc[:, 1:i], df1['target'], test_size=0.3,
        random_state=69)  # 70% training and 30% test

    # Cross-validated recursive feature elimination scored by ROC AUC.
    log = LogisticRegression()
    rfecv = RFECV(estimator=log, step=1, cv=5, scoring='roc_auc')
    X_train_new = rfecv.fit_transform(X_train, y_train)
    X_test_new = rfecv.transform(X_test)
    j = rfecv.n_features_

    # Tune C and the penalty on the selected features.
    C_range = 10.**np.arange(-5, 1)
    penalty_options = ['l1', 'l2']
    param_grid = dict(C=C_range, penalty=penalty_options)
    # The default lbfgs solver rejects penalty='l1', which would make half of
    # the grid fail to fit; liblinear supports both penalties.
    grid = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid,
                        cv=5, scoring='roc_auc')
    grid.fit(X_train_new, y_train)
    y_train_pred = grid.predict(X_train_new)
labels = np.array(labels)
features = np.array(features)

### RFECV method
### RFECV method and try 4 different classifier methods: logistic regression,
### Decision Tree, Random Forest and Adaptive boosting

### logistic regression
clf_Log = LogisticRegression(random_state = 14, C= 5, class_weight='balanced')
selectorCV_Log = RFECV(clf_Log, step=1, cv=5, scoring = 'f1')
selectorCV_Log.fit(features, labels)
refcv_figure(selectorCV_Log)

# Estimator fitted by RFECV on the selected features, and the reduced matrix.
clf = selectorCV_Log.estimator_
features_new = selectorCV_Log.transform(features)
test_clf(clf, labels, features_new, folds = 1000)

param_grid = {"C": [0.01, 0.1, 1, 5, 10, 100, 1000],
              "penalty" : ['l1', 'l2']
              }
clf_Log_searchCV = GridSearchCV(clf, param_grid, scoring ='f1', cv=10)
# NOTE(review): the search is fitted on the full `features` matrix while the
# best estimator is evaluated on `features_new` below - confirm this is intended.
clf_Log_searchCV.fit(features, labels)
# Parenthesised so the statement is valid on Python 3 as well as Python 2.
print(clf_Log_searchCV.best_estimator_)
test_clf(clf_Log_searchCV.best_estimator_, labels, features_new, folds = 1000)

### Decision Tree
clf_DT = DecisionTreeClassifier(random_state= 32, class_weight='balanced')