X.info()

# Target column, kept as a one-column DataFrame.
y1 = data[['effect']]

# ---------------------------------------------------------------------------
# Prediction with feature boosting
# ---------------------------------------------------------------------------
# An XGBoost classifier assigns every feature an importance score.
# SelectFromModel (with its default threshold) then keeps the features whose
# score is at least the mean importance.
model = XGBClassifier()
model.fit(X, y1.values.ravel())
print('feature_importances of all')
print(model.feature_importances_)

selection = SelectFromModel(model, prefit=True)
# transform() returns a bare array; wrap it back into a DataFrame
# (column names are not carried over).
select_X = pd.DataFrame(selection.transform(X))
print(select_X)

y1.info()
print(y1)

# Hold out 20% of the rows for testing (train : test = 4 : 1).
X_train, X_test, y_train, y_test = train_test_split(
    select_X, y1, test_size=0.2, random_state=0)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
# Feature scaling / standardisation: tree-based models do not depend on the
# scale of the features, so the scaling below is left disabled.
#stdsc=StandardScaler()
#X_train_std=stdsc.fit_transform(X_train)
#X_test_std=stdsc.fit_transform(X_test)

# Estimate feature importance with a random forest.
feat_labels = df_all.columns[1:-1]
forest = RandomForestClassifier(n_estimators=4, max_depth=10, n_jobs=-1, random_state=0)
forest.fit(X_train, y_train)
score = forest.score(X_test, y_test)
forest_y_score = forest.predict_proba(X_test)
importances = forest.feature_importances_

# Keep only the features whose importance reaches the 0.004 threshold.
model = SelectFromModel(forest, threshold=0.004, prefit=True)
X_new = model.transform(X_train)
# NOTE(review): this refits the very estimator that `model` wraps, so later
# calls to model.transform() would see the reduced-feature forest -- confirm
# that this is intended. `importances` was captured before the refit.
forest.fit(X_new, y_train)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    # Importance is the impurity decrease averaged over the trees of the forest.
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))

# Visualise the importances (mean impurity decrease), most important first.
plt.title('Feature Importance-RandomForest')
plt.bar(range(X_train.shape[1]), importances[indices], color='lightblue', align='center')
# The bars are sorted by importance, so the tick labels must be reordered with
# the same permutation; otherwise each bar is captioned with the wrong feature.
plt.xticks(range(X_train.shape[1]), feat_labels[indices], rotation=90)
# Settings shared by every AdaBoost ensemble; only the base estimator differs.
_booster_kwargs = dict(algorithm='SAMME', learning_rate=1, random_state=55)
madelon_booster = AdaBoostClassifier(base_estimator=madelon_base, **_booster_kwargs)
adult_booster = AdaBoostClassifier(base_estimator=adult_base, **_booster_kwargs)
OF_booster = AdaBoostClassifier(base_estimator=OF_base, **_booster_kwargs)

# Madelon pipeline: scale, then four successive median-threshold culls
# ('Cull1'..'Cull4', random forests seeded 1..4), then boost.
pipeM = Pipeline(
    [('Scale', StandardScaler())]
    + [('Cull%d' % seed,
        SelectFromModel(RandomForestClassifier(random_state=seed), threshold='median'))
       for seed in range(1, 5)]
    + [('Boost', madelon_booster)])

# Adult pipeline: scale and boost, no feature culling.
pipeA = Pipeline([('Scale', StandardScaler()), ('Boost', adult_booster)])
# madelon_clf = basicResults(pipeM, madelon_trgX, madelon_trgY, madelon_tstX,
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

# Iris feature matrix and class labels.
X, Y = load_iris(return_X_y=True)
print(X.shape)

# Fit an extra-trees ensemble and show the importance it gives each feature.
model = ExtraTreesClassifier(n_estimators=50).fit(X, Y)
print(model.feature_importances_)

# Reuse the fitted ensemble (prefit=True) to drop the less important columns.
sfModel = SelectFromModel(model, prefit=True)
X1 = sfModel.transform(X)
print(X1.shape)
print(X1)
lr.fit(X_train_selected, y_train)
print("Score with only selected features: %f" % lr.score(X_test_selected, y_test))

# %% [markdown] {"deletable": true, "editable": true}
# ### Model-based Feature Selection
# A somewhat more sophisticated method for feature selection is to use a supervised machine learning model and select features based on how important the model deemed them. This requires the model to provide some way to rank the features by importance. Tree-based models do so through their ``feature_importances_`` attribute, and for linear models the coefficients can be used to determine how much influence a feature has on the outcome.
#
# Any of these models can be made into a transformer that does feature selection by wrapping it with the ``SelectFromModel`` class:

# %% {"deletable": true, "editable": true}
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
select = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold="median")

# %% {"deletable": true, "editable": true}
# fit() returns the selector itself, so fitting and transforming can be chained.
X_train_rf = select.fit(X_train, y_train).transform(X_train)
print(X_train.shape)
print(X_train_rf.shape)

# %% {"deletable": true, "editable": true}
mask = select.get_support()
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')

# %% {"deletable": true, "editable": true}
X_test_rf = select.transform(X_test)
# "PRI.ACTIVE.ACCTS", "PRI.OVERDUE.ACCTS", "PRI.CURRENT.BALANCE", "PRI.SANCTIONED.AMOUNT", # "PRI.DISBURSED.AMOUNT", "SEC.NO.OF.ACCTS", "SEC.ACTIVE.ACCTS", "SEC.OVERDUE.ACCTS", "SEC.CURRENT.BALANCE", # "SEC.SANCTIONED.AMOUNT", "SEC.DISBURSED.AMOUNT", "PRIMARY.INSTAL.AMT", "SEC.INSTAL.AMT", # "NEW.ACCTS.IN.LAST.SIX.MONTHS", "DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS"] # model.get_booster().feature_names = x_FN # plot_importance(model.get_booster()) # plt.show() # print(thresholds) n = 0 b_acc = 0 thresholds = np.sort(model.feature_importances_) for thresh in thresholds: selection = SelectFromModel(model, threshold=thresh, prefit=True) select_x_train = selection.transform(x_train) selection_model = XGBClassifier() selection_model.fit(select_x_train, y_train) select_x_test = selection.transform(x_test) y_predict = selection_model.predict(select_x_test) acc = selection_model.score(select_x_test, y_test) acc_score = accuracy_score(y_test, y_predict) if acc > b_acc: n = select_x_train.shape[1] b_acc = acc L_selection = selection print("Thresh=%.3f, n=%d, acc: %.15f%%, acc_score: %.15f%%" %
trainRows = [] for idx in trainidx: trainRows.append(orgMatrix[(mc + nmc)[idx]]) trainingMat = np.array(trainMatrix) # Build a forest and compute the feature importances forest = ExtraTreesClassifier(n_estimators=5, max_features=expectedFeatureCount) #forest = LinearSVC() forest.fit(trainingMat, trainRes) model = SelectFromModel(forest, prefit=True) X_new = model.transform(Dall) print(X_new.shape) print("Training") for org in mc + nmc: if not org in trainOrgs: continue orgidx = (mc + nmc).index(org) predClass = forest.predict(Dall[orgidx, ]) predProbs = 0 #forest.predict_proba(Dall[orgidx,])
# Tree-model-based feature selection
from sklearn.datasets import load_iris
iris = load_iris()
ix, iy = iris.data, iris.target

from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
model1 = ExtraTreesClassifier()
model2 = GradientBoostingClassifier()
model1.fit(ix, iy)
model2.fit(ix, iy)
# Bare expressions below only display a value in an interactive session.
model1.feature_importances_
model2.feature_importances_
# Wrap the already-fitted models (prefit=True) as feature selectors.
clf1 = SelectFromModel(model1, prefit=True)
clf2 = SelectFromModel(model2, prefit=True)
clf1.get_support()
clf2.get_support()

#---
# scikit-learn cross-validation helpers.
# These live in sklearn.model_selection; the old sklearn.cross_validation
# module was removed from scikit-learn and importing it raises ImportError.
from sklearn.model_selection import cross_val_score
#cross_val_score(model, X, y, cv=10)
from sklearn.model_selection import cross_val_predict
#cross_val_predict(model, X, y, cv=10)
from sklearn.model_selection import LeaveOneOut
# In the model_selection API LeaveOneOut takes no sample count.
#scores = cross_val_score(model, X, y, cv=LeaveOneOut())

# ---
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import roc_auc_score
import h5py

# The HDF5 file stays open on purpose: the objects read from it below are
# used directly by the rest of the script.
file = h5py.File('/Users/Tina/Documents/ml_proj1_data.h5', 'r')
file.keys()
xtest = file['xtest']
xtrain = file['xtrain']
xval = file['xval']
ytrain = file['ytrain']
yval = file['yval']

#______________________________________Feature selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# An L1-penalised linear SVC is fitted on the training split only; the
# features it retains are then selected from all three splits.
lsvc = LinearSVC(C=0.05, penalty="l1", dual=False).fit(xtrain, ytrain)
model = SelectFromModel(lsvc, prefit=True)
X_train_new = model.transform(xtrain)
X_val_new = model.transform(xval)
X_test_new = model.transform(xtest)

#_______________________________________scale all inputs : train,val & test for svm
from sklearn import preprocessing
import numpy as np

# Fit the scaler on the training split only and reuse it for validation and
# test. Fitting a separate scaler per split (as before) divides each split by
# a different per-feature maximum, so the same raw value maps to different
# scaled values and the model sees inconsistently scaled inputs.
# Training data lands in [-1, 1]; validation/test values can fall slightly
# outside that range when they exceed the training maximum.
max_abs_scaler = preprocessing.MaxAbsScaler()
xtrainScaled = max_abs_scaler.fit_transform(X_train_new)
# Kept as aliases so any later reference to these names still resolves.
max_abs_scaler2 = max_abs_scaler3 = max_abs_scaler
xvalScaled = max_abs_scaler.transform(X_val_new)
xtestScaled = max_abs_scaler.transform(X_test_new)
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Synthetic data: 200 samples of 20 binary features. The label is the XOR of
# the first two features.
rng = np.random.RandomState(1)
X = rng.randint(0, 2, (200, 20))
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

# Two competing selectors: top-10 univariate scores versus a random forest
# that keeps the features at or above the median importance.
fs_univariate = SelectKBest(k=10)
fs_modelbased = SelectFromModel(
    RandomForestClassifier(n_estimators=100),
    threshold='median',
)

fs_univariate.fit(X, y)
print('Features selected by univariate selection:')
print(fs_univariate.get_support())
print('')

fs_modelbased.fit(X, y)
print('Features selected by model-based selection:')
print(fs_modelbased.get_support())
if key == 'xgboost': model = XGBClassifier(**params_XGboost[str(i)]) elif key == 'catboost': model = CatBoostClassifier(**paramsCatBoost[str(i)]) y = target.iloc[:, i] train_X, val_X, train_y, val_y = \ train_test_split(X_train, y, random_state=SEED, shuffle=True) model.fit(train_X.values, train_y.values) perm = PermutationImportance(model, cv=5, scoring='roc_auc', random_state=SEED) perm.fit(val_X.values, val_y.values) sel = SelectFromModel(perm, threshold=value['threshold'], prefit=True) X_train_transformed = sel.transform(X_train) X_test_transformed = sel.transform(X_test) prediction, cv_scores_mean = train_and_predict(model, X_train_transformed, y.values, X_test_transformed, cv) cv_scores.append(cv_scores_mean) predictions.append(prediction) print(round(np.array(cv_scores).mean(), 5)) write_to_submission_file(predictions, ID, value['filename']) #simple blending
def rate_to_labels(rates):
    """Bin rates into integer classes 1..5 of equal width.

    The bin edges are multiples of ``(max(rates) + min(rates)) / 5``. A rate
    below the first edge is class 1 and a rate at or above the fourth edge is
    class 5, so every rate receives exactly one label and the result always
    has the same length as ``rates``.

    Args:
        rates: non-empty 1-D numeric array of rates.

    Returns:
        A list of ints in 1..5, one per entry of ``rates``.

    Raises:
        ValueError: if ``rates`` is empty.
    """
    total = max(rates) + min(rates)
    if total <= 0:
        # Degenerate range (no positive spread to divide): single class.
        return [1] * len(rates)
    edges = [k * total / 5 for k in range(1, 5)]
    # digitize() returns how many edges are <= the value (0..4).
    return (np.digitize(rates, edges) + 1).tolist()


def main():
    """Train or load a random forest on the selfie dataset and report on it.

    Reads 'selfie_dataset.txt', bins the 'Rate' column into five classes,
    splits the attribute columns 90/10 into train and test sets, obtains a
    RandomForestClassifier (freshly trained and dumped to 'classifier.pkl',
    or loaded from it, depending on the flags below) and prints the mean
    absolute error, train/test accuracy and the ranked feature importances.
    """
    data = pd.read_csv(
        'selfie_dataset.txt',
        sep=" ",
        header=None,
        names=[
            "Nome", "Rate", "partial_faces", "is_female", "baby", "child",
            "teenager", "youth", "middle_age", "senior", "white", "black",
            "asian", "oval_face", "round_face", "heart_face", "smiling",
            "mouth_open", "frowning", "wearing_glasses", "wearing_sunglasses",
            "wearing_lipstick", "2tongue_out0", "duck_face", "black_hair",
            "blond_hair", "brown_hair", "red_hair", "curly_hair",
            "straight_hair", "braid_hair", "showing_cellphone",
            "using_earphone", "using_mirror", "wearing_hat", "braces",
            "harsh_lighting", "dim_lighting"
        ])

    # One label per row. The previous if/elif chain used a /3 edge for the
    # first bin and silently skipped rows outside the bins, leaving fewer
    # labels than feature rows.
    labels = rate_to_labels(np.array(data['Rate']))

    # Everything except the file name and the target is a feature.
    feature_frame = data.drop(["Rate", "Nome"], axis=1)
    feature_list = list(feature_frame.columns)
    features = np.array(feature_frame)

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.1, random_state=0)
    print('The shape of our train_features is:', train_features.shape)
    print('The shape of our test_features is:', test_features.shape)

    # Run-mode switches: isTrained=False trains from scratch; otherwise the
    # saved model is loaded and, if retrain is set, used to prune features
    # before a new forest is trained on the reduced set.
    isTrained = False
    min_importance = 0.04
    n_estimators = 200
    retrain = True

    if isTrained:
        # NOTE: joblib.load unpickles arbitrary objects; only load a
        # classifier.pkl that this script (or another trusted source) wrote.
        if retrain:
            crf = joblib.load("classifier.pkl")
            selector = SelectFromModel(crf, threshold=min_importance)
            selector.fit(train_features, train_labels)
            train_features = selector.transform(train_features)
            test_features = selector.transform(test_features)
            # Keep the names aligned with the surviving columns so the
            # importance report below pairs each score with the right feature.
            feature_list = [
                name
                for name, keep in zip(feature_list, selector.get_support())
                if keep
            ]
            print('The shape of our important_train_features is:',
                  train_features.shape)
            print('The shape of our important_test_features is:',
                  test_features.shape)
            rf = RandomForestClassifier(n_estimators=n_estimators,
                                        random_state=1)
            rf.fit(train_features, train_labels)
        else:
            rf = joblib.load("classifier.pkl")
    else:
        rf = RandomForestClassifier(n_estimators=n_estimators,
                                    criterion="entropy",
                                    random_state=2)
        rf.fit(train_features, train_labels)
        joblib.dump(rf, 'classifier.pkl')

    print(rf)
    print("\n\n")
    predictions = rf.predict(test_features)
    importances = list(rf.feature_importances_)

    print('Mean Absolute Error:', mean_absolute_error(test_labels, predictions))
    # score() returns a fraction in [0, 1]; scale it to match the '%' suffix.
    print('Train Accuracy:', 100 * rf.score(train_features, train_labels), '%')
    print('Test Accuracy:', 100 * rf.score(test_features, test_labels), '%')
    print("\n\n")

    print("Importances: \n")
    feature_importances = sorted(
        ((feature, round(importance, 4))
         for feature, importance in zip(feature_list, importances)),
        key=lambda pair: pair[1],
        reverse=True)
    for pair in feature_importances:
        print('{} : {}'.format(*pair))
    print()
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score from sklearn.neural_network import MLPClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.svm import SVC from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score # Path names main_path = root_path byte1g_matrix_path = os.path.join(main_path, byte1g_matrix_path) byte2g_matrix_path = os.path.join(main_path, byte2g_matrix_path) byte3g_matrix_path = os.path.join(main_path, byte3g_matrix_path) rfc = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1) sfm = SelectFromModel(rfc, threshold=2e-5) def roc_auc_score_multiclass(actual_class, pred_class, average="macro"): # creating a set of all the unique classes using the actual class list unique_class = set(actual_class) roc_auc_dict = {} for per_class in unique_class: # creating a list of all the classes except the current class other_class = [x for x in unique_class if x != per_class] # marking the current class as 1 and all other classes as 0 new_actual_class = [0 if x in other_class else 1 for x in actual_class] new_pred_class = [0 if x in other_class else 1 for x in pred_class] # using the sklearn metrics method to calculate the roc_auc_score
# print results
print('accuracy =', acc)
print(cr)
print('confusion matrix:')
print(cm)

## plot results
# Heat map of the confusion matrix with each cell's count written on top.
thresh = cm.max() / 2
cmdf = DataFrame(cm, index=['NoPain', 'Pain'], columns=['NoPain', 'Pain'])
sns.heatmap(cmdf, cmap='RdBu_r')
plt.xlabel('Predicted')
plt.ylabel('Observed')
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    # +0.5 moves the text from the cell corner to the cell centre.
    plt.text(j + 0.5, i + 0.5, format(cm[i, j], 'd'),
             horizontalalignment="center", color="white")

# Features kept by the already-fitted logistic regression (default threshold).
model = SelectFromModel(logreg, prefit=True)
X_new = model.transform(X)
print(X_new.shape)

# Recursive feature elimination down to a single feature gives a full ranking.
# n_features_to_select must be passed by keyword: current scikit-learn
# releases reject it as a positional argument.
selector = RFE(logreg, n_features_to_select=1)
selector = selector.fit(X_train, y_train)
selector.support_
order = selector.ranking_
order
print(order)
_, _, accuracy3 = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy3 * 100))

# In[ ]:

# Training curves recorded in the fit history.
plt.plot(history.history['mean_squared_error'])
plt.show()
plt.plot(history.history['accuracy'], color='red')
plt.show()

# ## Decision Tree

# In[ ]:

# Keep the columns a 100-tree random forest rates at or above the selector's
# default importance threshold.
sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
sel.fit(X, y)
sel.get_support()
selected_columns = X.columns[(sel.get_support())]
X_new = X[selected_columns]
length = len(selected_columns)

# In[ ]:

# Distribution of the fitted forest's feature importances.
pd.Series(sel.estimator_.feature_importances_.ravel()).hist()

# In[ ]:

scatter = plt.scatter(x=X_new['MWG'], y=X_new['NWG'], c=y, cmap='rainbow')
# Build the legend from the colour mapping of the scatter itself. Passing `y`
# as the label list (as before) labels the single scatter artist with just the
# first target value instead of explaining the colours.
plt.legend(*scatter.legend_elements(), prop={'size': 5})
] # For undersample # Create a random forest classifier clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1) # Train the classifier clf.fit(X_train_undersample, y_train_undersample) # Print the name and gini importance of each feature for feature in zip(feat_labels, clf.feature_importances_): print(feature) # Create a selector object that will use the random forest classifier to identify # features that have an importance of more than 0.15 sfm = SelectFromModel(clf, threshold=0.15) # Train the selector sfm.fit(X_train_undersample, y_train_undersample) # Print the names of the most important features for feature_list_index in sfm.get_support(indices=True): print(feat_labels[feature_list_index]) # Transform the data to create a new dataset containing only the most important features # Note: We have to apply the transform to both the training X and test X data. X_important_train_undersample = sfm.transform(X_train_undersample) X_important_test_undersample = sfm.transform(X_test_undersample) # Create a new random forest classifier for the most important features clf_important = RandomForestClassifier(n_estimators=10000,
y_train = y_df[msk] x_train = X_df[msk] y_test = y_df[~msk] x_test = X_df[~msk] ''' train = df[msk] test = df[~msk] y_train = train.iloc[:, 0] x_train = train.iloc[:, 1:] y_test = test.iloc[:, 0] x_test = test.iloc[:, 1:] ''' # 101011110111 2048 rf = RandomForestClassifier(max_depth=100, n_estimators=1000) embeded_rf_selector = SelectFromModel(rf, max_features=2048) embeded_rf_selector.fit(X, y) embeded_rf_support = embeded_rf_selector.get_support() embeded_rf_feature = X.loc[:, embeded_rf_support].columns.tolist() print(str(len(embeded_rf_feature)), 'selected features') ''' rf.fit(x_train, y_train.values.ravel()) y_pred_rf = rf.predict(x_test) predictions_rf = y_pred_rf accuracy_rf = accuracy_score(y_test, predictions_rf) f1_rf = f1_score(y_test, predictions_rf) precision_rf = precision_score(y_test, predictions_rf) print(' Feature Set | Accuracy | F1 measure | Precesion') print('RandomForest |', accuracy_rf, '|', f1_rf, '|', precision_rf, '|', )