def model_feature_selection(clf_c, best_clf, best_predictions, X_train, y_train, X_test, y_test):
    # Train the supervised model on the training set using .fit(X_train, y_train)
    model = clf_c.fit(X_train, y_train)

    # Extract the feature importances using .feature_importances_
    importances = model.feature_importances_

    # Plot
    vs.feature_plot(importances, X_train, y_train)

    # Reduce the feature space to the five most important features
    X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
    X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]]

    # Train on the "best" model found from grid search earlier, timing the fit
    fit_start = dt.datetime.now()
    clf = (clone(best_clf)).fit(X_train_reduced, y_train)
    fit_end = dt.datetime.now()
    fit_time = fit_end - fit_start

    # Make new predictions, timing the prediction step
    pred_start = dt.datetime.now()
    reduced_predictions = clf.predict(X_test_reduced)
    pred_end = dt.datetime.now()
    pred_time = pred_end - pred_start

    # Report scores from the final model using both versions of the data
    print("Final Model trained on full data\n------")
    print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))
    print("\nFinal Model trained on reduced data\n------")
    print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, reduced_predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, reduced_predictions, beta=0.5)))
    print("Time taken for training: {0}".format(fit_time))
    print("Time taken for predicting: {0}".format(pred_time))
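# Usage sketch for model_feature_selection -- an illustration, not project code.
# Assumes the project's `vs` visuals helper is importable; the synthetic data and
# the GradientBoostingClassifier stand-ins below are assumptions for demonstration.
import datetime as dt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, fbeta_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=["f{}".format(i) for i in range(10)])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Stand-in for the grid-search winner produced elsewhere in this notebook
best_clf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
best_predictions = best_clf.predict(X_test)
model_feature_selection(GradientBoostingClassifier(random_state=0), best_clf,
                        best_predictions, X_train, y_train, X_test, y_test)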
def modelfit(alg, dtrain, y_train, dtest, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Use xgboost's built-in CV to pick the number of boosting rounds
        xgb_parameters = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain.values, label=y_train.values)
        cvresult = xgb.cv(xgb_parameters, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='rmse',
                          early_stopping_rounds=early_stopping_rounds,
                          show_stdv=False)
        print(cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the training data and predict both splits
    alg.fit(dtrain, y_train, eval_metric='rmse')
    dtrain_prediction = alg.predict(dtrain)
    dtest_prediction = alg.predict(dtest)

    # Print model report
    print("\nModel Report")
    print("Train RMSE : %.4g" % (mean_squared_error(y_train.values, dtrain_prediction) ** 0.5))
    print("Test RMSE : %.4g" % (mean_squared_error(y_test.values, dtest_prediction) ** 0.5))

    # feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    # feat_imp.plot(kind='bar', title='Feature Importance')
    # plt.ylabel('Feature Importance Score')
    # plt.show()
    plot_importance(alg)
    plt.show()

    importances = alg.feature_importances_
    vs.feature_plot(importances, dtrain, y_train)

    return dtrain_prediction, dtest_prediction
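# Usage sketch for modelfit -- illustrative only. The synthetic regression data
# and XGBRegressor settings are assumptions; `vs.feature_plot` is the project's
# own visuals helper, and the eval_metric fit kwarg assumes a pre-2.0 xgboost.
import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=400, n_features=8, noise=0.1, random_state=0)
X = pd.DataFrame(X, columns=["f{}".format(i) for i in range(8)])
y = pd.Series(y)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

reg = xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=4)
train_pred, test_pred = modelfit(reg, X_tr, y_tr, X_te, y_te, cv_folds=5)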
def Model_Tuning(features_train, labels_train, features_test, labels_test):
    """
    Perform a grid search optimization for the model over the entire training set
    (features_train and labels_train) by tuning at least one parameter to improve
    upon the untuned model's F-score.
    """
    clf = DecisionTreeClassifier()
    parameters = {'min_samples_split': [2, 4, 6, 8],
                  'min_samples_leaf': [1, 2, 3, 4]}
    scorer = make_scorer(fbeta_score, beta=0.5)
    grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
    grid_fit = grid_obj.fit(features_train, labels_train)
    best_clf = grid_fit.best_estimator_
    predictions = (clf.fit(features_train, labels_train)).predict(features_test)
    best_predictions = best_clf.predict(features_test)

    # Report the before-and-after scores
    print("Unoptimized model\n------")
    print("Accuracy score on testing data: {:.4f}".format(accuracy_score(labels_test, predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(labels_test, predictions, beta=0.5)))
    print("\nOptimized Model\n------")
    print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(labels_test, best_predictions)))
    print("Final F-score on the testing data: {:.4f}".format(fbeta_score(labels_test, best_predictions, beta=0.5)))

    # Feature relevance observation
    model = ExtraTreesClassifier()
    model.fit(features_train, labels_train)
    importances = model.feature_importances_
    vs.feature_plot(importances, features_train, labels_train)

    # Feature selection: keep only the five most important features
    X_train_reduced = features_train[features_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
    X_test_reduced = features_test[features_test.columns.values[(np.argsort(importances)[::-1])[:5]]]

    # Train on the "best" model found from grid search earlier
    clf = (clone(best_clf)).fit(X_train_reduced, labels_train)

    # Make new predictions
    reduced_predictions = clf.predict(X_test_reduced)

    # Report scores from the final model using both versions of the data
    print("Final Model trained on full data\n------")
    print("Accuracy on testing data: {:.4f}".format(accuracy_score(labels_test, best_predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(labels_test, best_predictions, beta=0.5)))
    print("\nFinal Model trained on reduced data\n------")
    print("Accuracy on testing data: {:.4f}".format(accuracy_score(labels_test, reduced_predictions)))
    print("F-score on testing data: {:.4f}".format(fbeta_score(labels_test, reduced_predictions, beta=0.5)))
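# Usage sketch for Model_Tuning -- a minimal, self-contained example. The
# synthetic dataset is an assumption, and the project's `vs` visuals helper
# must be importable for the feature plot inside the function.
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=600, n_features=12, random_state=42)
X = pd.DataFrame(X, columns=["f{}".format(i) for i in range(12)])
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)
Model_Tuning(X_tr, y_tr, X_te, y_te)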
            i = i + 1
            j = 0
        else:
            j += 1

# TODO: Import a supervised learning model that has 'feature_importances_'
clf = GradientBoostingClassifier(random_state=1990)

# TODO: Train the supervised model on the training set
model = clf.fit(X_train, y_train)

# TODO: Extract the feature importances
importances = model.feature_importances_

# Plot
vs.feature_plot(importances, X_train, y_train)

# Import functionality for cloning a model
from sklearn.base import clone

# Reduce the feature space to the five most important features
X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]]

# Train on the "best" model found from grid search earlier
clf = (clone(best_clf)).fit(X_train_reduced, y_train)

# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)
model = AdaBoostClassifier(random_state=0)

# Evaluate on the label-encoded (LE) features
model = model.fit(X_train_LE, y_train_LE)
y_pred_LE = model.predict(X_test_LE)
LE_score_acc = accuracy_score(y_test_LE, y_pred_LE)
LE_score_fbeta = fbeta_score(y_test_LE, y_pred_LE, beta=0.5)
print("LE Acc: ", LE_score_acc)
print("LE f: ", LE_score_fbeta)

# Evaluate on the one-hot-encoded (OHE) features
model.fit(X_train_OHE, y_train_OHE)
y_pred_OHE = model.predict(X_test_OHE)
OHE_score_acc = accuracy_score(y_test_OHE, y_pred_OHE)
OHE_score_fbeta = fbeta_score(y_test_OHE, y_pred_OHE, beta=0.5)
print("OHE Acc: ", OHE_score_acc)
print("OHE f: ", OHE_score_fbeta)

feature_importances = model.feature_importances_
vs.feature_plot(feature_importances, X_train_OHE, y_train_OHE)

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score

# TODO: Initialize the classifier
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3))

# TODO: Create the parameters list you wish to tune, using a dictionary if needed.
# HINT: parameters = {'parameter_1': [value1, value2], 'parameter_2': [value1, value2]}
parameters = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1, 1]}

# TODO: Make an fbeta_score scoring object using make_scorer()
scorer = make_scorer(fbeta_score, beta=0.5)

# TODO: Perform grid search on the classifier using 'scorer' as the scoring method using GridSearchCV()
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
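# Continuation sketch (an assumption mirroring the grid-search cells elsewhere
# in this notebook): fit the grid object on the one-hot-encoded training split
# and pull out the best estimator and its predictions for the next cell.
grid_fit = grid_obj.fit(X_train_OHE, y_train_OHE)
best_clf = grid_fit.best_estimator_
best_predictions = best_clf.predict(X_test_OHE)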
# GridSearchCV stores per-candidate results in cv_results_ (grid_scores_ was
# removed in scikit-learn 0.20); 'mean_test_score' is the validation metric.
df = pd.DataFrame(grid_fit.cv_results_).sort_values('mean_test_score', ascending=False).tail()
print(df)

# Import a supervised learning model that has 'feature_importances_'
from sklearn.tree import DecisionTreeClassifier

# Train the supervised model on the training set
model = DecisionTreeClassifier(criterion="gini", random_state=0)
model.fit(X_train, y_train['>50K'])

# Extract the feature importances
importances = model.feature_importances_

# Plot
vs.feature_plot(importances, X_train, y_train['>50K'])

# Import functionality for cloning a model
from sklearn.base import clone

# Reduce the feature space to the five most important features
X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(np.argsort(importances)[::-1])[:5]]]

# Train on the "best" model found from grid search earlier
clf = (clone(best_clf)).fit(X_train_reduced, y_train['>50K'])

# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)
# Report the before-and-after scores
print("\nUnoptimized model")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F1 score on testing data: {:.4f}".format(f1_score(y_test, predictions)))
print("\nOptimized Model")
print("Accuracy on testing data: {:.4f}".format(accuracy_score(y_test, optimal_predictions)))
print("F1 score on testing data: {:.4f}".format(f1_score(y_test, optimal_predictions)))
print("\nThe optimized configuration of the decision tree:")
print(optimal_clf)

#%% Finding the top important features in the model
top_features = clf.feature_importances_
vs.feature_plot(top_features, X_train, y_train)

#%% Train a new model with only the top five features
X_train_reduced = X_train[X_train.columns.values[(np.argsort(top_features)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(np.argsort(top_features)[::-1])[:5]]]

# Reuse the previous optimal parameters and train with the top features
clf = (clone(optimal_clf)).fit(X_train_reduced, y_train)

# New predictions
new_predictions = clf.predict(X_test_reduced)

# Report scores from the final model using both versions of the data
print("Model trained on full data")
def plot_learn_curve(X_train, y_train, X_test, y_test, reglist):
    for e in reglist:
        print(e)
        e.fit(X_train, y_train)
        print("Regressor R2 score on the test set: {:.4f}".format(e.score(X_test, y_test)))
        print('size of the test set (x,y)', np.shape(X_test), np.shape(y_test))

        # Use learning_curve imported above to create learning curves for both
        # the training data and testing data
        train_sizes, train_scores, test_scores = learning_curve(
            e, X_train, y_train, cv=KFold(n_splits=10),
            scoring=make_scorer(r2_score),
            train_sizes=np.linspace(.1, 1, 20), n_jobs=8)

        # Plot the training and testing curves with shaded std-deviation bands
        plt.figure(figsize=(10, 7))
        train_scores_mean = np.mean(train_scores, axis=1)
        train_scores_std = np.std(train_scores, axis=1)
        test_scores_mean = np.mean(test_scores, axis=1)
        test_scores_std = np.std(test_scores, axis=1)
        plt.grid()
        plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1, color="r")
        plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1, color="g")
        plt.plot(train_sizes, train_scores_mean, color="r", label="Training score")
        plt.plot(train_sizes, test_scores_mean, color="g", label="Cross-validation score")

        # Plot aesthetics
        plt.ylim(-1.1, 1.1)
        plt.ylabel("R2 Score")
        plt.xlabel("Training Points")
        plt.legend(bbox_to_anchor=(1.0, 1.15))
        plt.show()

        try:
            importances = e.feature_importances_
            vs.feature_plot(importances, X_train, y_train)
        except AttributeError:
            print('No feature importance available for this learner')
        print('')
    return
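# Usage sketch for plot_learn_curve -- illustrative only. The synthetic data and
# the two regressors are assumptions (any scikit-learn regressor list works),
# and the project's `vs` helper is assumed importable for the feature plot.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import KFold, learning_curve, train_test_split

X, y = make_regression(n_samples=500, n_features=6, noise=5.0, random_state=0)
X = pd.DataFrame(X, columns=["f{}".format(i) for i in range(6)])
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
plot_learn_curve(X_tr, y_tr, X_te, y_te,
                 [Ridge(alpha=1.0),
                  RandomForestRegressor(n_estimators=50, random_state=0)])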
# Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train_reduced, y_train.values.ravel())

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train_reduced, y_train.values.ravel())).predict(X_test_reduced)
best_predictions = best_clf.predict(X_test_reduced)

# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test.values.ravel(), predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test.values.ravel(), predictions, beta=0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test.values.ravel(), best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test.values.ravel(), best_predictions, beta=0.5)))

# Train the supervised model on the training set
new_clf = AdaBoostClassifier()
model = new_clf.fit(X_train, y_train.values.ravel())

# Extract the feature importances
importances = model.feature_importances_

# Plot
vs.feature_plot(importances, X_train, y_train.values.ravel())
def plot_shuffle_split_score(features, labels, features2, labels2, reglist, n_Splits, earlyStopRounds):
    sscv = ShuffleSplit(n_splits=n_Splits, test_size=.25, random_state=None)
    for e in reglist:
        score_l = []
        print('-' * 100)
        print('-' * 100)
        print(e)
        i = 0
        for train_index, test_index in sscv.split(features):
            print('-' * 100)
            print('ShuffledSplit iteration {} of {}'.format(i + 1, n_Splits))
            i += 1
            X_train, X_test = features.loc[train_index], features.loc[test_index]
            y_train, y_test = labels.loc[train_index], labels.loc[test_index]
            # Change column vector to 1d array to avoid a conversion warning at regressor.fit()
            y_train = y_train.values.ravel()
            regressor = e
            test_set = [(X_test, y_test), (features2, labels2)]
            # test_set = [(features2, labels2)]
            start = time.time()
            # This branch assumes XGBoost-style fit kwargs (eval_set, eval_metric,
            # early_stopping_rounds), i.e. a pre-2.0 xgboost API
            if type(e).__name__ in ("XGBRegressor", "MLPRegressor"):
                if earlyStopRounds > 0:
                    regressor.fit(X_train, y_train,
                                  early_stopping_rounds=earlyStopRounds,
                                  eval_metric='rmse', eval_set=test_set,
                                  verbose=False)
                    elapsed = time.time() - start
                    # Plot the regression error per boosting round; evals_result()
                    # is only populated when an eval_set was passed to fit()
                    results = regressor.evals_result()
                    epochs = len(results['validation_0']['rmse'])
                    x_axis = range(0, epochs)
                    fig, ax = plt.subplots()
                    ax.plot(x_axis, results['validation_0']['rmse'], label='Validation')
                    ax.plot(x_axis, results['validation_1']['rmse'], label='Test')
                    ax.legend()
                    plt.xlabel('number of epochs')
                    plt.ylabel('Regression RMSE')
                    plt.title('XGBReg. RMSE')
                    plt.show()
                else:
                    print('earlyStop disabled')
                    regressor.fit(X_train, y_train, eval_metric='rmse')
                    elapsed = time.time() - start
            else:
                regressor.fit(X_train, y_train)
                elapsed = time.time() - start
            print("time to fit: %f" % elapsed)
            score = regressor.score(X_test, y_test)
            score_l.append(score)
            print('')
            print("Regressor R2 score on the validation set: {:.4f}".format(score))
            print('-' * 70)
            print('size of the training set (features, labels)', np.shape(X_train), np.shape(y_train))
            print('size of the validation set (features, labels)', np.shape(X_test), np.shape(y_test))
            print('size of the test set (features, labels)', np.shape(features2), np.shape(labels2))
            print('-' * 70)
            print('Variance of the train/valid. set: {}'.format(labels['sr_highres'].var()))
            print('Variance of the test set: {}'.format(labels2['sr_highres'].var()))
            print('')

        # Predictions on the full train/validation data
        preds = regressor.predict(features)
        preds = pd.DataFrame(preds)
        preds.rename(columns={0: 'sr_predicted'}, inplace=True)

        plt.figure(figsize=(6, 3))
        plt.plot(score_l)
        plt.ylabel("R2 Score")
        plt.xlabel("number of ShuffleSplits")
        plt.show()

        fig = plt.figure(figsize=(18, 3))
        labels['sr_highres'].plot()
        preds['sr_predicted'].plot()
        plt.ylim(-4, 4)
        plt.title('validation data: labels vs. predictions')
        plt.legend(loc='best')
        plt.show()

        print('validation set r2 score:', r2_score(labels['sr_highres'], preds['sr_predicted']))
        print('validation set mean squared error: {0:.2f}%'.format(
            mean_squared_error(labels['sr_highres'], preds['sr_predicted']) * 100))
        lmean = labels['sr_highres'].mean()
        predmean = preds['sr_predicted'].mean()
        devmean = -100 / lmean * (lmean - predmean)
        print('sr mean: {} | predicted mean: {} | pred. deviation from sr: {}%'.format(lmean, predmean, devmean))

        # Predictions on the held-out test data
        preds2 = regressor.predict(features2)
        preds2 = pd.DataFrame(preds2)
        preds2.rename(columns={0: 'sr_predicted'}, inplace=True)

        fig = plt.figure(figsize=(18, 3))
        labels2['sr_highres'].plot()
        preds2['sr_predicted'].plot()
        plt.ylim(-4, 4)
        plt.title('test data: labels vs. predictions')
        plt.legend(loc='best')
        plt.show()

        print('test set r2 score:', r2_score(labels2['sr_highres'], preds2['sr_predicted']))
        print('test set mean squared error: {0:.2f}%'.format(
            mean_squared_error(labels2['sr_highres'], preds2['sr_predicted']) * 100))
        lmean2 = labels2['sr_highres'].mean()
        predmean2 = preds2['sr_predicted'].mean()
        devmean2 = -100 / lmean2 * (lmean2 - predmean2)
        print('sr mean: {} | predicted mean: {} | pred. deviation from sr: {}%'.format(lmean2, predmean2, devmean2))

        try:
            importances = regressor.feature_importances_
            vs.feature_plot(importances, X_train, y_train)
        except AttributeError:
            print('')
            print('No feature importance available for this learner')
            print('')
        print('')

        if type(e).__name__ == "XGBRegressor":
            fig, ax = plt.subplots(1, 1, figsize=(8, 13))
            plot_importance(regressor, ax=ax)
            plt.show()

    return regressor, preds, preds2
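# Usage sketch for plot_shuffle_split_score -- illustrative only. The function
# expects feature/label DataFrames with a default RangeIndex and a label column
# named 'sr_highres'; the synthetic data and XGBRegressor below are assumptions,
# and the in-scope imports (time, pd, np, plt, vs, plot_importance) are assumed.
import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.RandomState(0)
feats = pd.DataFrame(rng.randn(300, 5), columns=list('abcde'))
labs = pd.DataFrame({'sr_highres': feats.sum(axis=1) + 0.1 * rng.randn(300)})
feats2 = feats.iloc[:100].reset_index(drop=True)
labs2 = labs.iloc[:100].reset_index(drop=True)

reg = xgb.XGBRegressor(n_estimators=100)
regressor, preds, preds2 = plot_shuffle_split_score(
    feats, labs, feats2, labs2, [reg], n_Splits=3, earlyStopRounds=10)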
def plot_kfold_split_score(features, labels, valid_features, valid_labels, reglist, n_Splits):
    kfold = KFold(n_splits=n_Splits, random_state=0, shuffle=True)
    for e in reglist:
        score_l = []
        print(e)
        for train_index, test_index in kfold.split(features):
            X_train, X_test = features.loc[train_index], features.loc[test_index]
            y_train, y_test = labels.loc[train_index], labels.loc[test_index]
            # Change column vector to 1d array to avoid a conversion warning at regressor.fit()
            y_train = y_train.values.ravel()
            regressor = e
            start = time.time()
            regressor.fit(X_train, y_train)
            elapsed = time.time() - start
            print("time to fit: %f" % elapsed)
            score = regressor.score(X_test, y_test)
            score_l.append(score)
            print('')
            print("Regressor R2 score on the validation set: {:.4f}".format(score))
            # print('size of the training set (x,y)', np.shape(X_train), np.shape(y_train))
            # print('size of the test set (x,y)', np.shape(X_test), np.shape(y_test))
            print('')

        # NOTE: preds/preds2 are only defined when the last fold clears the cutoff
        if score > 0.2:
            preds = regressor.predict(features)
            preds = pd.DataFrame(preds)
            # print('mean of orig. sr. (validation data set)', valid_labels.sr_highres.mean())
            print('mean of predicted sr. (validation data set)', preds.values.mean())

            plt.figure(figsize=(6, 3))
            plt.plot(score_l)
            plt.ylabel("R2 Score")
            plt.xlabel("number of k-fold splits")
            plt.show()

            fig = plt.figure(figsize=(18, 3))
            labels['sr_highres'].plot()
            preds[0].plot()
            plt.ylim(-4, 4)
            plt.title('labels vs. predictions')
            plt.legend(loc='best')
            plt.show()

            print('validation set r2 score:', r2_score(labels['sr_highres'], preds[0]))
            print('validation set mean squared error: {0:.2f}%'.format(
                mean_squared_error(labels['sr_highres'], preds[0]) * 100))

            preds2 = regressor.predict(valid_features)
            preds2 = pd.DataFrame(preds2)

            fig = plt.figure(figsize=(18, 3))
            valid_labels['sr_highres'].plot()
            preds2[0].plot()
            plt.ylim(-4, 4)
            plt.show()

            print('test set r2 score:', r2_score(valid_labels['sr_highres'], preds2[0]))
            print('test set mean squared error: {0:.2f}%'.format(
                mean_squared_error(valid_labels['sr_highres'], preds2[0]) * 100))

        try:
            importances = regressor.feature_importances_
            vs.feature_plot(importances, X_train, y_train)
        except AttributeError:
            print('')
            print('No feature importance available for this learner')
            print('')
        print('')

        if type(e).__name__ == "XGBRegressor":
            fig = plt.figure(figsize=(15, 5))
            plot_importance(regressor)
            plt.show()

    return regressor, preds, preds2
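# Usage sketch for plot_kfold_split_score -- illustrative; synthetic data with
# the expected 'sr_highres' label column, and RandomForestRegressor as a stand-in.
# Module-level imports used inside the function (time, pd, plt, vs) are assumed.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(1)
feats = pd.DataFrame(rng.randn(300, 5), columns=list('abcde'))
labs = pd.DataFrame({'sr_highres': feats.sum(axis=1) + 0.1 * rng.randn(300)})
vfeats = feats.iloc[:100].reset_index(drop=True)
vlabs = labs.iloc[:100].reset_index(drop=True)

regressor, preds, preds2 = plot_kfold_split_score(
    feats, labs, vfeats, vlabs,
    [RandomForestRegressor(n_estimators=50, random_state=1)], n_Splits=5)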
def plot_time_split_score(features, labels, valid_features, valid_labels, reglist, n_TSSplits):
    # def plot_time_split_score(features, labels, reglist, n_TSSplits):
    tscv = TimeSeriesSplit(n_splits=n_TSSplits)
    for e in reglist:
        score_l = []
        print(e)
        for train_index, test_index in tscv.split(features):
            X_train, X_test = features.loc[train_index], features.loc[test_index]
            y_train, y_test = labels.loc[train_index], labels.loc[test_index]
            # Change column vector to 1d array to avoid a conversion warning at regressor.fit()
            y_train = y_train.values.ravel()
            regressor = e
            regressor.fit(X_train, y_train)
            score = regressor.score(X_test, y_test)
            score_l.append(score)
            print('')
            print("Regressor R2 score on the test set: {:.4f}".format(score))
            # print('size of the training set (x,y)', np.shape(X_train), np.shape(y_train))
            # print('size of the test set (x,y)', np.shape(X_test), np.shape(y_test))
            print('')

        if score > -20.20:  # effectively always true; kept as a manual cutoff
            preds = regressor.predict(features)
            preds = pd.DataFrame(preds)
            # print('mean of orig. sr. (validation data set)', valid_labels.sr_highres.mean())
            print('mean of predicted sr. (validation data set)', preds.values.mean())

            plt.figure(figsize=(6, 3))
            plt.plot(score_l)
            plt.ylabel("R2 Score")
            plt.xlabel("number of TimeSeriesSplits")
            plt.show()

            fig = plt.figure(figsize=(18, 3))
            labels['sr_highres'].plot()
            preds[0].plot()
            plt.ylim(-4, 4)
            plt.title('labels vs. predictions')
            plt.legend(loc='best')
            plt.show()

            print('test set labels mean sr:', labels.mean())
            print('test set predicted mean sr:', preds.mean())

            preds2 = regressor.predict(valid_features)
            preds2 = pd.DataFrame(preds2)

            fig = plt.figure(figsize=(18, 3))
            valid_labels['sr_highres'].plot()
            preds2[0].plot()
            plt.ylim(-4, 4)
            plt.show()

            print('validation set labels mean sr:', valid_labels.mean())
            print('validation set predicted mean sr:', preds2.mean())

        try:
            importances = regressor.feature_importances_
            vs.feature_plot(importances, X_train, y_train)
        except AttributeError:
            print('')
            print('No feature importance available for this learner')
            print('')
        print('')

    return regressor, preds
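# Usage sketch for plot_time_split_score -- illustrative. TimeSeriesSplit keeps
# each training fold strictly before its test fold; the synthetic data and the
# Ridge regressor are assumptions (Ridge also exercises the AttributeError path,
# since it has no feature_importances_).
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit

rng = np.random.RandomState(2)
feats = pd.DataFrame(rng.randn(300, 5), columns=list('abcde'))
labs = pd.DataFrame({'sr_highres': feats.sum(axis=1) + 0.1 * rng.randn(300)})
vfeats = feats.iloc[200:].reset_index(drop=True)
vlabs = labs.iloc[200:].reset_index(drop=True)

regressor, preds = plot_time_split_score(feats, labs, vfeats, vlabs, [Ridge()], n_TSSplits=5)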
# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train, y_train.values.ravel())).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta=0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta=0.5)))

# Import a supervised learning model that has 'feature_importances_'
from sklearn.tree import DecisionTreeClassifier

# Train the supervised model on the training set
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Extract the feature importances
importances = model.feature_importances_

# Plot
vs.feature_plot(importances, X_train, y_train)
print("F-score on the validation data: {:.4f}".format(fbeta_score(y_val, predictions, beta=0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the validation data: {:.4f}".format(accuracy_score(y_val, best_predictions)))
print("Final F-score on the validation data: {:.4f}".format(fbeta_score(y_val, best_predictions, beta=0.5)))

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)
importances = model.feature_importances_
importances_AdaBoost = best_clf.feature_importances_
vs.feature_plot(importances, X_train, y_train)
vs.feature_plot(importances_AdaBoost, X_train, y_train)

from sklearn.base import clone

# Reduce the feature space to the five most important features
X_train_reduced = X_train[X_train.columns.values[(np.argsort(importances)[::-1])[:5]]]
X_val_reduced = X_val[X_val.columns.values[(np.argsort(importances)[::-1])[:5]]]

clf_on_reduced = (clone(best_clf)).fit(X_train_reduced, y_train)
reduced_predictions = clf_on_reduced.predict(X_val_reduced)

# Report scores from the final model using both versions of the data
print("Final Model trained on full data\n------")
print("Accuracy on validation data: {:.4f}".format(accuracy_score(y_val, best_predictions)))
print("F-score on validation data: {:.4f}".format(fbeta_score(y_val, best_predictions, beta=0.5)))
print("\nFinal Model trained on reduced data\n------")
print("Accuracy on validation data: {:.4f}".format(accuracy_score(y_val, reduced_predictions)))
print("F-score on validation data: {:.4f}".format(fbeta_score(y_val, reduced_predictions, beta=0.5)))