def test_plot_partial_dependence():
    """Smoke-test ``plot_partial_dependence`` on a regressor fit to ``boston``.

    The same three plots (two single features and one feature pair) are
    requested three ways: by column index, by feature name with
    ``feature_names`` taken directly from ``boston.feature_names``, and by
    feature name with ``feature_names`` converted to a plain list.
    """
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    named_features = ['CRIM', 'ZN', ('CRIM', 'ZN')]
    cases = [
        # integer column indices
        ([0, 1, (0, 1)], boston.feature_names),
        # str features and array feature names
        (named_features, boston.feature_names),
        # str features and list feature names
        (named_features, boston.feature_names.tolist()),
    ]

    for features, feature_names in cases:
        fig, axs = plot_partial_dependence(clf, boston.data, features,
                                           grid_resolution=grid_resolution,
                                           feature_names=feature_names)
        assert len(axs) == 3
        # ``has_data`` is a method; the bare attribute is always truthy, so
        # it has to be called for this assertion to check anything.
        assert all(ax.has_data() for ax in axs)
def test_plot_partial_dependence_multiclass():
    """Smoke-test ``plot_partial_dependence`` on multi-class ``iris`` input.

    Covers integer class labels, string class labels, an unknown label and
    a missing label (the last two must raise ``ValueError``).
    """
    grid_resolution = 25

    # Integer class labels.
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, iris.target)
    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
                                       label=0,
                                       grid_resolution=grid_resolution)
    assert len(axs) == 2
    # ``has_data`` is a method; uncalled it is always truthy.
    assert all(ax.has_data() for ax in axs)

    # Now with symbol labels.
    target = iris.target_names[iris.target]
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, target)
    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
                                       label='setosa',
                                       grid_resolution=grid_resolution)
    assert len(axs) == 2
    assert all(ax.has_data() for ax in axs)

    # Label not in gbrt.classes_.
    assert_raises(ValueError, plot_partial_dependence,
                  clf, iris.data, [0, 1], label='foobar',
                  grid_resolution=grid_resolution)

    # Label not provided.
    assert_raises(ValueError, plot_partial_dependence,
                  clf, iris.data, [0, 1],
                  grid_resolution=grid_resolution)
def partial_dependence(self, X_train, fnames):
    """Show partial dependence plots, features ordered by importance.

    Columns are ranked by ``self.feature_importances_`` in descending
    order and plotted for ``self.model`` against ``X_train``.

    Parameters
    ----------
    X_train : data handed to ``plot_partial_dependence`` as ``X``.
    fnames : forwarded as ``feature_names``.
    """
    ranked_columns = np.argsort(self.feature_importances_)[::-1]
    plot_partial_dependence(
        self.model,
        X_train,
        ranked_columns,
        feature_names=fnames,
        figsize=(12, 10),
    )
    heading = self.model.__class__.__name__ + " Partial Dependence"
    plt.title(heading)
    plt.tight_layout()
    plt.show()
def plot_skboost_partial_dependence(model, ax, X_train):
    """Draw partial dependence plots for the first two columns of ``X_train``.

    Parameters
    ----------
    model : estimator passed to ``plot_partial_dependence``.
    ax : unused; kept so existing callers keep working.
    X_train : training data; must expose a ``feature_names`` attribute, of
        which the first three entries are used as labels.
    """
    # The figure returned by plot_partial_dependence has to be captured:
    # previously the result was discarded and ``fig`` was never bound, so
    # the two calls below referenced an undefined name.
    fig, _ = plot_partial_dependence(model, X_train, [0, 1],
                                     feature_names=X_train.feature_names[0:3],
                                     n_jobs=-1, grid_resolution=50)
    fig.suptitle('Partial Dependence Plot')
    fig.set_figwidth(15)
def make_plots_of_chemical_features(df_pos_confirmed, df_neg_confirmed):
    """Save similarity histograms and a partial dependence plot to disk.

    For each of three similarity columns, the confirmed-positive and
    confirmed-negative distributions are overlaid in one histogram and
    saved as PNG, PDF and EPS.  A partial dependence plot is then drawn and
    saved in the same three formats.

    NOTE(review): ``ci``, ``input_file``, ``path_to_data``, ``clf``,
    ``get_data_and_labels`` and ``plt`` are not defined in this function and
    must come from the enclosing module.
    """
    import seaborn as sns
    sns.set_context('talk', font_scale=1.2)

    path_to_output = '/home/kimlab1/strokach/working/chemical_interactions/results/14-11-07/'

    def save_current_figure(png_name, pdf_name, eps_name):
        # Write the active figure in the three output formats.
        plt.savefig(path_to_output + png_name, bbox_inches='tight', dpi=150)
        plt.savefig(path_to_output + pdf_name, bbox_inches='tight')
        plt.savefig(path_to_output + eps_name, bbox_inches='tight')

    # (column, histogram range, x-axis label, (png, pdf, eps) file names)
    histogram_specs = [
        ('side_effect_similarity', (0, 0.6), 'Side effect similarity',
         ('side_effect_similarity_hist.png',
          'side_effect_similarity.pdf',
          'side_effect_similarity.eps')),
        ('chemical_similarity', (0, 1), 'Chemical similarity',
         ('chemical_similarity_hist.png',
          'chemical_similarity_hist.pdf',
          'chemical_similarity_hist.eps')),
        ('atc_similarity', (0, 5), 'ATC code similarity',
         ('atc_code_similarity_hist.png',
          'atc_code_similarity_hist.pdf',
          'atc_code_similarity_hist.eps')),
    ]
    for column, value_range, x_label, file_names in histogram_specs:
        fg, ax = plt.subplots(figsize=(10, 6))
        df_pos_confirmed[column].hist(range=value_range, bins=10, ax=ax)
        df_neg_confirmed[column].hist(range=value_range, bins=10, ax=ax,
                                      alpha=0.7)
        ax.set_xlabel(x_label)
        ax.set_ylabel('Number of drug pairs')
        ax.legend(['Confirmed positive', 'Confirmed negative'])
        save_current_figure(*file_names)

    # Make a feature dependence plot
    features = [0, 2, 1]
    pred = ci.Predictor(input_file, path_to_data)
    data_train, labels_train = get_data_and_labels(pred.predictor_df)
    fg, ax = plt.subplots(figsize=(8, 10), facecolor='white')
    plot_partial_dependence(
        clf, data_train, features,
        n_cols=2,
        percentiles=(0.01, 0.99),
        feature_names=['ATC similarity', 'Chemical similarity',
                       'Side effect similarity'],
        n_jobs=3,
        grid_resolution=100,
        ax=ax)
    save_current_figure('drug_pair_feature_importances.png',
                        'drug_pair_feature_importances.pdf',
                        'drug_pair_feature_importances.eps')
def plot_dependence(data):
    """Plot partial dependence for the four most important GBM features.

    A ``GradientBoostingRegressor`` is trained on every column of ``data``
    except ``RESPONSE_VAR`` (the target).  The four features with the
    largest ``feature_importances_`` are plotted, saved to
    ``fig/pdp_py_<dataset>.png`` and shown.

    NOTE(review): ``RESPONSE_VAR``, ``SEED`` and ``dataset`` are not defined
    here and must be provided by the enclosing module.
    """
    # Separate predictors and response.
    y_train = data[RESPONSE_VAR].copy()
    x_train = data.drop(RESPONSE_VAR, axis=1)

    # ``1.0 / 3`` keeps the fraction a float on Python 2 as well, where
    # ``1 / 3`` would be integer division and evaluate to 0.
    reg = GradientBoostingRegressor(random_state=SEED, n_estimators=500,
                                    max_features=1.0 / 3)
    reg.fit(x_train, y_train)

    # Column indices sorted by descending importance; keep the top four.
    indices = np.argsort(reg.feature_importances_)[::-1]
    features = list(indices[0:4])

    fig, axs = plot_partial_dependence(reg, x_train, features,
                                       feature_names=x_train.columns,
                                       n_jobs=3, grid_resolution=50,
                                       n_cols=2)
    plt.tight_layout()  # tight_layout causes overlap with suptitle
    plt.savefig('fig/pdp_py_{}.png'.format(dataset), format='png', dpi=200,
                transparent=False)
    plt.show()
def main():
    """Fit a GBRT on the California housing data and plot partial dependence.

    Two figures are produced: the convenience grid from
    ``plot_partial_dependence`` and a custom 3D surface built from the raw
    output of ``partial_dependence`` for features 1 and 5.
    """
    cal_housing = fetch_california_housing()

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names

    print('_' * 80)
    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")

    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    # A bare ``print`` is a no-op expression on Python 3; ``print('')``
    # emits the intended blank line on both Python 2 and Python 3.
    print('')

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle(
        'Partial dependence of house value on nonlocation features for the California housing dataset'
    )
    plt.subplots_adjust(top=0.9)

    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print('')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                               X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    # Reshape the flat partial dependence values onto the mesh grid.
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle(
        'Partial dependence of house value on median age and average occupancy'
    )
    plt.subplots_adjust(top=0.9)

    plt.show()
def partial_dependence(df, y):
    '''
    Plot partial dependence for the six most important features.

    INPUT:
    df = features, passed to oversample_train_test
    y = target variable, passed to oversample_train_test

    OUTPUT:
    None; the plot is drawn by plot_partial_dependence.

    The data is split by oversample_train_test, both parts are run through
    the feature-engineering pipeline, a GradientBoostingClassifier is fit
    on the training part and partial dependence is plotted against the
    transformed test part.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)
    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    # Only transform the test split: re-fitting the pipeline on it (as
    # before) lets test data influence the fitted transformers and can
    # yield columns that differ from what the model was trained on.
    X_test = feature_engineering.transform(X_test.copy())

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)

    # argsort is ascending, so the last six indices are the most important.
    most_imp = np.argsort(gbc.feature_importances_)[-6:]
    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
def make_part_plot(mdoel, features):
    """Show partial dependence plots for the 11th-25th most important features.

    Parameters
    ----------
    mdoel : fitted estimator exposing ``feature_importances_`` (the
        misspelled parameter name is kept for backward compatibility).
    features : forwarded as ``feature_names``.

    NOTE(review): ``df`` is not a parameter; it is read from the enclosing
    module and used as the data for the plot.
    """
    # The body previously read an undefined ``model`` instead of the
    # argument; bind the argument to that name.
    model = mdoel

    # Feature indices by descending importance, ranks 10..24.
    importance = np.argsort(model.feature_importances_)[::-1]
    importance = importance[10:25]

    fig, axs = plot_partial_dependence(model, df, importance,
                                       feature_names=features,
                                       n_jobs=3, grid_resolution=50)

    for ax in axs:
        name = ax.get_xlabel()
        ax.set_xlabel(name, fontsize=16)
        ax.set_ylim(-4, 4)
        # Per-feature axis overrides.
        if name == 'average_gap':
            ax.set_xlim(0, 6)
        if name == 'highest_like':
            ax.set_ylim(-10, 40)
        if name == 'highest_topic_percent':
            ax.set_xlim(0.5, 0.8)
        if name == 'url_length':
            ax.set_xlim(0, 25)

    fig.suptitle('Partial Dependence Plot for Selected Features \n Effect on Subscribers Count', fontsize=24)
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
    plt.show()
def partial_dependence(df, y):
    '''
    Plot partial dependence for the six most important features.

    INPUT:
    df = features, passed to oversample_train_test
    y = target variable, passed to oversample_train_test

    OUTPUT:
    None; the plot is drawn by plot_partial_dependence.

    The data is split by oversample_train_test, both parts are run through
    the feature-engineering pipeline, a GradientBoostingClassifier is fit
    on the training part and partial dependence is plotted against the
    transformed test part.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)
    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    # Only transform the test split: re-fitting the pipeline on it (as
    # before) lets test data influence the fitted transformers and can
    # yield columns that differ from what the model was trained on.
    X_test = feature_engineering.transform(X_test.copy())

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)

    # argsort is ascending, so the last six indices are the most important.
    most_imp = np.argsort(gbc.feature_importances_)[-6:]
    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
def main():
    """Fit a GBC on PCA-transformed data for "Accident" and plot dependence.

    The data comes from ``get_binary_encoded_xy_split(5000)``.  A
    ``RandomizedPCA`` is fit on the training features, the classifier is
    trained on the transformed features against the "Accident" column of
    the binarized targets, and partial dependence is plotted for the first
    two transformed features.
    """
    X_train, X_test, y_train, y_test, y_encoder = get_binary_encoded_xy_split(5000)

    # NOTE(review): the original comment said this reduces 1000 X 1024
    # dimensions to 11, but no ``n_components`` is passed here - confirm.
    X_train_randPCA = RandomizedPCA()
    X_train_randPCA.fit(X_train)
    print("pca fit")
    X_train_reduced = X_train_randPCA.transform(X_train)
    X_test_reduced = X_train_randPCA.transform(X_test)
    print("Reduced components")

    print("Begin classifier")
    clf = GradientBoostingClassifier(n_estimators=200, max_depth=4,
                                     learning_rate=0.1, random_state=1)
    print(y_train.shape, y_test.shape)
    print(y_encoder.classes_)
    print(y_encoder.transform(["Accident"]))
    accident_columns = np.where(y_encoder.classes_ == "Accident")
    print(accident_columns)
    clf.fit(X_train_reduced, y_train[:, accident_columns[0]])
    print("Fitted")

    print("_" * 80)
    feature_vals = y_encoder.transform(y_encoder.classes_)
    feature_labels = y_encoder.classes_
    print(feature_vals)
    print(feature_labels)

    # The model was fit on the PCA-transformed features, so partial
    # dependence must be evaluated on that same representation rather than
    # on the raw ``X_train`` used before.
    fig, axs = plot_partial_dependence(clf, X_train_reduced, [0, 1],
                                       n_jobs=4, grid_resolution=100)
    plt.show()
def main():
    """Fit a GBRT on standardized Boston housing data and plot dependence.

    The features are scaled with ``StandardScaler``, split into train and
    test parts, a ``GradientBoostingRegressor`` is fit on the training part
    and partial dependence is plotted for features 0, 5, 1, 2 and the pair
    (5, 1).
    """
    # Single-argument ``print(...)`` calls produce the same output on
    # Python 2 and Python 3; the former print statements were a syntax
    # error on Python 3.
    print('loading data')
    boston = datasets.load_boston()
    print('data loaded')
    X, y = boston.data, boston.target
    print('X shape: %s' % (X.shape,))
    print('y shape: %s' % (y.shape,))

    # TODO: fold the scaler into a pipeline.
    scaler = StandardScaler()
    scaledX = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        scaledX, y)

    # Plot partial dependence to see how the features work.
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=.1, loss='huber',
                                    random_state=1)
    print('training %s %s' % (X_train.shape, y_train.shape))
    clf.fit(X_train, y_train)
    print('trained')

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=None,
                                       n_jobs=3, grid_resolution=50)
    # The data loaded above is the Boston set; the previous title named
    # the California housing dataset.
    fig.suptitle('Partial dependence of house value on selected features\n'
                 'for the Boston housing dataset')
    plt.subplots_adjust(top=0.9)
    plt.show()

    # Planned next steps:
    # - PCA, then plot partial dependence there
    # - lasso PCA
    # - select the parameters (plots are nice, but selection should
    #   probably be based on multi-dimensional data)
    # - generate a list of multi-parameter algorithms
    # - parameter search with gradient boost and a few other
    #   multi-parameter algorithms
def plot_partial_dependence(tmodel, X, col_names, cols_to_plot):
    """Plot partial dependence for one or two named columns.

    Parameters
    ----------
    tmodel : estimator passed to ``pdep.plot_partial_dependence``.
    X : data passed to ``pdep.plot_partial_dependence``.
    col_names : sequence of column names; also forwarded as
        ``feature_names``.
    cols_to_plot : list of one or two names from ``col_names``.  With two
        names, each column and their pair are plotted.

    Returns
    -------
    (fig, axs) as returned by ``pdep.plot_partial_dependence``.
    """
    assert isinstance(cols_to_plot, list)
    assert len(cols_to_plot) < 3

    # Compare against an array so that a plain list of names works too; a
    # list compared with ``==`` to a string gives a single bool, not a mask.
    names = np.asarray(col_names)
    inds = [np.where(names == col)[0][0] for col in cols_to_plot]

    if len(inds) == 2:
        features = (inds[0], inds[1], (inds[0], inds[1]))
    else:
        features = [inds[0]]

    fig, axs = pdep.plot_partial_dependence(tmodel, X, features,
                                            feature_names=col_names)
    # Hand the figure back so callers can save or tweak it (the result was
    # previously dropped).
    return fig, axs
def figure_plot(self):
    """Show partial dependence plots for all feature names, three per row.

    The plot is drawn for ``self.__gbc`` on ``self.__train_feature``, using
    ``self.__all.feature_names`` both as the features to plot and as the
    feature labels.
    """
    fig, _ = plot_partial_dependence(
        self.__gbc,
        self.__train_feature,
        features=self.__all.feature_names,
        feature_names=self.__all.feature_names,
        n_cols=3,
        grid_resolution=100,
    )
    plt.show()
def plot_partial_dependence(est, X, features, fnames, tag, n_jobs=-1,
                            verbosity=0, directory=None):
    r"""Display a Partial Dependence Plot.

    Parameters
    ----------
    est : estimator
        The scikit-learn estimator for calculating partial dependence.
    X : numpy array
        The data on which the estimator was trained.
    features : list of int
        Feature numbers of ``X``.
    fnames : list of str
        The feature names to plot.
    tag : str
        Unique identifier for the plot
    n_jobs : int, optional
        The maximum number of parallel jobs.
    verbosity : int, optional
        The amount of logging from 0 (minimum) and higher.
    directory : str
        Directory where the plot will be stored.

    Returns
    -------
    None : None.

    References
    ----------
    http://scikit-learn.org/stable/auto_examples/ensemble/plot_partial_dependence.html#sphx-glr-auto-examples-ensemble-plot-partial-dependence-py

    """
    # This function has the same name as the scikit-learn function it
    # wraps, so the bare name resolves to this wrapper and the call below
    # would recurse with keywords this signature does not accept.  Import
    # the library function under an alias instead.
    from sklearn.ensemble.partial_dependence import (
        plot_partial_dependence as sk_plot_partial_dependence)

    logger.info("Generating Partial Dependence Plot")

    # Plot partial dependence
    fig, axs = sk_plot_partial_dependence(est, X, features,
                                          feature_names=fnames,
                                          grid_resolution=50,
                                          n_jobs=n_jobs,
                                          verbose=verbosity)
    title = "Partial Dependence Plot"
    fig.suptitle(title)
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    # Save the plot
    # NOTE(review): ``model`` is not a parameter of this function; it must
    # be defined by the enclosing module - confirm.
    write_plot(model, 'matplotlib', plt, 'partial_dependence', tag, directory)
def main():
    """Fit a GBRT on the California housing data and plot partial dependence.

    Returns early, after printing a message, when downloading the dataset
    raises ``HTTPError``.  Otherwise two figures are produced: the
    convenience grid from ``plot_partial_dependence`` and a custom 3D
    surface built from ``partial_dependence`` for features 1 and 5.
    """
    # fetch California housing dataset
    try:
        cal_housing = fetch_california_housing()
    except HTTPError:
        print("Failed downloading california housing data.")
        return

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names

    print('_' * 80)
    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")

    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    # A bare ``print`` is a no-op expression on Python 3; ``print('')``
    # emits the intended blank line on both Python 2 and Python 3.
    print('')

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print('')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                               X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    # Reshape the flat partial dependence values onto the mesh grid.
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median age and '
                 'average occupancy')
    plt.subplots_adjust(top=0.9)

    plt.show()
def plot_gradiant(clf, X_train, y_train, features):
    """Fit ``clf`` and show partial dependence plots for ``features``.

    Parameters
    ----------
    clf : estimator; fit in place on ``X_train`` / ``y_train``.
    X_train, y_train : training data and targets.
    features : mapping whose keys are passed as the features to plot and
        whose values are passed as ``feature_names``.
    """
    clf.fit(X_train, y_train)

    # On Python 3 ``keys()`` / ``values()`` are views that support neither
    # indexing nor ``.index``; materialize them as lists before handing
    # them to scikit-learn.
    # NOTE(review): scikit-learn looks names up by column index, so this
    # presumably expects the keys to be 0..len(features)-1 - confirm.
    feature_ids = list(features.keys())
    feature_labels = list(features.values())

    fig, axs = plot_partial_dependence(clf, X_train, feature_ids,
                                       feature_names=feature_labels,
                                       grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)
    plt.show()
def plot(self, f_0, f_1=0):
    """Print feature importances and show partial dependence per label.

    For each label 1..7, the two features ``f_0`` and ``f_1`` and their
    pair are plotted for ``self.clf`` on ``self.X``.

    Parameters
    ----------
    f_0 : first feature to plot.
    f_1 : second feature to plot (default 0).
    """
    # Single-argument ``print(...)`` calls give identical output on Python
    # 2 and 3; the former print statements were a syntax error on Python 3.
    print('feature importance:')
    for index, value in enumerate(self.clf.feature_importances_):
        print('%s :  %s' % (index, value))

    features = [f_0, f_1, (f_0, f_1)]
    for i in range(1, 8):
        fig, axs = plot_partial_dependence(self.clf, self.X, features,
                                           label=i)
        pl.show()
        pl.clf()
def get_GB_cls_metrics(data_fh, info):
    """
    Get the metrics of Gradient Boost classification models

    Writes two PDF plots under ``<info.prj_dh>/data_ml/`` unless they
    already exist: partial dependence of the top features and a bar chart
    of feature importances.

    :param data_fh: path to file containing Classification training data
    :param info: object providing ``prj_dh`` (output root) and ``cores``
    :returns: False when the pickle cannot be read or has no 'gs_cv'
        entry; None otherwise
    """
    from pylab import figtext

    # ``except Exception`` keeps the best-effort behavior without also
    # swallowing KeyboardInterrupt / SystemExit as a bare ``except`` does.
    try:
        dpkl = read_pkl(data_fh)
    except Exception:
        return False
    if 'gs_cv' not in dpkl:
        return False

    dXy = dpkl['dXy_final']
    ycol = dpkl['ycol']
    gs_cv = dpkl['gs_cv']
    feat_imp = dpkl['feat_imp']

    Xcols = [c for c in dXy.columns.tolist() if c != ycol]
    est = gs_cv.best_estimator_
    # ``.values`` replaces ``.as_matrix()``, which newer pandas removed.
    X = dXy.loc[:, Xcols].values

    # partial dep
    plot_type = 'partial_dep'
    plot_fh = '%s/data_ml/%s.%s.pdf' % (info.prj_dh, plot_type, basename(data_fh))
    if not exists(plot_fh):
        # Top six features, skipping names that contain both ') ' and ' ('.
        feats_indi = [s for s in feat_imp.head(6).index.tolist()
                      if not ((') ' in s) and (' (' in s))]
        features = [Xcols.index(f) for f in feats_indi]
        feature_names = linebreaker(Xcols)
        from sklearn.ensemble.partial_dependence import plot_partial_dependence
        fig, axs = plot_partial_dependence(est, X, features,
                                           feature_names=feature_names,
                                           n_jobs=int(info.cores),
                                           grid_resolution=50,
                                           n_cols=2,
                                           line_kw={'color': 'r'},
                                           figsize=[7, 9])
        figtext(0.9, -0.2, 'AUC = %.2f' % gs_cv.best_score_, ha='right', color='b')
        saveplot(plot_fh, form='pdf', tight_layout=False)

    # relimp
    plot_type = 'featimps'
    plot_fh = '%s/data_ml/%s.%s.pdf' % (info.prj_dh, plot_type, basename(data_fh))
    if not exists(plot_fh):
        featst = 10
        plt.figure(figsize=(3, featst * 0.75))
        ax = plt.subplot(111)
        feat_imp = feat_imp.sort_values(by='Feature importance', ascending=True)
        feat_imp.index = linebreaker(feat_imp.index, break_pt=30)
        # Sorted ascending, so the tail holds the most important features.
        feat_imp.tail(featst).plot(kind='barh', ax=ax, color='red')
        ax.set_xlabel('Feature Importance')
        ax.legend([])
        figtext(0.9, -0.2, 'AUC = %.2f' % gs_cv.best_score_, ha='right', color='b')
        saveplot(plot_fh, form='pdf', tight_layout=False)
def plot_gbm(gbm, model_name, train_X, train_Y, test_X, test_Y, train_fea_name_list):
    """Report metrics for ``gbm`` and save ROC, importance and PDP images.

    All images are written under ``./data/<model_name>/``.

    Parameters
    ----------
    gbm : fitted model exposing ``feature_importances_``.
    model_name : used in the output directory and file names.
    train_X, train_Y, test_X, test_Y : data passed to ``data_predict``.
    train_fea_name_list : feature names, aligned with
        ``gbm.feature_importances_``.
    """
    img_dir = './data/' + model_name + '/'
    train_img_path = img_dir + 'roc_train_' + model_name + '.png'
    test_img_path = img_dir + 'roc_test_' + model_name + '.png'
    all_img_path = img_dir + 'roc_all_' + model_name + '.png'
    importance_img_path = img_dir + 'importance_' + model_name + '.png'
    pdp_img_path = img_dir + 'pdp_' + model_name + '.png'

    train_fpr, train_tpr, train_auc, train_accuracy = data_predict(
        gbm, train_X, train_Y, train_img_path)
    test_fpr, test_tpr, test_auc, test_accuracy = data_predict(
        gbm, test_X, test_Y, test_img_path)

    # Single-argument ``print(...)`` calls give identical output on Python
    # 2 and 3; the former print statements were a syntax error on Python 3.
    print('train data auc %.4f, test data auc %.4f' % (train_auc, test_auc))
    print('train data accuracy %.4f, test data accuracy %.4f'
          % (train_accuracy, test_accuracy))

    print('**************** feature importance *****************')
    imp_items = zip(train_fea_name_list, gbm.feature_importances_)
    sorted_imp_items = sorted(imp_items, key=lambda x: x[1], reverse=True)
    for name, imp in sorted_imp_items:
        print('%s: %.4f' % (name, imp))

    # *********** plot auc **************
    plt.figure()
    plt.plot(train_fpr, train_tpr,
             label='train_auc %.2f, acc: %.2f' % (train_auc, train_accuracy))
    plt.plot(test_fpr, test_tpr,
             label='test_auc %.2f, acc: %.2f' % (test_auc, test_accuracy))
    plt.legend()
    plt.savefig(all_img_path)

    # *************** plot importance ***************
    feature_importance = gbm.feature_importances_
    # Scale so that the most important feature is 100.
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure()
    plt.subplot(1, 2, 2)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, np.array(train_fea_name_list)[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.legend()
    plt.savefig(importance_img_path)

    # *************** plot partial dependence ***************
    plt.figure()
    fig, axs = plot_partial_dependence(
        gbrt=gbm,
        X=train_X,
        features=['PDnnSim', 'Bm25Sim', 'QueryLen', 'DocIdf',
                  ('PDnnSim', 'Bm25Sim')],
        feature_names=np.array(train_fea_name_list),
        n_cols=3,
        grid_resolution=100,
        percentiles=(0.05, 0.95))
    plt.legend()
    plt.savefig(pdp_img_path)
def show_the_pdp(clf, xtrain, feature_li, feature_nam):
    """Show a titled three-column grid of partial dependence plots.

    Parameters
    ----------
    clf : estimator passed to ``plot_partial_dependence``.
    xtrain : data passed as ``X``.
    feature_li : features to plot.
    feature_nam : forwarded as ``feature_names``.
    """
    heading = "Partial dependence plots for the tick activity using Gradient Boosting method"
    pdp_figure, _ = plot_partial_dependence(
        clf,
        xtrain,
        feature_li,
        feature_names=feature_nam,
        n_cols=3,
        grid_resolution=100,
    )
    pdp_figure.suptitle(heading, size=20)
    # Leave room for the title and spread the subplots apart.
    pdp_figure.subplots_adjust(top=0.8, hspace=0.7, wspace=0.5)
    plt.show()
def plotPartial(self, nFeat=2):
    """Plot partial dependence for the first ``nFeat`` entries of ``self.indices``.

    The plot is drawn for ``self.gbr`` on ``self.X``.

    NOTE(review): ``final_cols`` is read from the enclosing module and used
    as the feature names.
    """
    # Single-argument ``print(...)`` calls give identical output on Python
    # 2 and 3; the former print statements were a syntax error on Python 3.
    features = self.indices[:nFeat]
    print("features %s" % (features,))
    featNames = final_cols
    print("FeatureNames %s" % (featNames,))

    fig, axs = plot_partial_dependence(self.gbr, self.X, features,
                                       feature_names=featNames)

    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print('')
    # NOTE(review): a new empty figure is opened here but nothing is drawn
    # on it before ``plt.show()`` - confirm whether the 3D plot is missing.
    fig = plt.figure()
    plt.show()
def rph_graph(X, y, columns):
    """Fit a default GBRT on imputed ``X`` and plot the first three columns.

    Parameters
    ----------
    X : raw predictors; imputed with a default ``SimpleImputer``.
    y : target values.
    columns : labels used on the graphs (passed as ``feature_names``).
    """
    regression_columns = columns

    # Impute the predictors first, then train on the imputed data.
    my_imputer = SimpleImputer()
    X_regression = my_imputer.fit_transform(X)
    my_model = GradientBoostingRegressor()
    my_model.fit(X_regression, y)

    my_plots = plot_partial_dependence(
        my_model,
        X=X_regression,                    # imputed predictors data
        features=[0, 1, 2],                # column numbers to plot
        feature_names=regression_columns,  # labels on graphs
        grid_resolution=10,                # number of values on the x axis
    )
def main():
    """Fit a GBRT on standardized Boston housing data and plot dependence.

    The features are scaled with ``StandardScaler``, split into train and
    test parts, a ``GradientBoostingRegressor`` is fit on the training part
    and partial dependence is plotted for features 0, 5, 1, 2 and the pair
    (5, 1).
    """
    # Single-argument ``print(...)`` calls produce the same output on
    # Python 2 and Python 3; the former print statements were a syntax
    # error on Python 3.
    print('loading data')
    boston = datasets.load_boston()
    print('data loaded')
    X, y = boston.data, boston.target
    print('X shape: %s' % (X.shape,))
    print('y shape: %s' % (y.shape,))

    # TODO: fold the scaler into a pipeline.
    scaler = StandardScaler()
    scaledX = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(scaledX, y)

    # Plot partial dependence to see how the features work.
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=.1, loss='huber',
                                    random_state=1)
    print('training %s %s' % (X_train.shape, y_train.shape))
    clf.fit(X_train, y_train)
    print('trained')

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=None,
                                       n_jobs=3, grid_resolution=50)
    # The data loaded above is the Boston set; the previous title named
    # the California housing dataset.
    fig.suptitle('Partial dependence of house value on selected features\n'
                 'for the Boston housing dataset')
    plt.subplots_adjust(top=0.9)
    plt.show()

    # Planned next steps:
    # - PCA, then plot partial dependence there
    # - lasso PCA
    # - select the parameters (plots are nice, but selection should
    #   probably be based on multi-dimensional data)
    # - generate a list of multi-parameter algorithms
    # - parameter search with gradient boost and a few other
    #   multi-parameter algorithms
def multi_case_partial_dependence(df, cases, ests, stdzrs, n_oversamps, c_true, c_pred):
    """Show partial dependence plots for each (case, estimator) pair.

    For every case, the columns whose name contains ``case[1]`` are
    dropped, the data is split in time at '2016-06-01' and two figures are
    shown: a grid for features 11-14 and the pair (9, 18), and a 3D surface
    for the pair (9, 18).

    Parameters
    ----------
    df : DataFrame holding all columns.
    cases : sequence of indexable items; ``case[0]`` is printed, used in
        titles and passed to ``train_test_split_time``; ``case[1]`` selects
        the columns to drop.
    ests : estimators, one per case.
    stdzrs, n_oversamps, c_true, c_pred : unused except that they take
        part in the ``zip`` and so limit the number of iterations.
    """
    # The unused result lists and the blank figure that used to be created
    # here (it showed up as an empty window) have been dropped.
    for case, est, _stdzr, _n, _c_t, _c_p in zip(cases, ests, stdzrs,
                                                 n_oversamps, c_true, c_pred):
        print('case: {}'.format(case[0]))

        # drop other binary and probability column
        c_drop = [c for c in df.columns if case[1] in c]
        data_df = df.drop(c_drop, axis=1)

        # train test split in time
        X_train, y_train, X_test, y_test = train_test_split_time(
            data_df, '2016-06-01', case[0])

        names = list(X_train.columns)
        features = [11, 12, 13, 14, (9, 18)]

        fig, axs = plot_partial_dependence(est, X_train, features,
                                           feature_names=names,
                                           n_jobs=3, grid_resolution=50)
        fig.suptitle('Partial dependence of features\n'
                     'for {} model'.format(case[0]))
        plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

        print('Custom 3d plot via ``partial_dependence``')
        fig = plt.figure()

        target_feature = (9, 18)
        pdp, axes = partial_dependence(est, target_feature,
                                       X=X_train, grid_resolution=50)
        XX, YY = np.meshgrid(axes[0], axes[1])
        # Reshape the flat values to (len(axes[0]), len(axes[1])) and
        # transpose to line up with the mesh grid.
        Z = pdp[0].reshape(list(map(np.size, axes))).T
        ax = Axes3D(fig)
        surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                               cmap=plt.cm.BuPu, edgecolor='k')
        ax.set_xlabel(names[target_feature[0]])
        ax.set_ylabel(names[target_feature[1]])
        ax.set_zlabel('Partial dependence')
        # pretty init view
        ax.view_init(elev=22, azim=122)
        plt.colorbar(surf)
        plt.suptitle('Partial dependence of features\n'
                     'for {} model'.format(case[0]))
        plt.subplots_adjust(top=0.9)

        plt.show()
def plot_partial_dependencies(self, colnames):
    """Show partial dependence plots for the ten most important features.

    Features are ranked by ``self.model.feature_importances_`` and plotted
    for ``self.model`` on ``self.X`` in a 20 x 20 figure.

    Parameters
    ----------
    colnames : forwarded as ``feature_names``.
    """
    importance_order = np.argsort(self.model.feature_importances_)[::-1]
    top10_colindex = importance_order[0:10]

    fig, axs = plot_partial_dependence(
        self.model,
        self.X,
        features=top10_colindex,
        feature_names=colnames,
        figsize=(20, 20),
        grid_resolution=100,
    )
    fig.set_figwidth(20)
    fig.set_figheight(20)
    fig.tight_layout()
    plt.show()
def generatePDP(modelObj, featureVector, trainingX, outputFolder, importance=10):
    """Save one partial dependence plot per sufficiently important feature.

    Parameters
    ----------
    modelObj : fitted model passed to ``plot_partial_dependence``.
    featureVector : object exposing ``feature_names_``.
    trainingX : training data passed to ``plot_partial_dependence``.
    outputFolder : prefix for the output files; each plot is written to
        ``outputFolder + <feature name> + "_PD.png"``.
    importance : only features whose 'Relative Importance' exceeds this
        value are plotted (default 10).
    """
    # Create Partial Dependence directory to hold all PD plots
    pdDir = outputFolder

    # Create the output directory if it doesn't exist.  ``makedirs`` also
    # creates missing parents, and the empty-string guard covers prefixes
    # without a directory part (``os.mkdir('')`` would raise).
    outputDir = os.path.dirname(pdDir)
    if outputDir and not os.path.exists(outputDir):
        print("Output Directory: " + pdDir + " Doesn't exist. Creating it now")
        os.makedirs(outputDir)

    # to generate feature importance
    featureImportanceDF = returnFeatureImportance(modelObj, featureVector)

    # Select only the important features
    featureImportanceDF = featureImportanceDF[
        featureImportanceDF['Relative Importance'] > importance]

    # to generate PDP, create a table of (column index, feature name)
    featureName = list(featureVector.feature_names_)
    featureId = list(range(len(featureName)))
    features = pd.DataFrame([featureId, featureName]).transpose()
    features.columns = ['FeatureId', 'FeatureName']

    # Get the feature id for the important features
    featureImportanceDF = pd.merge(featureImportanceDF, features,
                                   how='left', on='FeatureName')

    # Generate PD Plots
    for i in range(featureImportanceDF['FeatureName'].size):
        feature = [featureImportanceDF['FeatureId'][i]]
        # '/' would be read as a path separator in the file name.
        featName = featureImportanceDF['FeatureName'][i].replace('/', '_')
        fig, axs = plot_partial_dependence(modelObj, trainingX, feature,
                                           featureVector.feature_names_,
                                           n_jobs=-1)
        plt.subplots_adjust(top=0.9)
        # save the plot in the output directory with the feature name as file name
        fig.savefig(pdDir + featName + "_PD.png")
        plt.close(fig)
def plot_features(model, feature_names, target, x):
    """Plot the partial dependence of the feature set.

    Parameters
    ----------
    model : estimator passed to ``plot_partial_dependence``.
    feature_names : names of all features; every index
        ``0..len(feature_names)-1`` is plotted.
    target : name of the predicted quantity, shown in the figure title.
    x : data passed as ``X``.
    """
    # NOTE(review): this opens a figure before plot_partial_dependence is
    # called without an ``ax``; it may end up as an extra blank window -
    # confirm.
    plt.figure(figsize=(20, 10))
    fig, _ = plot_partial_dependence(model, x, range(len(feature_names)),
                                     feature_names=feature_names,
                                     n_jobs=-1, n_cols=4,
                                     grid_resolution=50)

    # The title had no ``{}`` placeholder, so ``target`` was silently
    # dropped by ``.format``.
    fig.suptitle('Partial dependence of features predicting {}'.format(target))
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
    plt.subplots_adjust(right=1.5)

    print('_' * 80)
    print('Custom plot via ``partial_dependence``')
    # A bare ``print`` is a no-op on Python 3; emit the blank line
    # explicitly.
    print('')
    plt.rcParams["figure.figsize"] = [12, 8]
    plt.show()
def make_part_plot(mdoel, features):
    """Show partial dependence plots for the 15 most important features.

    Parameters
    ----------
    mdoel : fitted estimator exposing ``feature_importances_`` (the
        misspelled parameter name is kept for backward compatibility).
    features : forwarded as ``feature_names``.

    Returns
    -------
    (fig, axs) as returned by ``plot_partial_dependence``.

    NOTE(review): ``df`` is not a parameter; it is read from the enclosing
    module and used as the data for the plot.
    """
    # The body previously read an undefined ``model`` instead of the
    # argument; bind the argument to that name.
    model = mdoel

    # Feature indices by descending importance, top 15.
    importance = np.argsort(model.feature_importances_)[::-1]
    importance = importance[:15]

    fig, axs = plot_partial_dependence(model, df, importance,
                                       feature_names=features,
                                       n_jobs=3, grid_resolution=50,
                                       label='no traction')

    for ax in axs:
        name = ax.get_xlabel()
        ax.set_xlabel(name, fontsize=16)
        # Axes whose x range starts above 1500 get a fixed 2004-2016
        # window (presumably year-valued features - confirm).
        if ax.get_xlim()[0] > 1500:
            ax.set_xlim(2004, 2016)

    fig.suptitle('Partial Dependence Plot for Selected Features \n Effect on Post gaining more likes and comments', fontsize=24)
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
    plt.show()
    return fig, axs
def plot_partial_dependence(X_train, y_train, include_features=None, n_ways=1):
    """
    Plots one-way or two-way partial dependencies (cf. Friedman 2001 or ESL).

    If include_features is given, only those features will be considered,
    otherwise all non-categorical features will be included.

    A model is trained with ``train_gbrt(X_train, y_train)`` before
    plotting.  Raises ``ValueError`` when ``n_ways`` is neither 1 nor 2.
    """
    raw_features = list(X_train)
    features, feature_names = [], []
    for i, raw_name in enumerate(raw_features):
        if raw_name in FEATURE_NAMES:  # everything but categoricals
            # feature_name indexes match those of full training data column no.
            feature_names.append(FEATURE_NAMES[raw_name])
            if include_features is None or raw_name in include_features:
                features.append(i)
        else:
            # will never be used because categoricals are excluded but we
            # should keep track of indices nevertheless
            feature_names.append('Some categorical')
    assert len(feature_names) == len(raw_features)

    sys.stderr.write('Plotting %d-way partial dependence for %d features\n'
                     % (n_ways, len(features)))

    if n_ways == 1:
        target_features = features  # one-way pdp
    elif n_ways == 2:
        target_features = list(combinations(features, 2))  # two-way pdp
    else:
        # ValueError is still caught by existing ``except Exception``
        # handlers.
        raise ValueError('only one-way and two-way partial dependence plots allowed, %d given' % int(n_ways))

    reg = train_gbrt(X_train, y_train)
    fig, axs = partial_dependence.plot_partial_dependence(
        reg, X_train, target_features,
        figsize=(22, 12),
        feature_names=feature_names,
        n_jobs=3,
        grid_resolution=50
    )

    for ax in axs:
        ax.yaxis.label.set_size(8)
        ax.grid(True)
        for tick in ax.xaxis.get_major_ticks():
            # ``tick.label`` was an alias of ``label1`` that newer
            # matplotlib releases removed.
            tick.label1.set_fontsize(8)
    fig.tight_layout()
def plot_partial(clf, X_train, features, feature_ids):
    """Save a re-drawn single-feature partial dependence curve per feature.

    For each id in ``feature_ids``, the curve is computed with
    ``plot_partial_dependence`` on ``clf.named_steps['gbm']``, its data is
    read back from the first line of the first axes, re-plotted on a fresh
    5 x 5 inch figure and saved as ``partial_dependence_<name>.png``.

    Parameters
    ----------
    clf : object exposing ``named_steps['gbm']``.
    X_train : data passed as ``X``.
    features : feature names, indexed by feature id.
    feature_ids : ids of the features to plot.
    """
    for feature_id in feature_ids:
        _, pdp_axes = plot_partial_dependence(
            clf.named_steps['gbm'],
            X_train,
            [feature_id],
            feature_names=features,
            n_jobs=14,
            grid_resolution=30,
        )

        # Pull the curve out of the generated plot.
        curve = pdp_axes[0].lines[0]
        x = curve.get_xdata()
        y = curve.get_ydata()

        fig, ax = plt.subplots()
        fig.set_size_inches(5, 5)
        plt.subplots_adjust(left=0.18, right=0.9, bottom=0.15, top=0.9)

        ax.plot(x, y, '-', color='black', linewidth=1)
        ax.set_ylabel('Partial Dependence', fontsize=13)
        ax.set_xlabel(features[feature_id], fontsize=14)

        plt.savefig("partial_dependence_" + features[feature_id] + ".png")
def plot_2d(self, feature_2d, top=0.9, n_jobs=3, grid_resolution=50, figsize=(8, 9), subtitle=""):
    """Save a partial dependence plot of ``feature_2d`` to the output file.

    The plot is drawn for ``self.model`` on ``self.feature_df`` and written
    to ``self.output_path`` / ``self.fig_file``; the current figure is then
    closed.

    Parameters
    ----------
    feature_2d : features passed to ``plot_partial_dependence``.
    top : top margin passed to ``subplots_adjust``.
    n_jobs, grid_resolution, figsize : forwarded to
        ``plot_partial_dependence``.
    subtitle : text used as the figure's suptitle.
    """
    target_file = os.path.join(self.output_path, self.fig_file)

    fig, axs = plot_partial_dependence(
        gbrt=self.model,
        X=self.feature_df,
        features=feature_2d,
        feature_names=self.feature_list,
        n_jobs=n_jobs,
        grid_resolution=grid_resolution,
        figsize=figsize,
    )
    fig.suptitle(subtitle)
    plt.subplots_adjust(left=0.16, right=0.81, bottom=0.07, top=top,
                        wspace=0.98, hspace=0.63)
    plt.savefig(target_file)
    plt.close()
def partial_dependence(df, y):
    """Plot partial dependence for the six most important features.

    The data is split with ``oversample_train_test``, run through the
    feature-engineering pipeline, and used to fit a
    ``GradientBoostingClassifier``. The six features with the largest
    ``feature_importances_`` are then plotted against the transformed test
    split. Nothing is returned.

    Parameters
    ----------
    df
        Raw feature data, forwarded to ``oversample_train_test``.
    y
        Targets, forwarded to ``oversample_train_test``.
    """
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)
    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])
    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    # Apply the pipeline fitted on the training split. Re-fitting on the
    # test split leaks test information and may produce columns that do not
    # line up with what the classifier was trained on.
    X_test = feature_engineering.transform(X_test.copy())

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)

    # argsort is ascending, so the last six indices are the most important.
    most_imp = np.argsort(gbc.feature_importances_)[-6:]
    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
mean_mse = mse.mean() mean_r2 = r2.mean() params = estimator.get_params() name = estimator.__class__.__name__ print '%s Train CV | MSE: %.3f | R2: %.3f' % (name, mean_mse, mean_r2) return mean_mse, mean_r2 cross_val(gd_best, train_x, np.array(train_y)) cross_val(rf_best, train_x, np.array(train_y)) cross_val(gd_best, test_x, test_y) cross_val(rf_best, test_x, test_y) col_names = X.columns # sort importances indices = np.argsort(gd_best.feature_importances_) # plot as bar chart figure = plt.figure(figsize=(10,7)) plt.barh(np.arange(len(col_names)),gd_best.feature_importances_[indices], align='center', alpha=.5) plt.yticks(np.arange(len(col_names)), np.array(col_names)[indices], fontsize=14) plt.xticks(fontsize=14) _ = plt.xlabel('Relative importance', fontsize=18) fig, axs = plot_partial_dependence(gd_best, train_x, range(X.shape[1]) , feature_names=col_names, figsize=(15, 10)) fig.tight_layout()
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import partial_dependence
from sklearn.ensemble.partial_dependence import plot_partial_dependence


def get_some_data():
    """Return imputed predictors and the sale price read from ``train.csv``.

    Returns
    -------
    tuple
        ``(imputed_X, y)``: the ``LotArea``, ``YearBuilt`` and ``GrLivArea``
        columns after ``SimpleImputer().fit_transform``, and the
        ``SalePrice`` column.
    """
    frame = pd.read_csv('train.csv')
    target = frame.SalePrice
    predictors = frame[['LotArea', 'YearBuilt', 'GrLivArea']]
    return SimpleImputer().fit_transform(predictors), target


# get_some_data is defined in hidden cell above.
X, y = get_some_data()

# scikit-learn originally implemented partial dependence plots only for
# Gradient Boosting models. This was due to an implementation detail, and a
# future release will support all model types.
my_model = GradientBoostingRegressor()
# fit the model as usual
my_model.fit(X, y)

# Here we make the plot
columns_to_plot = [0, 2]  # column numbers of plots we want to show
axis_labels = ['LotArea', 'YearBuilt', 'GrLivArea']  # labels on graphs
my_plots = plot_partial_dependence(
    my_model,
    features=columns_to_plot,
    X=X,  # raw predictors data
    feature_names=axis_labels,
    grid_resolution=10,  # number of values to plot on x axis
)
# DataFrame.as_matrix was removed from pandas; selecting the column list and
# taking .values yields the same (n_rows, 1) array.
np_y = train[['Sales']].values

clf = ensemble.GradientBoostingRegressor(n_estimators=1000, max_depth=5,
                                         max_features=5, min_samples_split=6,
                                         min_samples_leaf=6, learning_rate=0.1,
                                         loss='ls')
cross_validate(np_x, np_y, np_weekInd, 10, estimator=clf)
clf.feature_importances_

from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Two one-way plots plus the two-way interaction of columns 0 and 1.
features = [0, 1, (0, 1)]
plot_partial_dependence(clf, np_x, features)

# Prepare the test frame: missing 'Open' values become 1 and the remaining
# store-level columns are filled with constant placeholders.
test = read_test_df()
test.loc[test['Open'].isnull(), 'Open'] = 1
test['Promo2'] = 0
test['StoreType'] = 0
test['Assortment'] = 0
test['CompetitionDistance'] = 0
test['HasCompetitor'] = -1
test['CompetingMonths'] = 0
# Compute each confusion matrix once and reuse it below. Predictions are
# passed as the first argument, so rows are predictions and columns are the
# true outcomes.
cm_rf = confusion_matrix(res_rf, test_outcome)
cm_gb = confusion_matrix(res_gb, test_outcome)
cm_gb
#precision_recall_curve(test_outcome, res_gb)

## empirical misclassification error:
1 - (np.diag(cm_gb).sum())/(cm_gb.sum())
# 0.22595596755504055

## misclassification error per class:
drf = np.diag(cm_rf)
dgb = np.diag(cm_gb)
# Column sums: number of samples per true class.
crf = cm_rf.sum(axis=0)
cgb = cm_gb.sum(axis=0)

# Size the result arrays from fault_types so they always match the loop.
n_faults = len(fault_types)
errors_rf = np.zeros(n_faults)
errors_gb = np.zeros(n_faults)
for i in range(n_faults):
    errors_rf[i] = 1 - (drf[i])/(crf[i])
    errors_gb[i] = 1 - (dgb[i])/(cgb[i])

#partial dependence plots
features = [1, 10, 14, (10, 14)]
fig, axs = plot_partial_dependence(gbfit, trainset, features,
                                   label=gbfit.classes_[0], n_jobs=2,
                                   grid_resolution=50)
fig.suptitle('Partial dependence of X_Maximum, Length_of_Conveyer and Edges_Index for Pastry faults')

####### comments #######
# The better MSE of the GBM is very likely due to better recognition of the
# Z_scratch fault, with a small contribution from Bumps.
# The most frequently mistaken fault types are bumpiness and other faults.
# For other faults this was expected: first because the class is large
# compared to the others, and second because the class 'other' is too broad
# and not well defined, so it likely shares many features with the
# remaining fault types. Plot only this group to see if there are better
# clusters.
'learning_rate': 0.01, 'loss': 'ls' } gbr = GradientBoostingRegressor(**params) gbr.fit(X_train, y_train) pd.crosstab(y_test, gbr.predict(X_test).round(), rownames=['Actual'], colnames=['Predicted']) pd.DataFrame({ 'Variable': X_test.columns, 'Importance': gbr.feature_importances_ }).sort_values('Importance', ascending=False) fig, axs = plot_partial_dependence( gbr, X=X_test, features=['Parhelion Patrol', 'Rubblebelt Boar', 'Hammer Dropper'], feature_names=feature_list, n_jobs=1, grid_resolution=10) allpd = {} for i in range(len(feature_list) - 1): key, values = partial_dependence(gbr, target_variables=i, X=X_test) allpd.update(dict(zip([feature_list[i]], key.tolist()))) df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in allpd.items()]))
# classify 10% of others # Create partial dependence plot on most important features for gbm importances = pd.DataFrame(gbm_grid.best_estimator_.feature_importances_, index = df.columns, columns = ['importance']) importances.sort(columns=['importance'], ascending=False, inplace = True) print importances from sklearn.ensemble.partial_dependence import plot_partial_dependence features = [i for i,j in enumerate(df.columns.tolist()) if j in importances.importance[0:3].index.tolist()] fix, axs = plot_partial_dependence(gbm_grid.best_estimator_, df, features, feature_names = df.columns) ################################ # Read in the testing set and prep it ################################ ## Read in the training dataset df_test = pd.read_csv("C:\\Users\\garauste\\Dropbox\\General Assembly\\Project\\Titanic\\Titanic Data\\test.csv") df_test.head() df_submit = df_test ## Creating a function to pull out the titles of the Passengers def find_between( s, first, last ):
names = cal_housing.feature_names

print('_' * 80)
print("Training GBRT...")
clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                learning_rate=0.1, loss='huber',
                                random_state=1)
clf.fit(X_train, y_train)
print("done.")

print('_' * 80)
print('Convenience plot with ``partial_dependence_plots``')
# A bare ``print`` is a no-op expression on Python 3; print('') emits the
# intended blank line on both Python 2 and 3.
print('')

# Four one-way plots and one two-way plot of columns 5 and 1.
features = [0, 5, 1, 2, (5, 1)]
fig, axs = plot_partial_dependence(clf, X_train, features,
                                   feature_names=names, n_jobs=3,
                                   grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the California housing dataset')
pl.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('_' * 80)
print('Custom 3d plot via ``partial_dependence``')
print('')
fig = pl.figure()

target_feature = (1, 5)
pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature, X=X_train,
                                           grid_resolution=50)
XX, YY = np.meshgrid(x_axis, y_axis)
# Reshape the flat partial dependence values onto the meshgrid layout.
Z = pdp.T.reshape(XX.shape).T
ax = Axes3D(fig)
params = [100,500,1000,1500,650,700,750] for max_leaf_nodes in params: mea = getmea(max_leaf_nodes,train_x,val_x,train_y,val_y) #test_scores.append(np.mean(mea))) print("Max_leaf_nodes: %d ,mea: %d" %(max_leaf_nodes,mea)) plt.plot(params, test_scores) plt.title("max_leaf_nodes Error" + str(params)); plt.show() ''' my_model = GradientBoostingRegressor(n_estimators=10) my_model.fit(X, y) my_plots = plot_partial_dependence(my_model, features=[0,1,2], X=X, feature_names=cols_to_use, grid_resolution=20) plt.show() #melbourne_predictors = ['Rooms','Bathroom','Landsize','BuildingArea','YearBuilt','Lattitude','Longtitude'] #X = melbourne_data[melbourne_predictors] # split data into train and validation # how to know test_size and random_state? #train_x,val_x,train_y,val_y = train_test_split(X,y,test_size=0.25,random_state = 0) # find max_leaf_nodes, then get 400 ''' def getmea(max_leaf_nodes,mea_train_x,mea_test_x,mea_train_y,mea_test_y): model = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes,random_state = 0) model.fit(mea_train_x,mea_train_y)
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Names of the model's input columns, in column order.
selected_features = np.array([
    'body_length', 'fb_published', 'gts', 'has_analytics', 'has_header',
    'has_logo', 'name_length', 'num_order', 'num_payouts', 'org_facebook',
    'org_twitter', 'sale_duration', 'sale_duration2', 'show_map', 'user_age',
    'min_price', 'max_price', 'mean_price', 'total_revenue',
    'total_tix_sold', 'total_tix_offered', 'A', 'C', 'E', 'G', 'M', 'N', 'U',
    'num_caps_freq', 'email_.com', 'email_.gov', 'email_.org',
    'email_.other', 'delivery_method_0.0', 'delivery_method_1.0',
    'delivery_method_3.0', 'delivery_method_nan', 'state_GREATER_LONDON',
    'state_FL', 'state_LONDON', 'state_GT_LON', 'state_DE',
    'state_BIRMINGHAM', 'state_PA', 'state_NV', 'state_NH', 'state_GA',
    'state_ENGLAND', 'country_US', 'country_IE', 'country_FR', 'country_CA',
    'country_GB', 'country_AU', 'country_ES', 'country_NL', 'country_DE',
    'country_VN', 'country_NZ', 'country_PK', 'country_MA', 'country_A1',
    'country_other', 'previous_payout',
])

# Look up the column positions of the two features to plot.
features = [
    np.where(selected_features == wanted)[0][0]
    for wanted in ('total_tix_sold', 'mean_price')
]

fig, axs = plot_partial_dependence(
    gb4000_clf,
    X_train,
    features,
    feature_names=selected_features,
    n_jobs=-1,
    grid_resolution=100,
)
fig.suptitle('Partial dependence of fraud detection features')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
#fig.tight_layout()
plt.show()
# from sklearn.ensemble.partial_dependence import partial_dependence
# from sklearn.ensemble import GradientBoostingClassifier
# pdp, axes = partial_dependence(clf, [0], X=X)
# clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X, y)
# print pdp
# print axes
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Synthetic data set, seeded for reproducibility.
X, y = make_hastie_10_2(random_state=0)

# Boosted decision stumps (max_depth=1).
booster = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf = booster.fit(X, y)

# One-way plots for columns 0 and 1 plus their two-way interaction.
features = [0, 1, (0, 1)]
fig, axs = plot_partial_dependence(clf, X, features)
X_test, y_test = X[offset:], y[offset:] params = {'n_estimators': 500, 'learning_rate': 0.08, 'max_depth': 3, 'min_samples_leaf': 1} clf = GradientBoostingRegressor(**params) print 'Training...' clf.fit(X_train, y_train) mse = mean_squared_error(y_test, clf.predict(X_test)) print("RMSE: %.4f" % np.sqrt(mse)) clf_full_data = joblib.load('model/model.pkl') print 'Generating graphs - partial dependance...' for idx, x in enumerate(features): fig, axs = partial_dependence.plot_partial_dependence(clf_full_data, X, [features[idx]], feature_names=list(features)) fig.savefig('graphs/_%s.png' %x.lower().replace(' ', '_')) ############################################################################### # Plot training deviance # compute test set deviance test_score = np.zeros((params['n_estimators'],), dtype=np.int64) for i, y_pred in enumerate(clf.staged_predict(X_test)): test_score[i] = clf.loss_(y_test, y_pred) deviance_plot = plt deviance_plot.figure(figsize=(12, 6)) deviance_plot.title('Deviance') deviance_plot.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
df_for_reg.fillna(0, inplace=True)
X = df_for_reg[num_predictor]
y = df_for_reg['domestic_gross']

# Ordinary least squares fit, with residuals and fitted values plotted
# against the observed target.
X_mat = sm.add_constant(X)
linmodel = sm.OLS(y, X_mat).fit()
print(linmodel.summary())
plt.scatter(y, linmodel.resid)
plt.scatter(y, linmodel.fittedvalues)

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence

clf = GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.1)
clf.fit(X=df_for_reg[num_predictor], y=df_for_reg['domestic_gross'])
# feature_names must be a sequence with one entry per column. A bare string
# would be indexed character by character and label the axis with a single
# letter.
fig, axs = plot_partial_dependence(clf, df_for_reg[num_predictor], [0],
                                   feature_names=list(num_predictor),
                                   grid_resolution=50)

# Scatter every predictor against the target, one figure at a time.
for item in num_predictor:
    print(item)
    plt.scatter(df_for_reg[item], df_for_reg['domestic_gross'])
    plt.show()
from gradientboost import GradientBoost
from sklearn.ensemble.partial_dependence import plot_partial_dependence, partial_dependence
import pandas as pd
import pickle
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # NOTE(review): pickle.load can execute arbitrary code; only load model
    # files from a trusted source.
    with open('modelg2.p', 'rb') as f:
        model = pickle.load(f)

    # Every column except the target is a model feature.
    feature_names = list(model.data.columns)
    feature_names.remove('fraud')

    # Merge conflict resolved in favour of the HEAD side (explicit feature
    # subset with a title). The other side plotted every feature and built
    # an unused ``features`` list.
    features = ['event_delay', 'name_length', 'user_created',
                'venue_address', 'avg_price', 'num_payouts']
    fig, axs = plot_partial_dependence(model.m, model.X_train, features,
                                       feature_names=feature_names)
    # Figure has no set_title(); suptitle() is the figure-level title.
    fig.suptitle('Partial Dependency Plots')
test = pd.read_csv('pickle_cellar/test_data.csv')
# DataFrame.as_matrix was removed from pandas; .values is the equivalent.
np_test_x = test.values
test_y_hat = clf.predict(np_test_x)

# Submission ids are 1-based row numbers.
ind = range(1, test_y_hat.shape[0] + 1)
# Materialise the pairs: on Python 3 zip() is a one-shot iterator, which
# older pandas versions refuse as DataFrame data.
result = list(zip(ind, test_y_hat))
submission = pd.DataFrame(result, columns=["Id","Sales"])
submission.to_csv('submissions/gb_storeid_dow_model.csv', index=False)

from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence

# Two-way plot for every ordered pair of distinct columns among the first 16.
for i in range(0, 16):
    for t in range(0, 16):
        if i != t:
            fig, axs = plot_partial_dependence(clf, np_x, [(i,t)],
                                               feature_names=train_x.columns,
                                               n_jobs=-1, grid_resolution=20)

features = [3, 14, (3, 14)]
fig, axs = plot_partial_dependence(clf, X_train, features,
                                   feature_names=train_x.columns,
                                   n_jobs=-1, grid_resolution=20)

from itertools import combinations

# Same plots again, this time once per unordered pair.
aa = combinations(range(0, 16), 2)
for i,t in aa:
    fig, axs = plot_partial_dependence(clf, np_x, [(i,t)],
                                       feature_names=train_x.columns,
                                       n_jobs=-1, grid_resolution=20)

pred = clf.predict(X_test)
# Using gradient boost regressor to plot partial dependence plot
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.impute import SimpleImputer

# The PDP will show us the relationship between the target and its features
features = ['Distance', 'Landsize', 'BuildingArea']

data = pd.read_csv('../data/melb_data.csv')
y = data.Price

# Fill missing predictor values before fitting.
X = SimpleImputer().fit_transform(data[features])

model = GradientBoostingRegressor()
model.fit(X, y)

# One plot per predictor column, labelled with the column names.
fig, plots = plot_partial_dependence(
    model,
    features=[0, 1, 2],
    X=X,
    feature_names=features,
    grid_resolution=40,
)
fig.show()

# Block so the figure window stays open until the user is done.
input('Press enter to continue...')
print predictors[i], temp_pd[0].shape if temp_pd[0].shape[1] == gr: temp_output = numpy.empty(gr,dtype=[('model', '|S255'), ('n_split', 'i1'), ('lr','f4'),('n_tree','i4'),('pred','|S255'),('pdp_x','f4'),('pdp_y','f4')]) temp_output['model'] = shelf_file temp_output['n_split'] = tree_depth temp_output['lr'] = learning_rate temp_output['n_tree'] = n_trees temp_output['pred'] = predictors[i] temp_output['pdp_x'] = temp_pd[0] temp_output['pdp_y'] = numpy.array(temp_pd[1]) numpy.savetxt(fname=pd_table,X=temp_output,delimiter=',',fmt=['%s','%d','%0.4f','%d','%s','%0.4f','%0.4f']) #fig, axs = plot_partial_dependence(clf, X_train, [i], grid_resolution=30) #, feature_names=predictors[i]) #, n_jobs=32, grid_resolution=50) fig, axs = plot_partial_dependence(clf, X_temp, [i], grid_resolution=gr) #, feature_names=predictors[i]) #, n_jobs=32, grid_resolution=50) # set x and y axis limits if prd=="eco_l1": xmin = numpy.nanmin(X_temp[:,i]) xmax = numpy.nanmax(X_temp[:,i]) else: xmin = numpy.nanpercentile(a=X_temp[:,i], q=2.5) xmax = numpy.nanpercentile(a=X_temp[:,i], q=97.5) plt.xlim( (xmin, xmax) ) print predictors[i], xmin, numpy.nanmean(X_temp[:,i]), xmax plt.xlabel(predictors[i]) plt.savefig( pd_dir + predictors[i] + ".png", dpi=100) plt.close(fig)
def gradientBoosting(): num_estimadores = 350 clf = ensemble.GradientBoostingRegressor(n_estimators=num_estimadores, max_depth=2, learning_rate=0.1, loss='ls', subsample=0.5) importancias = [0,0,0,0,0,0,0,0,0,0,0,0,0] mae, mse, mr2, cont = 0, 0, 0, 0 test_score = np.zeros((num_estimadores,), dtype=np.float64) train_score = np.zeros((num_estimadores,), dtype=np.float64) mseVector = [0] kf = KFold(len(boston_Y), n_folds=10, indices=True) for train, test in kf: trainX, testX, trainY, testY=boston_X[train], boston_X[test], boston_Y[train], boston_Y[test] clf.fit(trainX, trainY) pred = clf.predict(testX) maeGradient = metrics.mean_absolute_error(testY, pred) mseGradient = metrics.mean_squared_error(testY, pred) r2 = metrics.r2_score(testY, pred) mae = mae + maeGradient mse = mse + mseGradient mr2 = mr2 + r2 mseVector.append(mseGradient) cont = cont + 1 for i, y_pred in enumerate(clf.staged_decision_function(testX)): test_score[i] = test_score[i] + clf.loss_(testY, y_pred) for i in range(num_estimadores): train_score[i] = clf.train_score_[i] + train_score[i] feature_importance = clf.feature_importances_ feature_importance = 100.0 * (feature_importance / feature_importance.max()) for i in range(13): importancias[i] = importancias[i] + feature_importance[i] print str("Iteracción ")+str(cont)+str(" de la validacion cruzada") print str("\tError medio absoluto: ")+str(maeGradient) print str("\tError medio cuadrado: ")+str(mseGradient) print str("\tr2: ")+str(r2) #Dibuja los puntos que predice sobre los puntos verdaderos pl.plot(testY, testY, label='Valor verdadero') pl.plot(testY, pred, 'ro', label='Prediccion Gradient') pl.legend(bbox_to_anchor=(1.05, 1), borderaxespad=0., prop = FontProperties(size='smaller')) pl.show() print mseVector mae = mae/10 mse = mse/10 mr2 = mr2/10 print str("Error medio absoluto: ")+str(mae)+str("\tError medio cuadratico: ")+str(mse)+str("\tR2: ")+str(mr2) for i in range(13): importancias[i] = importancias[i]/10 sorted_idx = 
np.argsort(importancias) pos = np.arange(sorted_idx.shape[0]) + .5 importancias = np.reshape(importancias, (len(importancias), -1)) boston = datasets.load_boston() pl.barh(pos, importancias[sorted_idx], align='center') pl.yticks(pos, boston.feature_names[sorted_idx]) pl.xlabel('Importancia relativa') pl.show() for i in range(num_estimadores): test_score[i] = test_score[i]/10 train_score[i] = train_score[i]/10 pl.figure(figsize=(12, 6)) pl.subplot(1, 1, 1) pl.title('Desviacion') pl.plot(np.arange(num_estimadores) + 1, train_score, 'b-', label='Error en el conjunto de Training') pl.plot(np.arange(num_estimadores) + 1, test_score, 'r-', label='Error en el conjunto de Test') pl.legend(loc='upper right') pl.xlabel('Iteracciones del Boosting (numero de arboles)') pl.ylabel('Desviacion') pl.show() print len(mseVector) print len(np.arange(10)) pl.subplot(1, 1, 1) pl.plot(np.arange(11), mseVector, 'b-') pl.legend(loc='upper right') pl.xlabel('Iteraccion de la validacion cruzada') pl.ylabel('Erro Medio Cuadratico') pl.show() fig, axs = plot_partial_dependence(clf, trainX,[0,1,2,3,4,5,6,7,8,9,10,11,12]) fig.suptitle('Dependencia parcial del valor de las casas') pl.subplots_adjust(top=0.9) pl.show()
expected = y[size:]
predicted = regressor.predict(x_norm[size:])

# Correlation between what was predicted and what actually happened.
pearson = pearsonr(expected, predicted)[0]
print("Pearson coefficient: %s" % str(pearson))

# The value printed is the square root of the mean squared error, so it is
# labelled RMSE rather than MSE.
print("RMSE : %s" % np.sqrt(mean_squared_error(expected, predicted)))

### feature importance
feature_importance = regressor.feature_importances_
# Scale so the most important feature scores 100.
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)

## relative importance
# plot relative importance
plt.barh(np.arange(len(x_norm.columns)), regressor.feature_importances_[sorted_idx])
plt.yticks(np.arange(len(x_norm.columns)) + 0.25, np.array(x_norm.columns)[sorted_idx])
_ = plt.xlabel('Importance relative')
plt.savefig("relative_importance.png", bbox_inches='tight')

# shell info
print("most important features:")
# argsort is ascending; walk it backwards so rank 1 is the most important.
# NOTE(review): the per-feature plot is assumed to belong inside this loop
# because it uses the loop variable -- confirm.
ranked_idx = sorted_idx[::-1]
column_names = list(x_norm.columns)
for i, (f, w) in enumerate(zip(x_norm.columns[ranked_idx],
                               feature_importance[ranked_idx]), 1):
    print("%d) %s : %d" % (i, f, w))

    # plot partial dependence / feature
    features = [f]
    # A plain list is passed because features are given by name, and the
    # name lookup needs a list-like with .index().
    fig, axs = plot_partial_dependence(regressor, x_norm, features,
                                       feature_names=column_names,
                                       figsize=(8, 6))
    name = f + "_partial_dependence.png"
    fig.savefig(name, bbox_inches='tight')
    # Release the figure once saved so one does not stay open per feature.
    plt.close(fig)
# Using the gradient boosting regressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence

regr = DecisionTreeRegressor(max_depth=4)
clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                learning_rate=0.1, loss='huber',
                                random_state=1)
clf.fit(X, y)
clf.feature_importances_

# feature_names needs one entry per column. The plotted features reach
# column 3, so the two given names are padded with generated ones instead of
# running off the end of the list.
feature_names = ['A', 'B'] + ['x%d' % col for col in range(2, X.shape[1])]
fig, axs = plot_partial_dependence(clf, X, [0, 1, (1, 2), (2, 3)],
                                   feature_names=feature_names,
                                   n_jobs=3, grid_resolution=50)

# Doing cross validation
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits=5, test_size=0.2, random_state=2)
# All splits are materialised up front, so rebinding X_train/y_train inside
# the loop below does not affect later folds.
list_train_test = [(X_train[train_index], X_train[test_index],
                    y_train[train_index], y_train[test_index])
                   for (train_index, test_index) in ss.split(X_train)]
for X_train, X_test, y_train, y_test in list_train_test:
    linEx = []
    clf.fit(X_train, y_train)
    # NOTE(review): linEx is a list here, so this call raises TypeError. The
    # intended metric is not visible in this snippet -- replace with the
    # real scoring function.
    print(linEx(y_test, clf.predict(X_test)))

# Export graphiz
plt.ylabel('True Positive Rate (Sensitivity)')

## what does this tell us for this sample?
# this tells us that random forest is the best model

## create partial dependence plot on most important features for gbm.
importances = pandas.DataFrame(gbm_grid.best_estimator_.feature_importances_,
                               index=explanatory_df.columns,
                               columns=['importance'])
# DataFrame.sort(columns=...) was removed from pandas; sort_values is the
# replacement.
importances.sort_values(by='importance', ascending=False, inplace=True)
print(importances)

from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Column positions of the three most important features.
top_names = importances.importance[0:3].index.tolist()
features = [i for i, j in enumerate(explanatory_df.columns.tolist())
            if j in top_names]
fig, axs = plot_partial_dependence(gbm_grid.best_estimator_, explanatory_df,
                                   features,
                                   feature_names=list(explanatory_df.columns))

#                 importance
#totalruns          0.156319
#shutouts           0.085167
#errors             0.077250
#teamID_Nothing     0.071030
#totalRBI           0.064376
#earnedruns         0.061927
#stolenbases        0.049815
#atbats             0.045446
#totalhomeruns      0.044935
#totalgames         0.042369
#timewithouts       0.036490
#doubleplays        0.036110
#totalhits          0.036078
# look at partial dependence plot on most important features for gbm
importances = pandas.DataFrame(gbm_grid.best_estimator_.feature_importances_,
                               index=explanatory_df.columns,
                               columns=['importance'])
# DataFrame.sort(columns=...) was removed from pandas; sort_values is the
# replacement.
importances.sort_values(by='importance', ascending=False, inplace=True)
print(importances)
# does not necessarily say whether it is a positive or negative importance

from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Match the names of the three most important features back to their column
# positions: i is the position in the column list, j is the feature name.
top_names = importances.importance[0:3].index.tolist()
features = [i for i, j in enumerate(explanatory_df.columns.tolist())
            if j in top_names]
fig, axs = plot_partial_dependence(gbm_grid.best_estimator_, explanatory_df,
                                   features,
                                   feature_names=list(explanatory_df.columns))

# compare the mean ROC AUC
print("Neural Networks Mean ROC AUC %f" % roc_scores_nn.mean())
print("Boosting Tree Mean ROC AUC %f" % roc_scores_gbm.mean())
print("Random Forest Mean ROC AUC %f" % roc_scores_rf.mean())
print("Decision Tree Mean ROC AUC %f" % roc_score_tree.mean())
def dependence(self, forest, train, feature_set):
    """Show partial dependence plots of ``forest`` for ``feature_set``.

    Parameters
    ----------
    forest
        Fitted estimator passed to ``plot_partial_dependence``.
    train
        Data the partial dependence is evaluated on.
    feature_set
        Features to plot, forwarded as ``features``.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("******************this is the output of dependences of features")
    fig, axs = plot_partial_dependence(forest, train, features=feature_set)
    plt.show()