# Notebook-style cells: bare expressions below rely on the notebook displaying their value.

# Fit the baseline logistic regression and evaluate on the hold-out set
lr1.fit(X_train, y_train)
y_lr1 = lr1.predict(X_test)
params  # `params` refers to a variable defined in an earlier cell (not shown here)

lr1.score(X_train, y_train)
lr1.score(X_test, y_test)
generate_model_report(y_test, y_lr1)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_lr1,
                            labels=['Low_Damage', 'Medium_Damage', 'High_Damage']))

from yellowbrick.classifier.rocauc import roc_auc
roc_auc(lr1, X_train, y_train, X_test=X_test, y_test=y_test,
        classes=["Low_damage", "Medium_damage", "High_damage"])

# Multinomial logistic regression with a weaker regularization penalty
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0, multi_class="multinomial",
                        solver="newton-cg", C=1000)
lr.get_params()

lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)
generate_model_report(y_test, y_lr)

y_prob = lr.predict_proba(X_test)
y_prob
lr.score(X_train, y_train)
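# `generate_model_report` is used above but defined elsewhere in this project.
# Purely as an illustration of what such a helper could look like (an assumption,
# not the project's actual implementation), a minimal version might print the
# headline classification metrics:
def generate_model_report_sketch(y_actual, y_predicted):
    """Hypothetical stand-in for the project's generate_model_report helper."""
    from sklearn import metrics
    print("Accuracy  :", metrics.accuracy_score(y_actual, y_predicted))
    print("Precision :", metrics.precision_score(y_actual, y_predicted, average='weighted'))
    print("Recall    :", metrics.recall_score(y_actual, y_predicted, average='weighted'))
    print("F1 score  :", metrics.f1_score(y_actual, y_predicted, average='weighted'))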
def model_report(model, X_train, y_train, X_test, y_test,
                 cmap=['Reds', 'Greens'], normalize='true', figsize=(16, 6),
                 show_train_report=False, unfitted_model=True):
    """
    Report of model performance using a train-test split dataset.
    Shows train and test scores, the confusion matrix and the ROC curve on the test data.
    Intended to work ONLY on models where the target is a properly encoded binary class.

    Parameters:
    ===========
    model = object, scikit-learn model object; no default.
    X_train = pandas.DataFrame, predictor variable training data split; no default.
    y_train = pandas.DataFrame, target variable training data split; no default.
    X_test = pandas.DataFrame, predictor variable test data split; no default.
    y_test = pandas.DataFrame, target variable test data split; no default.
    cmap = list of str, colormaps of the confusion matrices; default: ['Reds', 'Greens'],
        cmaps of the train and test plots respectively.
    normalize = str or None, normalize counts of the confusion matrix; default: 'true'.
        - 'true' to normalize counts over the true labels.
        - None to show raw counts.
    figsize = tuple ``(width, height)``, figsize of output; default: (16, 6).
    show_train_report = bool; default: False.
        - True, to show the report on the training data.
        - False, to turn that report off.
    unfitted_model = bool; default: True.
        - if True, fits the model to the train data and generates the report.
        - if False, does not fit the model and generates the report.
          Use False for a previously fitted model.
    ---version 0.9.15---
    """
    def str_model_(model):
        """Helper function to get the model class display statement; this text
        conversion breaks code if performed in ``model_report``'s local space,
        so it is isolated here."""
        str_model = str(model.__class__).split('.')[-1][:-2]
        display(
            HTML(
                f"""<strong>Report of {str_model} type model using train-test split dataset.</strong>"""
            ))

    str_model_(model)
    X_train = X_train.copy()
    y_train = y_train.copy()

    if unfitted_model:
        model.fit(X_train, y_train)

    print(f"{'*'*90}")
    train = model.score(X_train, y_train)
    test = model.score(X_test, y_test)
    print(f"""Train accuracy score: {train.round(4)}""")
    print(f"""Test accuracy score: {test.round(4)}""")
    if abs(train - test) <= .05:
        print(
            f" No over- or underfitting detected, difference of scores did not cross the 5% threshold."
        )
    elif (train - test) > .05:
        print(
            f" Possible overfitting, difference of scores {round(abs(train-test)*100,2)}% crossed the 5% threshold."
        )
    elif (train - test) < -.05:
        print(
            f" Possible underfitting, difference of scores {round(abs(train-test)*100,2)}% crossed the 5% threshold."
        )
    print(f"{'*'*90}")
    print("")
    print(f"{'*'*60}")

    if show_train_report:
        print(f"""Classification report on train data of: {model}""")
        print(f"{'-'*60}")
        print(metrics.classification_report(y_train, model.predict(X_train)))
        print(f"{'*'*60}")
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=figsize)
        metrics.plot_confusion_matrix(model, X_train, y_train, cmap=cmap[0],
                                      normalize=normalize, ax=ax1)
        ax1.title.set_text('Confusion Matrix')
        _ = roc_auc(model, X_train, y_train, classes=None, is_fitted=True,
                    show=False, ax=ax2)
        # ax[1].plot([0, 1], [0, 1], ls='-.', color='white')
        ax2.grid()
        ax2.title.set_text('ROC Curve')
        plt.tight_layout()
        plt.show()
        print(f"{'='*170}")

    print(f"{'*'*60}")
    print(f"""Classification report on test data of: {model}""")
    print(f"{'-'*60}")
    print(metrics.classification_report(y_test, model.predict(X_test)))
    print(f"{'*'*60}")
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=figsize)
    metrics.plot_confusion_matrix(model, X_test, y_test, cmap=cmap[1],
                                  normalize=normalize, ax=ax1)
    ax1.title.set_text('Confusion Matrix')
    _ = roc_auc(model, X_test, y_test, classes=None, is_fitted=True,
                show=False, ax=ax2)
    ax2.grid()
    ax2.title.set_text('ROC Curve')
    plt.tight_layout()
    plt.show()
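# Illustrative call of model_report (an assumption about intended usage, not taken
# from the original notebook). Per its docstring the helper expects a binary-encoded
# target, so this sketch builds a small synthetic binary split rather than reusing the
# multi-class damage data above.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_bin, y_bin = make_classification(n_samples=500, n_features=10, random_state=0)
X_tr_bin, X_te_bin, y_tr_bin, y_te_bin = train_test_split(X_bin, y_bin,
                                                          test_size=0.25,
                                                          random_state=0)
model_report(LogisticRegression(random_state=0, max_iter=1000),
             pd.DataFrame(X_tr_bin), pd.Series(y_tr_bin),
             pd.DataFrame(X_te_bin), pd.Series(y_te_bin),
             show_train_report=True, unfitted_model=True)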
def score_model_outcome(X_train, y_train, X_test, y_test, model, **kwargs):
    """
    Fit the given model and report accuracy, AUC and F1 on the train and test sets,
    together with a grid of diagnostic plots. This function is for the prognosis
    (Death vs Survival) task.

    Parameters
    ----------
    X_train: matrix of training features
    y_train: vector of training labels
    X_test: matrix of test features
    y_test: vector of test labels
    model: scikit-learn estimator to fit and evaluate

    Returns
    -------
    - Accuracy, F1 score and ROC AUC for the train and test sets
    - ConfusionMatrix
    - ClassificationReport
    - PrecisionRecallCurve
    - ClassPredictionError
    """
    # Train the model
    model.fit(X_train, y_train, **kwargs)

    # Predict on the train set
    prediction_train = model.predict(X_train)

    # Compute metrics for the train set
    accuracy_train = accuracy_score(y_train, prediction_train)

    # False Positive Rate, True Positive Rate, Threshold
    fpr_train, tpr_train, thresholds_train = roc_curve(y_train, prediction_train)
    auc_train = auc(fpr_train, tpr_train)
    f1_score_train = f1_score(y_train, prediction_train)

    # Predict on the test set and compute the same metrics
    prediction_test = model.predict(X_test)
    accuracy_test = accuracy_score(y_test, prediction_test)
    fpr_test, tpr_test, thresholds_test = roc_curve(y_test, prediction_test)
    auc_test = auc(fpr_test, tpr_test)
    f1_score_test = f1_score(y_test, prediction_test)

    print("{}:".format(model.__class__.__name__))

    # F1 is the harmonic mean of precision and recall
    print("On training we get an Accuracy {}, an AUC {} and F1 score {}".format(
        accuracy_train, auc_train, f1_score_train))
    print("For test we get an Accuracy {}, an AUC {} and F1 score {}".format(
        accuracy_test, auc_test, f1_score_test))

    fig, axes = plt.subplots(3, 2, figsize=(20, 20))
    visualgrid = [
        ConfusionMatrix(model, ax=axes[0][0], classes=['Death', 'Survival'],
                        cmap="YlGnBu"),
        ClassificationReport(model, ax=axes[0][1], classes=['Death', 'Survival'],
                             cmap="YlGn"),
        PrecisionRecallCurve(model, ax=axes[1][0]),
        ClassPredictionError(model, classes=['Death', 'Survival'], ax=axes[1][1]),
    ]
    for viz in visualgrid:
        viz.fit(X_train, y_train)
        viz.score(X_test, y_test)
        viz.finalize()

    try:
        roc_auc(model, X_train, y_train, X_test=X_test, y_test=y_test,
                classes=['Death', 'Survival'], ax=axes[2][0])
    except Exception:
        print('Cannot plot ROC curve for this model')

    try:
        viz = FeatureImportances(model, ax=axes[2][1], stack=True, relative=False)
        viz.fit(X_train, y_train)
        viz.score(X_test, y_test)
        viz.finalize()
    except Exception:
        print('Model does not expose feature importances')

    plt.show()
    print('\n')
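# Illustrative usage of score_model_outcome (an assumption, not from the original
# code): compare two out-of-the-box classifiers on the same split, assuming
# X_train/y_train/X_test/y_test hold the binary Death-vs-Survival prognosis data.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

for candidate in (LogisticRegression(max_iter=1000),
                  RandomForestClassifier(random_state=0)):
    score_model_outcome(X_train, y_train, X_test, y_test, candidate)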
def visualize_features(classes, problem_type, curdir, default_features,
                       balance_data, test_size):

    # featurize the data and label-encode the classes
    features, feature_labels, class_labels = get_features(classes, problem_type,
                                                          default_features,
                                                          balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)
    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # standardize features to help with clustering
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, tclass_labels,
                                                        test_size=test_size,
                                                        random_state=42)
    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    ##################################
    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    #     csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    #     print('loading training files...')
    #     X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    #     y_train=X_train['class_']
    #     X_train.drop(['class_'], axis=1)
    #     X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    #     y_test=X_test['class_']
    #     X_test.drop(['class_'], axis=1)
    #     y_train=le.inverse_transform(y_train)
    #     y_test=le.inverse_transform(y_test)
    # except:
    #     print('error loading in training files, making new test data')

    ##################################
    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except:
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = list()
    for i in range(len(objects)):
        performance.append(class_labels.count(objects[i]))

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory
    curdir = os.getcwd()

    ##################################
    # CLUSTERING!!!
    ##################################

    ##################################
    # Manifold type options
    ##################################
    '''
    "lle"       Locally Linear Embedding (LLE) uses many local linear decompositions
                to preserve globally non-linear structures.
    "ltsa"      LTSA LLE: local tangent space alignment is similar to LLE in that it
                uses locality to preserve neighborhood distances.
    "hessian"   Hessian LLE: an LLE regularization method that applies a hessian-based
                quadratic form at each neighborhood.
    "modified"  Modified LLE applies a regularization parameter to LLE.
    "isomap"    Isomap seeks a lower-dimensional embedding that maintains geometric
                distances between each instance.
    "mds"       MDS: multi-dimensional scaling uses similarity to plot points that are
                near to each other close in the embedding.
    "spectral"  Spectral Embedding: a discrete approximation of the low-dimensional
                manifold using a graph representation.
    "tsne"      (default) t-SNE: converts the similarity of points into probabilities,
                then uses those probabilities to create an embedding.
    '''

    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # spectral
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # UMAP embedding
    plt.figure()
    umap = UMAPVisualizer(metric='cosine', classes=set(classes),
                          title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig = umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # FEATURE RANKING!!
    #################################
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature_importances_ property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_, index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    plt.title('Feature importances (ExtraTrees) with %s features' % (str(len(features[0]))),
              size=16)
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    new_features, new_labels = restructure_features(selectedlabels, t_features,
                                                    feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels, features,
                                                      feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro', classes=set(classes), features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    plt.title('Shapiro plot (top 20 features)', size=16)
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # Pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson', classes=set(classes), features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    plt.title('Pearson ranking plot (top 20 features)', size=16)
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf
    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e /
    #     https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb
    #     - plotly for correlation heatmap and scatterplot matrix
    # --> https://seaborn.pydata.org/tutorial/distributions.html
    data = new_features
    corr = data.corr()
    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    # drop one of each pair of features whose correlation is >= 0.9
    columns = np.full((corr.shape[0], ), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False

    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()
    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features), tclass_labels,
                                     labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    plt.tight_layout()
    plt.close()

    # per-feature violin plots (raw features)
    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    newdata = new_features_
    newdata['classes'] = class_labels

    for j in range(len(new_labels_)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels_[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels_[j]))
        plt.close(fig)

    # per-feature violin plots (standardized features)
    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    newdata = new_features
    newdata['classes'] = class_labels

    for j in range(len(new_labels)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels[j]))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################
    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features), tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features), tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    plt.figure()
    visualizer = residuals_plot(Ridge(), np.array(features), tclass_labels,
                                train_color="maroon", test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(np.array(features), tclass_labels,
                                draw_threshold=True, linefmt="C0-", markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='siloutte.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777), np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # plot accuracy against the percentile of features kept (ANOVA selection feeding
    # a logistic regression) to see which feature percentile is optimal
    features = preprocessing.MinMaxScaler().fit_transform(features)
    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])
    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the LogisticRegression-Anova varying the percent features selected'
    )
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()
    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans, X_train, cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    ## plot calibration curve
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100,
                                    learning_rate=1.0,
                                    max_depth=1,
                                    random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    probas_list = [
        rf_probas, lr_probas, nb_probas,
        # svm_scores,
        dt_scores, ab_scores, gb_scores, knn_scores
    ]
    clf_names = [
        'Random Forest', 'Logistic Regression', 'Gaussian NB',
        # 'SVM',
        'Decision Tree', 'Adaboost', 'Gradient Boost', 'KNN'
    ]
    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    plt.savefig('calibration.png')
    plt.tight_layout()
    plt.close()

    # pick classifier type by ROC (without optimization)
    probs = [
        rf_probas[:, 1], lr_probas[:, 1], nb_probas[:, 1],
        # svm_scores[:, 1],
        dt_scores[:, 1], ab_scores[:, 1], gb_scores[:, 1], knn_scores[:, 1]
    ]
    plot_roc_curve(y_test, probs, clf_names)

    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    os.chdir(curdir)

    return ''
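# Illustrative invocation of visualize_features (an assumption about how the function
# is meant to be driven; the real class names, feature set name and directory come
# from the surrounding project, so the argument values below are placeholders only).
if __name__ == '__main__':
    visualize_features(classes=['class_a', 'class_b'],
                       problem_type='classification',
                       curdir=os.getcwd(),
                       default_features='standard_features',
                       balance_data=True,
                       test_size=0.25)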