# Notebook-style cells: bare expressions below rely on the notebook displaying their value.

# Fit the baseline logistic regression and evaluate on the hold-out set
lr1.fit(X_train, y_train)
y_lr1 = lr1.predict(X_test)
params  # `params` refers to a variable defined in an earlier cell (not shown here)

lr1.score(X_train, y_train)
lr1.score(X_test, y_test)
generate_model_report(y_test, y_lr1)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_lr1,
                            labels=['Low_Damage', 'Medium_Damage', 'High_Damage']))

from yellowbrick.classifier.rocauc import roc_auc
roc_auc(lr1, X_train, y_train, X_test=X_test, y_test=y_test,
        classes=["Low_damage", "Medium_damage", "High_damage"])

# Multinomial logistic regression with a weaker regularization penalty
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=0, multi_class="multinomial",
                        solver="newton-cg", C=1000)
lr.get_params()

lr.fit(X_train, y_train)
y_lr = lr.predict(X_test)
generate_model_report(y_test, y_lr)

y_prob = lr.predict_proba(X_test)
y_prob
lr.score(X_train, y_train)
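# `generate_model_report` is used above but defined elsewhere in this project.
# Purely as an illustration of what such a helper could look like (an assumption,
# not the project's actual implementation), a minimal version might print the
# headline classification metrics:
def generate_model_report_sketch(y_actual, y_predicted):
    """Hypothetical stand-in for the project's generate_model_report helper."""
    from sklearn import metrics
    print("Accuracy  :", metrics.accuracy_score(y_actual, y_predicted))
    print("Precision :", metrics.precision_score(y_actual, y_predicted, average='weighted'))
    print("Recall    :", metrics.recall_score(y_actual, y_predicted, average='weighted'))
    print("F1 score  :", metrics.f1_score(y_actual, y_predicted, average='weighted'))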
def model_report(model, X_train, y_train, X_test, y_test,
                 cmap=['Reds', 'Greens'], normalize='true', figsize=(16, 6),
                 show_train_report=False, unfitted_model=True):
    """
    Report of model performance using a train-test split dataset.
    Shows train and test scores, the confusion matrix and the ROC curve on the test data.
    Intended to work ONLY on models where the target is a properly encoded binary class.

    Parameters:
    ===========
    model = object, scikit-learn model object; no default.
    X_train = pandas.DataFrame, predictor variable training data split; no default.
    y_train = pandas.DataFrame, target variable training data split; no default.
    X_test = pandas.DataFrame, predictor variable test data split; no default.
    y_test = pandas.DataFrame, target variable test data split; no default.
    cmap = list of str, colormaps of the confusion matrices; default: ['Reds', 'Greens'],
        cmaps of the train and test plots respectively.
    normalize = str or None, normalize counts of the confusion matrix; default: 'true'.
        - 'true' to normalize counts over the true labels.
        - None to show raw counts.
    figsize = tuple ``(width, height)``, figsize of output; default: (16, 6).
    show_train_report = bool; default: False.
        - True, to show the report on the training data.
        - False, to turn that report off.
    unfitted_model = bool; default: True.
        - if True, fits the model to the train data and generates the report.
        - if False, does not fit the model and generates the report.
          Use False for a previously fitted model.
    ---version 0.9.15---
    """
    def str_model_(model):
        """Helper function to get the model class display statement; this text
        conversion breaks code if performed in ``model_report``'s local space,
        so it is isolated here."""
        str_model = str(model.__class__).split('.')[-1][:-2]
        display(
            HTML(
                f"""<strong>Report of {str_model} type model using train-test split dataset.</strong>"""
            ))

    str_model_(model)
    X_train = X_train.copy()
    y_train = y_train.copy()

    if unfitted_model:
        model.fit(X_train, y_train)

    print(f"{'*'*90}")
    train = model.score(X_train, y_train)
    test = model.score(X_test, y_test)
    print(f"""Train accuracy score: {train.round(4)}""")
    print(f"""Test accuracy score: {test.round(4)}""")
    if abs(train - test) <= .05:
        print(
            f" No over- or underfitting detected, difference of scores did not cross the 5% threshold."
        )
    elif (train - test) > .05:
        print(
            f" Possible overfitting, difference of scores {round(abs(train-test)*100,2)}% crossed the 5% threshold."
        )
    elif (train - test) < -.05:
        print(
            f" Possible underfitting, difference of scores {round(abs(train-test)*100,2)}% crossed the 5% threshold."
        )
    print(f"{'*'*90}")
    print("")
    print(f"{'*'*60}")

    if show_train_report:
        print(f"""Classification report on train data of: {model}""")
        print(f"{'-'*60}")
        print(metrics.classification_report(y_train, model.predict(X_train)))
        print(f"{'*'*60}")
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=figsize)
        metrics.plot_confusion_matrix(model, X_train, y_train, cmap=cmap[0],
                                      normalize=normalize, ax=ax1)
        ax1.title.set_text('Confusion Matrix')
        _ = roc_auc(model, X_train, y_train, classes=None, is_fitted=True,
                    show=False, ax=ax2)
        # ax[1].plot([0, 1], [0, 1], ls='-.', color='white')
        ax2.grid()
        ax2.title.set_text('ROC Curve')
        plt.tight_layout()
        plt.show()
        print(f"{'='*170}")

    print(f"{'*'*60}")
    print(f"""Classification report on test data of: {model}""")
    print(f"{'-'*60}")
    print(metrics.classification_report(y_test, model.predict(X_test)))
    print(f"{'*'*60}")
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=figsize)
    metrics.plot_confusion_matrix(model, X_test, y_test, cmap=cmap[1],
                                  normalize=normalize, ax=ax1)
    ax1.title.set_text('Confusion Matrix')
    _ = roc_auc(model, X_test, y_test, classes=None, is_fitted=True,
                show=False, ax=ax2)
    ax2.grid()
    ax2.title.set_text('ROC Curve')
    plt.tight_layout()
    plt.show()
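# Illustrative call of model_report (an assumption about intended usage, not taken
# from the original notebook). Per its docstring the helper expects a binary-encoded
# target, so this sketch builds a small synthetic binary split rather than reusing the
# multi-class damage data above.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_bin, y_bin = make_classification(n_samples=500, n_features=10, random_state=0)
X_tr_bin, X_te_bin, y_tr_bin, y_te_bin = train_test_split(X_bin, y_bin,
                                                          test_size=0.25,
                                                          random_state=0)
model_report(LogisticRegression(random_state=0, max_iter=1000),
             pd.DataFrame(X_tr_bin), pd.Series(y_tr_bin),
             pd.DataFrame(X_te_bin), pd.Series(y_te_bin),
             show_train_report=True, unfitted_model=True)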
def score_model_outcome(X_train, y_train, X_test, y_test, model, **kwargs):
    """
    Fit the given model and report accuracy, AUC and F1 on the train and test sets,
    together with a grid of diagnostic plots. This function is for the prognosis
    (Death vs Survival) task.

    Parameters
    ----------
    X_train: matrix of training features
    y_train: vector of training labels
    X_test: matrix of test features
    y_test: vector of test labels
    model: scikit-learn estimator to fit and evaluate

    Returns
    -------
    - Accuracy, F1 score and ROC AUC for the train and test sets
    - ConfusionMatrix
    - ClassificationReport
    - PrecisionRecallCurve
    - ClassPredictionError
    """
    # Train the model
    model.fit(X_train, y_train, **kwargs)

    # Predict on the train set
    prediction_train = model.predict(X_train)

    # Compute metrics for the train set
    accuracy_train = accuracy_score(y_train, prediction_train)

    # False Positive Rate, True Positive Rate, Threshold
    fpr_train, tpr_train, thresholds_train = roc_curve(y_train, prediction_train)
    auc_train = auc(fpr_train, tpr_train)
    f1_score_train = f1_score(y_train, prediction_train)

    # Predict on the test set and compute the same metrics
    prediction_test = model.predict(X_test)
    accuracy_test = accuracy_score(y_test, prediction_test)
    fpr_test, tpr_test, thresholds_test = roc_curve(y_test, prediction_test)
    auc_test = auc(fpr_test, tpr_test)
    f1_score_test = f1_score(y_test, prediction_test)

    print("{}:".format(model.__class__.__name__))

    # F1 is the harmonic mean of precision and recall
    print("On training we get an Accuracy {}, an AUC {} and F1 score {}".format(
        accuracy_train, auc_train, f1_score_train))
    print("For test we get an Accuracy {}, an AUC {} and F1 score {}".format(
        accuracy_test, auc_test, f1_score_test))

    fig, axes = plt.subplots(3, 2, figsize=(20, 20))
    visualgrid = [
        ConfusionMatrix(model, ax=axes[0][0], classes=['Death', 'Survival'],
                        cmap="YlGnBu"),
        ClassificationReport(model, ax=axes[0][1], classes=['Death', 'Survival'],
                             cmap="YlGn"),
        PrecisionRecallCurve(model, ax=axes[1][0]),
        ClassPredictionError(model, classes=['Death', 'Survival'], ax=axes[1][1]),
    ]
    for viz in visualgrid:
        viz.fit(X_train, y_train)
        viz.score(X_test, y_test)
        viz.finalize()

    try:
        roc_auc(model, X_train, y_train, X_test=X_test, y_test=y_test,
                classes=['Death', 'Survival'], ax=axes[2][0])
    except Exception:
        print('Cannot plot ROC curve for this model')

    try:
        viz = FeatureImportances(model, ax=axes[2][1], stack=True, relative=False)
        viz.fit(X_train, y_train)
        viz.score(X_test, y_test)
        viz.finalize()
    except Exception:
        print('Model does not expose feature importances')

    plt.show()
    print('\n')
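# Illustrative usage of score_model_outcome (an assumption, not from the original
# code): compare two out-of-the-box classifiers on the same split, assuming
# X_train/y_train/X_test/y_test hold the binary Death-vs-Survival prognosis data.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

for candidate in (LogisticRegression(max_iter=1000),
                  RandomForestClassifier(random_state=0)):
    score_model_outcome(X_train, y_train, X_test, y_test, candidate)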
def visualize_features(classes, problem_type, curdir, default_features,
                       balance_data, test_size):

    # featurize the data and label-encode the classes
    features, feature_labels, class_labels = get_features(classes, problem_type,
                                                          default_features,
                                                          balance_data)

    # now preprocess features for all the other plots
    os.chdir(curdir)
    le = preprocessing.LabelEncoder()
    le.fit(class_labels)
    tclass_labels = le.transform(class_labels)

    # standardize features to help with clustering
    se = preprocessing.StandardScaler()
    t_features = se.fit_transform(features)

    X_train, X_test, y_train, y_test = train_test_split(features, tclass_labels,
                                                        test_size=test_size,
                                                        random_state=42)
    # print(len(features))
    # print(len(feature_labels))
    # print(len(class_labels))
    # print(class_labels)

    ##################################
    # GET TRAINING DATA DURING MODELING PROCESS
    ##################################
    # get filename
    # csvfile=''
    # print(classes)
    # for i in range(len(classes)):
    #     csvfile=csvfile+classes[i]+'_'

    # get training and testing data for later
    # try:
    #     print('loading training files...')
    #     X_train=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'train.csv')
    #     y_train=X_train['class_']
    #     X_train.drop(['class_'], axis=1)
    #     X_test=pd.read_csv(prev_dir(curdir)+'/models/'+csvfile+'test.csv')
    #     y_test=X_test['class_']
    #     X_test.drop(['class_'], axis=1)
    #     y_train=le.inverse_transform(y_train)
    #     y_test=le.inverse_transform(y_test)
    # except:
    #     print('error loading in training files, making new test data')

    ##################################
    # Visualize each class (quick plot)
    ##################################
    visualization_dir = 'visualization_session'
    try:
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)
    except:
        shutil.rmtree(visualization_dir)
        os.mkdir(visualization_dir)
        os.chdir(visualization_dir)

    objects = tuple(set(class_labels))
    y_pos = np.arange(len(objects))
    performance = list()
    for i in range(len(objects)):
        performance.append(class_labels.count(objects[i]))

    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.xticks(rotation=90)
    plt.title('Counts per class')
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.tight_layout()
    plt.savefig('classes.png')
    plt.close()

    # set current directory
    curdir = os.getcwd()

    ##################################
    # CLUSTERING!!!
    ##################################

    ##################################
    # Manifold type options
    ##################################
    '''
    "lle"       Locally Linear Embedding (LLE) uses many local linear decompositions
                to preserve globally non-linear structures.
    "ltsa"      LTSA LLE: local tangent space alignment is similar to LLE in that it
                uses locality to preserve neighborhood distances.
    "hessian"   Hessian LLE: an LLE regularization method that applies a hessian-based
                quadratic form at each neighborhood.
    "modified"  Modified LLE applies a regularization parameter to LLE.
    "isomap"    Isomap seeks a lower-dimensional embedding that maintains geometric
                distances between each instance.
    "mds"       MDS: multi-dimensional scaling uses similarity to plot points that are
                near to each other close in the embedding.
    "spectral"  Spectral Embedding: a discrete approximation of the low-dimensional
                manifold using a graph representation.
    "tsne"      (default) t-SNE: converts the similarity of points into probabilities,
                then uses those probabilities to create an embedding.
    '''

    os.mkdir('clustering')
    os.chdir('clustering')

    # tSNE
    plt.figure()
    viz = Manifold(manifold="tsne", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="tsne.png")
    plt.close()
    # os.system('open tsne.png')
    # viz.show()

    # PCA
    plt.figure()
    visualizer = PCADecomposition(scale=True, classes=set(classes))
    visualizer.fit_transform(np.array(features), tclass_labels)
    visualizer.poof(outpath="pca.png")
    plt.close()
    # os.system('open pca.png')

    # spectral embedding
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # lle embedding
    plt.figure()
    viz = Manifold(manifold="lle", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="lle.png")
    plt.close()

    # ltsa
    # plt.figure()
    # viz = Manifold(manifold="ltsa", classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="ltsa.png")
    # plt.close()

    # hessian
    # plt.figure()
    # viz = Manifold(manifold="hessian", method='dense', classes=set(classes))
    # viz.fit_transform(np.array(features), tclass_labels)
    # viz.poof(outpath="hessian.png")
    # plt.close()

    # modified
    plt.figure()
    viz = Manifold(manifold="modified", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="modified.png")
    plt.close()

    # isomap
    plt.figure()
    viz = Manifold(manifold="isomap", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="isomap.png")
    plt.close()

    # mds
    plt.figure()
    viz = Manifold(manifold="mds", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="mds.png")
    plt.close()

    # spectral
    plt.figure()
    viz = Manifold(manifold="spectral", classes=set(classes))
    viz.fit_transform(np.array(features), tclass_labels)
    viz.poof(outpath="spectral.png")
    plt.close()

    # UMAP embedding
    plt.figure()
    umap = UMAPVisualizer(metric='cosine', classes=set(classes),
                          title="UMAP embedding")
    umap.fit_transform(np.array(features), class_labels)
    umap.poof(outpath="umap.png")
    plt.close()

    # alternative UMAP
    # import umap.plot
    # plt.figure()
    # mapper = umap.UMAP().fit(np.array(features))
    # fig = umap.plot.points(mapper, labels=np.array(tclass_labels))
    # fig = fig.get_figure()
    # fig.tight_layout()
    # fig.savefig('umap2.png')
    # plt.close(fig)

    #################################
    # FEATURE RANKING!!
    #################################
    os.chdir(curdir)
    os.mkdir('feature_ranking')
    os.chdir('feature_ranking')

    # You can get the feature importance of each feature of your dataset
    # by using the feature_importances_ property of the model.
    plt.figure(figsize=(12, 12))
    model = ExtraTreesClassifier()
    model.fit(np.array(features), tclass_labels)
    # print(model.feature_importances_)
    feat_importances = pd.Series(model.feature_importances_, index=feature_labels[0])
    feat_importances.nlargest(20).plot(kind='barh')
    plt.title('Feature importances (ExtraTrees) with %s features' % (str(len(features[0]))),
              size=16)
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    # os.system('open feature_importance.png')

    # get selected labels for top 20 features
    selectedlabels = list(dict(feat_importances.nlargest(20)))
    new_features, new_labels = restructure_features(selectedlabels, t_features,
                                                    feature_labels[0])
    new_features_, new_labels_ = restructure_features(selectedlabels, features,
                                                      feature_labels[0])

    # Shapiro rank algorithm (1D)
    plt.figure(figsize=(28, 12))
    visualizer = Rank1D(algorithm='shapiro', classes=set(classes), features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    # plt.tight_layout()
    visualizer.poof(outpath="shapiro.png")
    plt.title('Shapiro plot (top 20 features)', size=16)
    plt.close()
    # os.system('open shapiro.png')
    # visualizer.show()

    # Pearson ranking algorithm (2D)
    plt.figure(figsize=(12, 12))
    visualizer = Rank2D(algorithm='pearson', classes=set(classes), features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    plt.tight_layout()
    visualizer.poof(outpath="pearson.png")
    plt.title('Pearson ranking plot (top 20 features)', size=16)
    plt.close()
    # os.system('open pearson.png')
    # visualizer.show()

    # feature importances with top 20 features for Lasso
    plt.figure(figsize=(12, 12))
    viz = FeatureImportances(Lasso(), labels=new_labels_)
    viz.fit(np.array(new_features_), tclass_labels)
    plt.tight_layout()
    viz.poof(outpath="lasso.png")
    plt.close()

    # correlation plots with feature removal if corr > 0.90
    # https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf
    # now remove correlated features
    # --> p values
    # --> https://towardsdatascience.com/the-next-level-of-data-visualization-in-python-dd6e99039d5e /
    #     https://github.com/WillKoehrsen/Data-Analysis/blob/master/plotly/Plotly%20Whirlwind%20Introduction.ipynb
    #     - plotly for correlation heatmap and scatterplot matrix
    # --> https://seaborn.pydata.org/tutorial/distributions.html
    data = new_features
    corr = data.corr()
    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap with correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap.png')
    plt.close(fig)

    # drop one of each pair of features whose correlation is >= 0.9
    columns = np.full((corr.shape[0], ), True, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= 0.9:
                if columns[j]:
                    columns[j] = False

    selected_columns = data.columns[columns]
    data = data[selected_columns]
    corr = data.corr()
    plt.figure(figsize=(12, 12))
    fig = sns.heatmap(corr)
    fig = fig.get_figure()
    plt.title('Heatmap without correlated features (top 20 features)', size=16)
    fig.tight_layout()
    fig.savefig('heatmap_clean.png')
    plt.close(fig)

    # radviz
    # Instantiate the visualizer
    plt.figure(figsize=(12, 12))
    visualizer = RadViz(classes=classes, features=new_labels)
    visualizer.fit(np.array(new_features), tclass_labels)
    visualizer.transform(np.array(new_features))
    visualizer.poof(outpath="radviz.png")
    visualizer.show()
    plt.close()

    # feature correlation plot
    plt.figure(figsize=(28, 12))
    visualizer = feature_correlation(np.array(new_features), tclass_labels,
                                     labels=new_labels)
    visualizer.poof(outpath="correlation.png")
    visualizer.show()
    plt.tight_layout()
    plt.close()

    # per-feature violin plots (raw features)
    os.mkdir('feature_plots')
    os.chdir('feature_plots')

    newdata = new_features_
    newdata['classes'] = class_labels

    for j in range(len(new_labels_)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels_[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels_[j]))
        plt.close(fig)

    # per-feature violin plots (standardized features)
    os.mkdir('feature_plots_transformed')
    os.chdir('feature_plots_transformed')

    newdata = new_features
    newdata['classes'] = class_labels

    for j in range(len(new_labels)):
        fig = sns.violinplot(x=newdata['classes'], y=newdata[new_labels[j]])
        fig = fig.get_figure()
        fig.tight_layout()
        fig.savefig('%s_%s.png' % (str(j), new_labels[j]))
        plt.close(fig)

    ##################################################
    # PRECISION-RECALL CURVES
    ##################################################
    os.chdir(curdir)
    os.mkdir('model_selection')
    os.chdir('model_selection')

    plt.figure()
    visualizer = precision_recall_curve(GaussianNB(), np.array(features), tclass_labels)
    visualizer.poof(outpath="precision-recall.png")
    plt.close()

    plt.figure()
    visualizer = roc_auc(LogisticRegression(), np.array(features), tclass_labels)
    visualizer.poof(outpath="roc_curve_train.png")
    plt.close()

    plt.figure()
    visualizer = discrimination_threshold(
        LogisticRegression(multi_class="auto", solver="liblinear"),
        np.array(features), tclass_labels)
    visualizer.poof(outpath="thresholds.png")
    plt.close()

    plt.figure()
    visualizer = residuals_plot(Ridge(), np.array(features), tclass_labels,
                                train_color="maroon", test_color="gold")
    visualizer.poof(outpath="residuals.png")
    plt.close()

    plt.figure()
    visualizer = prediction_error(Lasso(), np.array(features), tclass_labels)
    visualizer.poof(outpath='prediction_error.png')
    plt.close()

    # outlier detection
    plt.figure()
    visualizer = cooks_distance(np.array(features), tclass_labels,
                                draw_threshold=True, linefmt="C0-", markerfmt=",")
    visualizer.poof(outpath='outliers.png')
    plt.close()

    # cluster numbers
    plt.figure()
    visualizer = silhouette_visualizer(
        KMeans(len(set(tclass_labels)), random_state=42), np.array(features))
    visualizer.poof(outpath='siloutte.png')
    plt.close()

    # cluster distance
    plt.figure()
    visualizer = intercluster_distance(
        KMeans(len(set(tclass_labels)), random_state=777), np.array(features))
    visualizer.poof(outpath='cluster_distance.png')
    plt.close()

    # plot accuracy against the percentile of features kept (ANOVA selection feeding
    # a logistic regression) to see which feature percentile is optimal
    features = preprocessing.MinMaxScaler().fit_transform(features)
    clf = Pipeline([('anova', SelectPercentile(chi2)),
                    ('scaler', StandardScaler()),
                    ('logr', LogisticRegression())])
    score_means = list()
    score_stds = list()
    percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100)

    for percentile in percentiles:
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, np.array(features), class_labels)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Performance of the LogisticRegression-Anova varying the percent features selected'
    )
    plt.xticks(np.linspace(0, 100, 11, endpoint=True))
    plt.xlabel('Percentile')
    plt.ylabel('Accuracy Score')
    plt.axis('tight')
    plt.savefig('logr_percentile_plot.png')
    plt.close()

    # get PCA
    pca = PCA(random_state=1)
    pca.fit(X_train)
    skplt.decomposition.plot_pca_component_variance(pca)
    plt.savefig('pca_explained_variance.png')
    plt.close()
    # estimators
    rf = RandomForestClassifier()
    skplt.estimators.plot_learning_curve(rf, X_train, y_train)
    plt.title('Learning Curve (Random Forest)')
    plt.savefig('learning_curve.png')
    plt.close()

    # elbow plot
    kmeans = KMeans(random_state=1)
    skplt.cluster.plot_elbow_curve(kmeans, X_train, cluster_ranges=range(1, 30),
                                   title='Elbow plot (KMeans clustering)')
    plt.savefig('elbow.png')
    plt.close()

    # KS statistic (only if 2 classes)
    lr = LogisticRegression()
    lr = lr.fit(X_train, y_train)
    y_probas = lr.predict_proba(X_test)
    skplt.metrics.plot_ks_statistic(y_test, y_probas)
    plt.savefig('ks.png')
    plt.close()

    # precision-recall
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_probas = nb.predict_proba(X_test)
    skplt.metrics.plot_precision_recall(y_test, y_probas)
    plt.tight_layout()
    plt.savefig('precision-recall.png')
    plt.close()

    ## plot calibration curve
    rf = RandomForestClassifier()
    lr = LogisticRegression()
    nb = GaussianNB()
    svm = LinearSVC()
    dt = DecisionTreeClassifier(random_state=0)
    ab = AdaBoostClassifier(n_estimators=100)
    gb = GradientBoostingClassifier(n_estimators=100,
                                    learning_rate=1.0,
                                    max_depth=1,
                                    random_state=0)
    knn = KNeighborsClassifier(n_neighbors=7)

    rf_probas = rf.fit(X_train, y_train).predict_proba(X_test)
    lr_probas = lr.fit(X_train, y_train).predict_proba(X_test)
    nb_probas = nb.fit(X_train, y_train).predict_proba(X_test)
    # svm_scores = svm.fit(X_train, y_train).predict_proba(X_test)
    dt_scores = dt.fit(X_train, y_train).predict_proba(X_test)
    ab_scores = ab.fit(X_train, y_train).predict_proba(X_test)
    gb_scores = gb.fit(X_train, y_train).predict_proba(X_test)
    knn_scores = knn.fit(X_train, y_train).predict_proba(X_test)

    probas_list = [
        rf_probas, lr_probas, nb_probas,
        # svm_scores,
        dt_scores, ab_scores, gb_scores, knn_scores
    ]
    clf_names = [
        'Random Forest', 'Logistic Regression', 'Gaussian NB',
        # 'SVM',
        'Decision Tree', 'Adaboost', 'Gradient Boost', 'KNN'
    ]
    skplt.metrics.plot_calibration_curve(y_test, probas_list, clf_names)
    plt.savefig('calibration.png')
    plt.tight_layout()
    plt.close()

    # pick classifier type by ROC (without optimization)
    probs = [
        rf_probas[:, 1], lr_probas[:, 1], nb_probas[:, 1],
        # svm_scores[:, 1],
        dt_scores[:, 1], ab_scores[:, 1], gb_scores[:, 1], knn_scores[:, 1]
    ]
    plot_roc_curve(y_test, probs, clf_names)

    # more elaborate ROC example with CV = 5 fold
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc_crossval.html#sphx-glr-auto-examples-model-selection-plot-roc-crossval-py

    os.chdir(curdir)

    return ''
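# Illustrative invocation of visualize_features (an assumption about how the function
# is meant to be driven; the real class names, feature set name and directory come
# from the surrounding project, so the argument values below are placeholders only).
if __name__ == '__main__':
    visualize_features(classes=['class_a', 'class_b'],
                       problem_type='classification',
                       curdir=os.getcwd(),
                       default_features='standard_features',
                       balance_data=True,
                       test_size=0.25)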