예제 #1
0
def show_confusion_matrix():
    """Fit a scaled SVC and display the confusion matrix of its test predictions.

    The samples are obtained from ``BreastCancerData``, split 80/20 with
    stratification, and a ``StandardScaler`` + ``SVC`` pipeline is fitted on
    the training part. The confusion matrix of the test predictions is drawn
    with one text annotation per cell, and the figure is shown (blocking
    call to ``plt.show()``).
    """
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1,
                                                        stratify=y)

    # make and fit pipeline
    pipeline = make_pipeline(StandardScaler(), SVC(random_state=1))
    pipeline.fit(X_train, y_train)

    # compute confusion matrix (rows: true labels, columns: predicted labels)
    y_pred = pipeline.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    # visualize confusion matrix
    _, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.3)
    n_rows, n_cols = cm.shape
    for row, col in itertools.product(range(n_rows), range(n_cols)):
        # matshow draws matrix rows along the y axis and columns along the
        # x axis, so cell (row, col) must be annotated at x=col, y=row;
        # swapping them would transpose the off-diagonal counts.
        ax.text(x=col, y=row, s=cm[row, col], va='center', ha='center')
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.tight_layout()
    plt.show()
예제 #2
0
def show_roc_curve():
    """Plot per-fold ROC curves and their mean for a logistic-regression pipeline.

    The samples are obtained from ``BreastCancerData`` and split 80/20 with
    stratification; only the training part is used. Columns 4 and 14 of the
    training samples are fed to a ``StandardScaler`` + ``PCA`` +
    ``LogisticRegression`` pipeline, which is refitted on each of three
    stratified folds. For every fold the ROC curve is plotted together with
    its AUC, followed by the mean ROC curve, the random-guess diagonal and
    the perfect-estimator curve. The figure is shown with ``plt.show()``.
    """
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    # (the test part is not needed here)
    X_train, _, y_train, _ = train_test_split(X,
                                              y,
                                              test_size=0.2,
                                              random_state=1,
                                              stratify=y)

    # make pipeline
    pipeline = make_pipeline(
        StandardScaler(), PCA(n_components=2),
        LogisticRegression(solver='liblinear', C=100, random_state=1))

    # extract features for ROC curve
    X_train_extracted = X_train[:, [4, 14]]

    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr_list = []
    # random_state only has an effect together with shuffle=True; recent
    # scikit-learn versions reject it otherwise, and older ones ignored it,
    # so omitting it keeps the same (unshuffled) folds everywhere.
    kfold = StratifiedKFold(n_splits=3)
    for i, (idx_train, _) in enumerate(kfold.split(X_train, y_train), start=1):
        X_fold = X_train_extracted[idx_train]
        y_fold = y_train[idx_train]
        # compute fpr (false positive rate) and tpr (true positive rate)
        # NOTE(review): the scores are computed on the same samples the
        # pipeline was fitted on; the held-out indices of each fold are
        # ignored. Confirm that a training-set ROC is what is wanted here.
        probas = pipeline.fit(X_fold, y_fold).predict_proba(X_fold)
        fpr, tpr, _ = roc_curve(y_fold, probas[:, 1], pos_label=1)
        # save interpolation of tpr at fpr in order to compute mean tpr
        # (np.interp replaces the removed scipy.interp alias)
        mean_tpr_list.append(np.interp(mean_fpr, fpr, tpr))
        # plot ROC curve of the current training datasets
        plt.plot(fpr,
                 tpr,
                 label='ROC fold {0} (AUC = {1:f})'.format(i, auc(fpr, tpr)))
    # plot mean ROC curve
    mean_tpr = np.mean(mean_tpr_list, axis=0)
    # pin the end points so the mean curve runs from (0, 0) to (1, 1)
    mean_tpr[0], mean_tpr[-1] = 0, 1
    plt.plot(mean_fpr,
             mean_tpr,
             'k--',
             label='mean ROC (AUC = {0:f})'.format(auc(mean_fpr, mean_tpr)))
    # plot random guess and perfect estimator
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black')
    # set plot area and show all the plots
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.legend(loc='lower right')
    plt.show()
예제 #3
0
def main():
    """Cross-validate a logistic-regression pipeline and report its test score.

    The samples are obtained from ``BreastCancerData`` and split 80/20 with
    stratification. A ``StandardScaler`` + ``PCA`` + ``LogisticRegression``
    pipeline is evaluated on the training part twice: once with an explicit
    10-split ``StratifiedKFold`` loop (printing the class distribution and
    score of every fold) and once via ``cross_val_score`` with ``cv=10``.
    Finally the pipeline is fitted on the whole training part and its score
    on the test part is printed.
    """
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1,
                                                        stratify=y)

    # make pipeline
    pipeline = make_pipeline(
        StandardScaler(), PCA(n_components=2),
        LogisticRegression(solver='liblinear', random_state=1))

    # explicitly execute stratified k-fold cross validation
    # random_state only has an effect together with shuffle=True; recent
    # scikit-learn versions reject it otherwise, and older ones ignored it,
    # so omitting it keeps the same (unshuffled) folds everywhere.
    kfold = StratifiedKFold(n_splits=10)
    scores = []
    for k, (idx_train, idx_test) in enumerate(kfold.split(X_train, y_train),
                                              start=1):
        y_fold = y_train[idx_train]
        pipeline.fit(X_train[idx_train], y_fold)
        score = pipeline.score(X_train[idx_test], y_train[idx_test])
        scores.append(score)
        # relative frequency of each class label in the fold's training part
        print('fold: {0:2d} | class distribution: {1} | score: {2:f}'.format(
            k,
            np.bincount(y_fold) / len(y_fold), score))
    print('CV accuracy: {0:f} +/- {1:f}'.format(np.mean(scores),
                                                np.std(scores)))

    # use cross_val_score function for cross validation
    scores = cross_val_score(estimator=pipeline,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             n_jobs=1)
    for k, score in enumerate(scores, start=1):
        print('fold: {0:2d} | score: {1:f}'.format(k, score))
    print('CV accuracy: {0:f} +/- {1:f}'.format(np.mean(scores),
                                                np.std(scores)))

    # compute the final score
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    print('final score:', score)
예제 #4
0
def main():
    """Tune an SVC pipeline by grid search and compare models by nested CV.

    The samples are obtained from ``BreastCancerData`` and split 80/20 with
    stratification. A ``StandardScaler`` + ``SVC`` pipeline is tuned with a
    10-fold grid search over linear and RBF kernels; the best score, the
    best parameters and the score on the test part are printed. Afterwards
    a nested cross validation (inner ``cv=2``, outer ``cv=5``) is run for
    the SVC pipeline and for a decision tree, printing mean and standard
    deviation of the outer scores.
    """
    # prepare sample data and target variable
    labels = None
    features = None
    dataset = BreastCancerData(features, labels)
    X, y = dataset.X, dataset.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make pipeline
    svc_pipeline = make_pipeline(StandardScaler(), SVC(random_state=1))

    # candidate values 10**-4 ... 10**3 for both C and gamma
    magnitudes = [10**exponent for exponent in range(-4, 4)]
    linear_grid = {'svc__C': magnitudes, 'svc__kernel': ['linear']}
    rbf_grid = {
        'svc__C': magnitudes,
        'svc__gamma': magnitudes,
        'svc__kernel': ['rbf'],
    }
    svc_param_grid = [linear_grid, rbf_grid]

    # execute grid search
    svc_search = GridSearchCV(estimator=svc_pipeline,
                              param_grid=svc_param_grid,
                              scoring='accuracy',
                              cv=10)
    svc_search.fit(X_train, y_train)
    print('best score:', svc_search.best_score_)
    print('best parameters:', svc_search.best_params_)

    # compute the final score
    best_model = svc_search.best_estimator_
    best_model.fit(X_train, y_train)
    print('final score:', best_model.score(X_test, y_test))

    # execute nested cross validation (5x2 cross validation)
    nested_svc_search = GridSearchCV(estimator=svc_pipeline,
                                     param_grid=svc_param_grid,
                                     scoring='accuracy',
                                     cv=2)
    svc_scores = cross_val_score(nested_svc_search,
                                 X_train,
                                 y_train,
                                 scoring='accuracy',
                                 cv=5)
    print('CV accuracy (SVM): {0:f} +/- {1:f}'.format(np.mean(svc_scores), np.std(svc_scores)))

    nested_tree_search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=1),
        param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
        scoring='accuracy',
        cv=2)
    tree_scores = cross_val_score(nested_tree_search,
                                  X_train,
                                  y_train,
                                  scoring='accuracy',
                                  cv=5)
    print('CV accuracy (decision tree): {0:f} +/- {1:f}'.format(np.mean(tree_scores), np.std(tree_scores)))
예제 #5
0
def main():
    """Plot ROC curves computed by two ROC/AUC implementations side by side.

    The samples are obtained from ``BreastCancerData`` and split 80/20 with
    stratification; only the training part is used. A ``StandardScaler`` +
    ``PCA`` + ``LogisticRegression`` pipeline is fitted on columns 4 and 14
    of the training samples, and its predicted probabilities on those same
    samples are turned into a ROC curve twice: first with the functions from
    ``metric_utility``, then with the plain ``roc_curve`` / ``auc`` names.
    Both curves are drawn with their AUC in the legend, next to the
    random-guess diagonal and the perfect-estimator curve.
    """
    # prepare sample data and target variable
    labels = None
    features = None
    dataset = BreastCancerData(features, labels)
    X, y = dataset.X, dataset.y

    # split sample data into training data and test data
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make pipeline
    classifier = LogisticRegression(solver='liblinear', C=100, random_state=1)
    pipeline = make_pipeline(StandardScaler(), PCA(n_components=2), classifier)

    # extract features and compute scores for ROC curve
    X_selected = X_train[:, [4, 14]]
    pipeline.fit(X_selected, y_train)
    positive_scores = pipeline.predict_proba(X_selected)[:, 1]

    # (roc function, auc function) pairs, in plotting order
    implementations = (
        (metric_utility.roc_curve, metric_utility.auc),
        (roc_curve, auc),
    )

    number = 0
    for compute_roc, compute_auc in implementations:
        number += 1
        # compute fpr (false positive rate) and tpr (true positive rate)
        fpr, tpr, _ = compute_roc(y_train, positive_scores, pos_label=1)
        # plot ROC curve
        legend_text = 'ROC {0} (AUC = {1:f})'.format(number,
                                                     compute_auc(fpr, tpr))
        plt.plot(fpr, tpr, label=legend_text)

    # plot random guess and perfect estimator
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black')
    # set plot area and show all the plots
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.legend(loc='lower right')
    plt.show()
예제 #6
0
def show_evaluation_scores():
    """Print precision/recall/F1 of a scaled SVC and tune it by F1 score.

    The samples are obtained from ``BreastCancerData`` and split 80/20 with
    stratification. A ``StandardScaler`` + ``SVC`` pipeline is fitted on the
    training part and its precision, recall and F1 score on the test part
    are printed. The same pipeline is then tuned with a 10-fold grid search
    over linear and RBF kernels, scored by a scorer built from ``f1_score``;
    the best score and best parameters are printed.
    """
    # prepare sample data and target variable
    labels = None
    features = None
    dataset = BreastCancerData(features, labels)
    X, y = dataset.X, dataset.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make and fit pipeline
    pipeline = make_pipeline(StandardScaler(), SVC(random_state=1))
    pipeline.fit(X_train, y_train)

    # compute evaluation scores
    predictions = pipeline.predict(X_test)
    named_metrics = (
        ('precision score:', precision_score),
        ('recall score:', recall_score),
        ('f1 score:', f1_score),
    )
    for caption, metric in named_metrics:
        print(caption, metric(y_test, predictions))

    # execute grid search with a custom evaluation score
    f1_scorer = make_scorer(f1_score)
    # candidate values 10**-4 ... 10**3 for both C and gamma
    magnitudes = [10**exponent for exponent in range(-4, 4)]
    linear_grid = {'svc__C': magnitudes, 'svc__kernel': ['linear']}
    rbf_grid = {
        'svc__C': magnitudes,
        'svc__gamma': magnitudes,
        'svc__kernel': ['rbf'],
    }
    search = GridSearchCV(estimator=pipeline,
                          param_grid=[linear_grid, rbf_grid],
                          scoring=f1_scorer,
                          cv=10)
    search.fit(X_train, y_train)
    print('best score:', search.best_score_)
    print('best parameters:', search.best_params_)
def main():
    """Fit a logistic-regression pipeline and print its test misclassifications.

    The samples are obtained from ``BreastCancerData`` and split 80/20 with
    stratification. A ``StandardScaler`` + ``PCA`` + ``LogisticRegression``
    pipeline is fitted on the training part, and the number of test samples
    whose prediction differs from the true label is printed.
    """
    # prepare sample data and target variable
    labels = None
    features = None
    dataset = BreastCancerData(features, labels)
    X, y = dataset.X, dataset.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make and fit pipeline
    classifier = LogisticRegression(solver='liblinear', random_state=1)
    pipeline = make_pipeline(StandardScaler(), PCA(n_components=2), classifier)
    pipeline.fit(X_train, y_train)

    # show accuracy: count the test samples that were predicted wrongly
    predictions = pipeline.predict(X_test)
    mismatches = y_test != predictions
    print('misclassified samples: {}'.format(np.sum(mismatches)))
def main():
    """Show learning and validation curves, then score the best ``C`` on test data.

    The samples are obtained from ``BreastCancerData`` and split 80/20 with
    stratification. For a ``StandardScaler`` + ``PCA`` +
    ``LogisticRegression`` pipeline, a learning curve (ten training sizes
    from 10% to 100%, ``cv=10``) and a validation curve over
    ``logisticregression__C`` in 10**-3 ... 10**2 (``cv=10``) are plotted,
    each as mean accuracy with a band of one standard deviation. The ``C``
    with the highest mean validation score is then used to build a new
    pipeline, which is fitted on the training part and scored on the test
    part; that score is printed.
    """

    def draw_band(xs, fold_scores, label):
        # Plot the mean over the folds (axis 1) with a +/- one standard
        # deviation band around it, and hand the mean back to the caller.
        mean = np.mean(fold_scores, axis=1)
        std = np.std(fold_scores, axis=1)
        plt.plot(xs, mean, label=label, marker='o')
        plt.fill_between(xs, mean + std, mean - std, alpha=0.25)
        return mean

    def finish_figure(title, xlabel, log_x=False):
        # Apply the decorations shared by both figures and display it.
        plt.grid()
        if log_x:
            plt.xscale('log')
        plt.ylim(top=1.0)
        plt.title(title)
        plt.legend(loc='upper right')
        plt.xlabel(xlabel)
        plt.ylabel('accuracy')
        plt.show()

    # prepare sample data and target variable
    labels = None
    features = None
    dataset = BreastCancerData(features, labels)
    X, y = dataset.X, dataset.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make pipeline
    pipeline = make_pipeline(
        StandardScaler(), PCA(n_components=2),
        LogisticRegression(solver='liblinear', random_state=1))

    # show learning curve
    sizes, fit_scores, cv_scores = learning_curve(
        estimator=pipeline,
        X=X_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=10,
        random_state=1)
    draw_band(sizes, fit_scores, 'training accuracy')
    draw_band(sizes, cv_scores, 'test accuracy')
    finish_figure('learning curve', 'number of training samples')

    # show validation curve
    c_values = [10**exponent for exponent in range(-3, 3)]
    fit_scores, cv_scores = validation_curve(
        estimator=pipeline,
        X=X_train,
        y=y_train,
        param_name='logisticregression__C',
        param_range=c_values,
        cv=10)
    draw_band(c_values, fit_scores, 'training accuracy')
    cv_mean = draw_band(c_values, cv_scores, 'test accuracy')
    finish_figure('validation curve', 'C', log_x=True)

    # compute the final score with the C that maximized the mean CV score
    best_c = c_values[np.argmax(cv_mean)]
    final_pipeline = make_pipeline(
        StandardScaler(), PCA(n_components=2),
        LogisticRegression(C=best_c, solver='liblinear', random_state=1))
    final_pipeline.fit(X_train, y_train)
    print('final score:', final_pipeline.score(X_test, y_test))