def show_confusion_matrix():
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make and fit pipeline
    pipeline = make_pipeline(StandardScaler(), SVC(random_state=1))
    pipeline.fit(X_train, y_train)

    # compute confusion matrix
    y_pred = pipeline.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    # visualize confusion matrix
    _, ax = plt.subplots(figsize=(2.5, 2.5))
    ax.matshow(cm, cmap=plt.cm.Blues, alpha=0.3)
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # row index i maps to the y axis, column index j to the x axis
        ax.text(x=j, y=i, s=cm[i, j], va='center', ha='center')
    plt.xlabel('predicted label')
    plt.ylabel('true label')
    plt.tight_layout()
    plt.show()
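# A hedged alternative sketch, assuming scikit-learn >= 1.0 is available:
# ConfusionMatrixDisplay renders the same matrix as show_confusion_matrix
# without the manual matshow/text loop. Module-level imports (plt,
# train_test_split, make_pipeline, StandardScaler, SVC, BreastCancerData)
# are assumed to match the rest of this file.
def show_confusion_matrix_display():
    from sklearn.metrics import ConfusionMatrixDisplay

    # prepare and split data the same way as show_confusion_matrix
    D = BreastCancerData(None, None)
    X_train, X_test, y_train, y_test = train_test_split(
        D.X, D.y, test_size=0.2, random_state=1, stratify=D.y)

    # fit the same pipeline and let scikit-learn draw the matrix
    pipeline = make_pipeline(StandardScaler(), SVC(random_state=1))
    pipeline.fit(X_train, y_train)
    ConfusionMatrixDisplay.from_estimator(
        pipeline, X_test, y_test, cmap=plt.cm.Blues)
    plt.show()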
def show_roc_curve():
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make pipeline
    pipeline = make_pipeline(
        StandardScaler(),
        PCA(n_components=2),
        LogisticRegression(solver='liblinear', C=100, random_state=1))

    # extract features for ROC curve
    X_train_extracted = X_train[:, [4, 14]]
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr_list = []
    kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
    for i, (idx_train, idx_test) in enumerate(kfold.split(X_train, y_train), start=1):
        # compute fpr (false positive rate) and tpr (true positive rate)
        # on the held-out fold, not on the data used for fitting
        probas = pipeline.fit(
            X_train_extracted[idx_train],
            y_train[idx_train]).predict_proba(X_train_extracted[idx_test])
        fpr, tpr, _ = roc_curve(y_train[idx_test], probas[:, 1], pos_label=1)

        # save interpolation of tpr at fpr in order to compute mean tpr
        mean_tpr_list.append(np.interp(mean_fpr, fpr, tpr))

        # plot ROC curve of the current fold
        plt.plot(fpr, tpr,
                 label='ROC fold {0} (AUC = {1:f})'.format(i, auc(fpr, tpr)))

    # plot mean ROC curve
    mean_tpr = np.mean(mean_tpr_list, axis=0)
    mean_tpr[0], mean_tpr[-1] = 0, 1
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='mean ROC (AUC = {0:f})'.format(auc(mean_fpr, mean_tpr)))

    # plot random guess and perfect estimator
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black')

    # set plot area and show all the plots
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.legend(loc='lower right')
    plt.show()
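# A hedged companion sketch: cross_val_predict plus RocCurveDisplay
# (scikit-learn >= 1.0) yields a single pooled ROC curve from out-of-fold
# probabilities instead of one curve per fold as in show_roc_curve. The
# feature columns [4, 14] mirror the selection above.
def show_roc_curve_pooled():
    from sklearn.metrics import RocCurveDisplay
    from sklearn.model_selection import cross_val_predict

    D = BreastCancerData(None, None)
    X_train, _, y_train, _ = train_test_split(
        D.X, D.y, test_size=0.2, random_state=1, stratify=D.y)

    pipeline = make_pipeline(
        StandardScaler(),
        PCA(n_components=2),
        LogisticRegression(solver='liblinear', C=100, random_state=1))

    # out-of-fold predicted probabilities for the positive class
    probas = cross_val_predict(pipeline, X_train[:, [4, 14]], y_train,
                               cv=3, method='predict_proba')
    RocCurveDisplay.from_predictions(y_train, probas[:, 1])
    plt.show()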
def main():
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make pipeline
    pipeline = make_pipeline(
        StandardScaler(),
        PCA(n_components=2),
        LogisticRegression(solver='liblinear', random_state=1))

    # explicitly execute stratified k-fold cross validation
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    scores = []
    for k, (idx_train, idx_test) in enumerate(kfold.split(X_train, y_train), start=1):
        pipeline.fit(X_train[idx_train], y_train[idx_train])
        score = pipeline.score(X_train[idx_test], y_train[idx_test])
        scores.append(score)
        print('fold: {0:2d} | class distribution: {1} | score: {2:f}'.format(
            k, np.bincount(y_train[idx_train]) / len(y_train[idx_train]), score))
    print('CV accuracy: {0:f} +/- {1:f}'.format(np.mean(scores), np.std(scores)))

    # use cross_val_score function for cross validation
    scores = cross_val_score(estimator=pipeline, X=X_train, y=y_train, cv=10, n_jobs=1)
    for k, score in enumerate(scores, start=1):
        print('fold: {0:2d} | score: {1:f}'.format(k, score))
    print('CV accuracy: {0:f} +/- {1:f}'.format(np.mean(scores), np.std(scores)))

    # compute the final score
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    print('final score:', score)
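# A hedged companion sketch: cross_validate, a scikit-learn function that
# sits alongside cross_val_score, evaluates several metrics in one pass
# over the same folds. The scorer names below are standard sklearn
# strings, not anything defined in this repository.
def report_multiple_metrics(pipeline, X_train, y_train):
    from sklearn.model_selection import cross_validate

    results = cross_validate(pipeline, X_train, y_train, cv=10,
                             scoring=('accuracy', 'f1', 'roc_auc'))
    for name in ('test_accuracy', 'test_f1', 'test_roc_auc'):
        print('{0}: {1:f} +/- {2:f}'.format(
            name, np.mean(results[name]), np.std(results[name])))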
def main():
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make pipeline
    pipeline = make_pipeline(
        StandardScaler(),
        SVC(random_state=1))

    # execute grid search
    param_range = [10**n for n in range(-4, 4)]
    param_grid = [
        {'svc__C': param_range, 'svc__kernel': ['linear']},
        {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}]
    grid_search = GridSearchCV(
        estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=10)
    grid_search.fit(X_train, y_train)
    print('best score:', grid_search.best_score_)
    print('best parameters:', grid_search.best_params_)

    # compute the final score
    estimator = grid_search.best_estimator_.fit(X_train, y_train)
    score = estimator.score(X_test, y_test)
    print('final score:', score)

    # execute nested cross validation (5x2 cross validation)
    grid_search = GridSearchCV(
        estimator=pipeline, param_grid=param_grid, scoring='accuracy', cv=2)
    scores = cross_val_score(grid_search, X_train, y_train, scoring='accuracy', cv=5)
    print('CV accuracy (SVM): {0:f} +/- {1:f}'.format(np.mean(scores), np.std(scores)))
    grid_search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=1),
        param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
        scoring='accuracy', cv=2)
    scores = cross_val_score(grid_search, X_train, y_train, scoring='accuracy', cv=5)
    print('CV accuracy (decision tree): {0:f} +/- {1:f}'.format(np.mean(scores), np.std(scores)))
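# A hedged alternative sketch: RandomizedSearchCV samples a fixed budget of
# candidates instead of exhausting the grid above, which scales better as
# the grid grows. The loguniform ranges and n_iter=20 are illustrative
# choices, not tuned values.
def run_randomized_search(pipeline, X_train, y_train):
    import scipy.stats
    from sklearn.model_selection import RandomizedSearchCV

    param_distributions = {
        'svc__C': scipy.stats.loguniform(1e-4, 1e+3),
        'svc__gamma': scipy.stats.loguniform(1e-4, 1e+3),
        'svc__kernel': ['rbf']}
    random_search = RandomizedSearchCV(
        estimator=pipeline, param_distributions=param_distributions,
        n_iter=20, scoring='accuracy', cv=10, random_state=1)
    random_search.fit(X_train, y_train)
    print('best score:', random_search.best_score_)
    print('best parameters:', random_search.best_params_)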
def main():
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make pipeline
    pipeline = make_pipeline(
        StandardScaler(),
        PCA(n_components=2),
        LogisticRegression(solver='liblinear', C=100, random_state=1))

    # extract features and compute scores for ROC curve
    X_train_extracted = X_train[:, [4, 14]]
    probas = pipeline.fit(X_train_extracted, y_train).predict_proba(X_train_extracted)

    roc_functions = (metric_utility.roc_curve, roc_curve)
    auc_functions = (metric_utility.auc, auc)
    for i, (roc_func, auc_func) in enumerate(zip(roc_functions, auc_functions), start=1):
        # compute fpr (false positive rate) and tpr (true positive rate)
        fpr, tpr, _ = roc_func(y_train, probas[:, 1], pos_label=1)

        # plot ROC curve
        plt.plot(fpr, tpr, label='ROC {0} (AUC = {1:f})'.format(i, auc_func(fpr, tpr)))

    # plot random guess and perfect estimator
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black')

    # set plot area and show all the plots
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.legend(loc='lower right')
    plt.show()
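# metric_utility's implementation is not shown in this file; the sketch
# below is a hypothetical minimal version of the two functions it is
# assumed to provide, kept only for reference. Thresholds are swept from
# high to low and the AUC is the trapezoidal integral of the curve.
def roc_curve_sketch(y_true, y_score, pos_label=1):
    order = np.argsort(y_score)[::-1]
    is_positive = np.asarray(y_true)[order] == pos_label

    # cumulative true/false positive counts as the threshold is lowered
    tps = np.cumsum(is_positive)
    fps = np.cumsum(~is_positive)
    tpr = np.concatenate(([0.0], tps / tps[-1]))
    fpr = np.concatenate(([0.0], fps / fps[-1]))
    thresholds = np.concatenate(([np.inf], np.asarray(y_score)[order]))
    return fpr, tpr, thresholds

def auc_sketch(fpr, tpr):
    # area under the piecewise-linear ROC curve
    return np.trapz(tpr, fpr)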
def show_evaluation_scores():
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make and fit pipeline
    pipeline = make_pipeline(StandardScaler(), SVC(random_state=1))
    pipeline.fit(X_train, y_train)

    # compute evaluation scores
    y_pred = pipeline.predict(X_test)
    print('precision score:', precision_score(y_test, y_pred))
    print('recall score:', recall_score(y_test, y_pred))
    print('f1 score:', f1_score(y_test, y_pred))

    # execute grid search with a custom evaluation score
    scorer = make_scorer(f1_score)
    param_range = [10**n for n in range(-4, 4)]
    param_grid = [
        {'svc__C': param_range, 'svc__kernel': ['linear']},
        {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}]
    grid_search = GridSearchCV(
        estimator=pipeline, param_grid=param_grid, scoring=scorer, cv=10)
    grid_search.fit(X_train, y_train)
    print('best score:', grid_search.best_score_)
    print('best parameters:', grid_search.best_params_)
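# A hedged companion sketch: classification_report is a standard
# scikit-learn helper that prints per-class precision, recall, and f1 in
# a single table, condensing the three separate print calls above.
def show_classification_report(y_test, y_pred):
    from sklearn.metrics import classification_report

    print(classification_report(y_test, y_pred))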
def main():
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make and fit pipeline
    pipeline = make_pipeline(
        StandardScaler(),
        PCA(n_components=2),
        LogisticRegression(solver='liblinear', random_state=1))
    pipeline.fit(X_train, y_train)

    # show the number of misclassified samples
    y_pred = pipeline.predict(X_test)
    print('misclassified samples: {}'.format(np.sum(y_test != y_pred)))
def main():
    # prepare sample data and target variable
    labels = None
    features = None
    D = BreastCancerData(features, labels)
    X, y = D.X, D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)

    # make pipeline
    pipeline = make_pipeline(
        StandardScaler(),
        PCA(n_components=2),
        LogisticRegression(solver='liblinear', random_state=1))

    # show learning curve
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=pipeline, X=X_train, y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10), cv=10, random_state=1)
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='training accuracy', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.25)
    plt.plot(train_sizes, test_mean, label='test accuracy', marker='o')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.25)
    plt.grid()
    plt.ylim(top=1.0)
    plt.title('learning curve')
    plt.legend(loc='upper right')
    plt.xlabel('number of training samples')
    plt.ylabel('accuracy')
    plt.show()

    # show validation curve
    params = [10**n for n in range(-3, 3)]
    train_scores, test_scores = validation_curve(
        estimator=pipeline, X=X_train, y=y_train,
        param_name='logisticregression__C', param_range=params, cv=10)
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)
    plt.plot(params, train_mean, label='training accuracy', marker='o')
    plt.fill_between(params, train_mean + train_std, train_mean - train_std, alpha=0.25)
    plt.plot(params, test_mean, label='test accuracy', marker='o')
    plt.fill_between(params, test_mean + test_std, test_mean - test_std, alpha=0.25)
    plt.grid()
    plt.xscale('log')
    plt.ylim(top=1.0)
    plt.title('validation curve')
    plt.legend(loc='upper right')
    plt.xlabel('C')
    plt.ylabel('accuracy')
    plt.show()

    # refit with the best C from the validation curve and compute the final score
    C = params[np.argmax(test_mean)]
    pipeline = make_pipeline(
        StandardScaler(),
        PCA(n_components=2),
        LogisticRegression(C=C, solver='liblinear', random_state=1))
    pipeline.fit(X_train, y_train)
    score = pipeline.score(X_test, y_test)
    print('final score:', score)
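# A small refactoring sketch (hypothetical, not part of the original file):
# the learning-curve and validation-curve plots above repeat the same
# mean/std banding logic, which a helper like this would factor out.
def plot_score_band(x, scores, label):
    mean, std = np.mean(scores, axis=1), np.std(scores, axis=1)
    plt.plot(x, mean, label=label, marker='o')
    plt.fill_between(x, mean + std, mean - std, alpha=0.25)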