def main():
    # prepare sample data and target variable
    labels = ['setosa', 'versicolor']
    features = ['petal length (cm)', 'petal width (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # fit classifier
    classifier = LogisticRegressionGD(eta=0.05, n_iter=1000,
                                      random_state=1).fit(X_train_std, y_train)

    # show accuracy
    y_pred = classifier.predict(X_test_std)
    print('misclassified samples: {}'.format(np.sum(y_test != y_pred)))

    # show history of costs
    plot_update_history(classifier)

    # show decision regions
    plot_decision_regions(X_train_std, y_train, classifier=classifier)
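# NOTE: LogisticRegressionGD is a custom class defined elsewhere in this repo.
# Below is a minimal sketch of what it plausibly looks like, assuming a
# full-batch gradient-descent logistic regression with a cost_ history that
# plot_update_history can read -- an illustration, not the repo's actual code.
import numpy as np


class LogisticRegressionGD:
    def __init__(self, eta=0.05, n_iter=100, random_state=1):
        self.eta = eta              # learning rate
        self.n_iter = n_iter        # number of gradient-descent passes
        self.random_state = random_state

    def fit(self, X, y):
        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=1 + X.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            output = self._activation(self._net_input(X))
            errors = y - output
            # gradient step on weights and bias for the log-likelihood cost
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            cost = (-y.dot(np.log(output))
                    - (1 - y).dot(np.log(1 - output)))
            self.cost_.append(cost)
        return self

    def _net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def _activation(self, z):
        # logistic sigmoid, clipped to avoid overflow in exp
        return 1.0 / (1.0 + np.exp(-np.clip(z, -250, 250)))

    def predict(self, X):
        return np.where(self._net_input(X) >= 0.0, 1, 0)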
def main():
    # prepare sample data and target variable
    labels = ['setosa', 'versicolor', 'virginica']
    features = ['petal length (cm)', 'petal width (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # combine training data and test data
    X_combined_std = np.vstack((X_train_std, X_test_std))
    y_combined = np.hstack((y_train, y_test))

    # fit classifier
    classifier = LogisticRegression(C=100.0, random_state=1,
                                    solver='liblinear',
                                    multi_class='ovr').fit(X_train_std, y_train)

    # show accuracy
    y_pred = classifier.predict(X_test_std)
    print('misclassified samples: {}'.format(np.sum(y_test != y_pred)))

    # show decision regions
    plot_decision_regions(X_combined_std, y_combined, classifier=classifier,
                          test_idx=list(range(len(y_train), len(y))))

    # show effect of regularization parameter
    weights = []
    params = []
    for i in np.arange(-5, 5):
        C = 10.0**i
        classifier = LogisticRegression(C=C, random_state=1,
                                        solver='liblinear',
                                        multi_class='ovr').fit(X_train_std, y_train)
        weights.append(classifier.coef_[1])
        params.append(C)
    weights = np.array(weights)
    plt.axhline(y=0, linewidth=1, linestyle='--', color='k')
    plt.plot(params, weights[:, 0], label='petal length')
    plt.plot(params, weights[:, 1], linestyle='--', label='petal width')
    plt.xscale('log')
    plt.xlabel('C')
    plt.ylabel('weight coefficient')
    plt.legend(loc='upper left')
    plt.show()
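# NOTE: IrisData is the small data-loading helper used throughout these
# examples. A plausible sketch, assuming it wraps sklearn's load_iris and
# filters rows and columns by the requested labels and features -- the real
# class may differ in detail.
import numpy as np
from sklearn.datasets import load_iris


class IrisData:
    def __init__(self, features, labels):
        iris = load_iris()
        # column indices of the requested feature names
        feature_idx = [iris.feature_names.index(f) for f in features]
        # integer codes of the requested class labels
        label_codes = [list(iris.target_names).index(l) for l in labels]
        mask = np.isin(iris.target, label_codes)
        self.X = iris.data[mask][:, feature_idx]
        self.y = iris.target[mask]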
def main():
    # prepare sample data and target variable
    labels = ['setosa', 'versicolor', 'virginica']
    features = ['petal length (cm)', 'petal width (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)

    # combine training data and test data
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    # fit classifiers
    classifiers = [
        RandomForestClassifier(criterion='gini', n_estimators=10,
                               random_state=1, n_jobs=2).fit(X_train, y_train),
        RandomForestClassifier(criterion='gini', n_estimators=25,
                               random_state=1, n_jobs=2).fit(X_train, y_train),
        RandomForestClassifier(criterion='gini', n_estimators=50,
                               random_state=1, n_jobs=2).fit(X_train, y_train),
        RandomForestClassifier(criterion='gini', n_estimators=100,
                               random_state=1, n_jobs=2).fit(X_train, y_train),
        RandomForestClassifier(criterion='entropy', n_estimators=100,
                               random_state=1, n_jobs=2).fit(X_train, y_train)
    ]

    for classifier in classifiers:
        # show accuracy
        y_pred = classifier.predict(X_test)
        print('misclassified samples: {}'.format(np.sum(y_test != y_pred)))

        # show decision regions (test_idx must span the tail of the combined
        # array, i.e. up to len(y), not len(y_test))
        plot_decision_regions(X_combined, y_combined, classifier=classifier,
                              test_idx=list(range(len(y_train), len(y))))
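# NOTE: plot_decision_regions is the shared plotting helper these scripts
# call. A condensed sketch of the usual mesh-grid implementation, with a
# signature reverse-engineered from the calls in this file (test_idx,
# xlabel/ylabel, title) -- the actual helper may do more.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02,
                          xlabel='', ylabel='', title=''):
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # evaluate the classifier on a grid covering the feature plane
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    plt.contourf(xx1, xx2, Z.reshape(xx1.shape), alpha=0.3, cmap=cmap)

    # scatter the samples per class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(X[y == cl, 0], X[y == cl, 1], alpha=0.8, c=colors[idx],
                    marker=markers[idx], label=cl, edgecolor='black')

    # circle the test samples, if given
    if test_idx:
        plt.scatter(X[test_idx, 0], X[test_idx, 1], facecolors='none',
                    edgecolor='black', s=100, label='test set')

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(loc='upper left')
    plt.show()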
def main():
    # prepare sample data and target variable
    features = ['alcohol', 'od280/od315_of_diluted_wines']
    labels = ['class_1', 'class_2']
    wine_data = WineData(features, labels)
    X = wine_data.X
    y = wine_data.y

    # split sample data into training data and test data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y)
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))

    # fit classifiers
    decision_tree = DecisionTreeClassifier(criterion='entropy',
                                           max_depth=None, random_state=1)
    bagging = BaggingClassifier(base_estimator=decision_tree,
                                n_estimators=500, max_samples=1.0,
                                max_features=1.0, bootstrap=True,
                                bootstrap_features=False, n_jobs=1,
                                random_state=1)
    classifiers = [decision_tree, bagging]
    for classifier in classifiers:
        classifier.fit(X_train, y_train)

    names = ['decision tree', 'bagging']
    for classifier, name in zip(classifiers, names):
        # show score
        print('[{name}]'.format(name=name))
        print('training score:', classifier.score(X_train, y_train))
        print('test score:', classifier.score(X_test, y_test))

        # show decision regions
        plot_decision_regions(X_combined, y_combined, classifier=classifier,
                              test_idx=list(range(len(y_train), len(y))),
                              title=name)
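# NOTE: WineData mirrors IrisData for the wine dataset. A hypothetical
# sketch assuming it wraps sklearn's load_wine (whose feature_names and
# target_names match the strings used above); when called without arguments,
# as in the PCA/LDA examples below, it keeps every feature and class.
import numpy as np
from sklearn.datasets import load_wine


class WineData:
    def __init__(self, features=None, labels=None):
        wine = load_wine()
        # default to all columns / all classes when no selection is given
        feature_idx = ([wine.feature_names.index(f) for f in features]
                       if features else list(range(wine.data.shape[1])))
        label_codes = ([list(wine.target_names).index(l) for l in labels]
                       if labels else list(np.unique(wine.target)))
        mask = np.isin(wine.target, label_codes)
        self.X = wine.data[mask][:, feature_idx]
        self.y = wine.target[mask]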
def main():
    # prepare sample data and target variable
    labels = ['setosa', 'versicolor', 'virginica']
    features = ['petal length (cm)', 'petal width (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1, stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # combine training data and test data
    X_combined_std = np.vstack((X_train_std, X_test_std))
    y_combined = np.hstack((y_train, y_test))

    # fit classifiers
    classifiers = [
        KNeighborsClassifier(n_neighbors=4, p=2,
                             metric='minkowski').fit(X_train_std, y_train),
        KNeighborsClassifier(n_neighbors=5, p=2,
                             metric='minkowski').fit(X_train_std, y_train),
        KNeighborsClassifier(n_neighbors=6, p=2,
                             metric='minkowski').fit(X_train_std, y_train),
        KNeighborsClassifier(n_neighbors=5, p=1,
                             metric='minkowski').fit(X_train_std, y_train)
    ]

    for classifier in classifiers:
        # show accuracy
        y_pred = classifier.predict(X_test_std)
        print('misclassified samples: {}'.format(np.sum(y_test != y_pred)))

        # show decision regions
        plot_decision_regions(X_combined_std, y_combined,
                              classifier=classifier,
                              test_idx=list(range(len(y_train), len(y))))
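# NOTE: the n_neighbors and p values above are picked by hand. A quick way
# to choose k is a cross-validated sweep; best_k is a hypothetical helper
# added here for illustration, not part of the original script.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier


def best_k(X_train_std, y_train, candidates=range(1, 16)):
    # mean 10-fold CV accuracy per candidate k; argmax ties go to smaller k
    scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k, p=2,
                                                   metric='minkowski'),
                              X_train_std, y_train, cv=10).mean()
              for k in candidates]
    return list(candidates)[int(np.argmax(scores))]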
def main():
    # prepare sample data and target variable
    wine_data = WineData()
    X = wine_data.X
    y = wine_data.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    lda_transformers = [
        lda.LDA(n_components=2),
        LinearDiscriminantAnalysis(n_components=2)
    ]
    for lda_transformer in lda_transformers:
        # execute LDA
        X_train_lda = lda_transformer.fit_transform(X_train_std, y_train)

        # show coefficients and explained variance ratio
        print('coefficients:\n', lda_transformer.coef_)
        print('explained variance ratio:',
              lda_transformer.explained_variance_ratio_)
        plot_features(X_train_lda, y_train, xlabel='LD1', ylabel='LD2')

        # fit classifier and plot decision regions
        classifier = LogisticRegression(C=100.0, random_state=1,
                                        solver='liblinear',
                                        multi_class='ovr').fit(X_train_lda, y_train)
        X_test_lda = lda_transformer.transform(X_test_std)
        print('score: ', classifier.score(X_test_lda, y_test))
        plot_decision_regions(X_test_lda, y_test, classifier=classifier,
                              xlabel='LD1', ylabel='LD2')
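# NOTE: lda.LDA is this repo's hand-rolled counterpart to sklearn's
# LinearDiscriminantAnalysis. A compressed sketch of the classic
# scatter-matrix derivation, exposing the two attributes the loop above
# reads (coef_, explained_variance_ratio_) -- an assumption about the real
# module, shown for illustration.
import numpy as np


class LDA:
    def __init__(self, n_components=2):
        self.n_components = n_components

    def fit(self, X, y):
        n_features = X.shape[1]
        mean_overall = X.mean(axis=0)
        S_W = np.zeros((n_features, n_features))  # within-class scatter
        S_B = np.zeros((n_features, n_features))  # between-class scatter
        for cl in np.unique(y):
            X_c = X[y == cl]
            mean_c = X_c.mean(axis=0)
            S_W += (X_c - mean_c).T.dot(X_c - mean_c)
            d = (mean_c - mean_overall).reshape(-1, 1)
            S_B += len(X_c) * d.dot(d.T)
        # eigen-decomposition of S_W^-1 S_B, sorted by descending eigenvalue
        eigvals, eigvecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
        order = np.argsort(eigvals.real)[::-1]
        self.coef_ = eigvecs.real[:, order[:self.n_components]].T
        self.explained_variance_ratio_ = (
            eigvals.real[order] / eigvals.real.sum())[:self.n_components]
        return self

    def transform(self, X):
        return X.dot(self.coef_.T)

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)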
def main():
    # prepare training data and target variable
    features = ['sepal length (cm)', 'petal length (cm)']
    labels = ['setosa', 'versicolor']
    D = IrisData(features, labels)
    X = D.X
    y = np.where(D.y == 0, -1, 1)

    # fit perceptron
    classifier = Perceptron(eta=0.1, n_iter=10)
    classifier.fit(X, y)

    # show history of errors
    plot_update_history(classifier)

    # show decision regions
    plot_decision_regions(X, y, classifier=classifier,
                          xlabel='sepal length [cm]',
                          ylabel='petal length [cm]')
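# NOTE: Perceptron here is the custom Rosenblatt implementation, not
# sklearn's. A minimal sketch assuming the usual interface, with the
# errors_ history (misclassifications per epoch) that plot_update_history
# presumably plots.
import numpy as np


class Perceptron:
    def __init__(self, eta=0.1, n_iter=10, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=1 + X.shape[1])
        self.errors_ = []
        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                # update is nonzero only when the sample is misclassified
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self

    def predict(self, X):
        return np.where(np.dot(X, self.w_[1:]) + self.w_[0] >= 0.0, 1, -1)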
def main():
    # prepare sample data and target variable
    wine_data = WineData()
    X = wine_data.X
    y = wine_data.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    pca_transformers = [pca.PCA(n_components=2), PCA(n_components=2)]
    for pca_transformer in pca_transformers:
        # execute PCA
        X_train_pca = pca_transformer.fit_transform(X_train_std)

        # show principal components and explained variance
        print('principal components:\n', pca_transformer.components_)
        print('explained variance:', pca_transformer.explained_variance_)
        plot_features(X_train_pca, y_train, xlabel='PC1', ylabel='PC2')

        # fit classifier and plot decision regions
        classifier = LogisticRegression(C=100.0, random_state=1,
                                        solver='liblinear',
                                        multi_class='ovr').fit(X_train_pca, y_train)
        X_test_pca = pca_transformer.transform(X_test_std)
        print('score: ', classifier.score(X_test_pca, y_test))
        plot_decision_regions(X_test_pca, y_test, classifier=classifier,
                              xlabel='PC1', ylabel='PC2')
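# NOTE: pca.PCA is the hand-written counterpart to sklearn's PCA used in
# the loop above. A plausible sketch via eigen-decomposition of the
# covariance matrix, exposing the two attributes the script prints
# (components_, explained_variance_) -- an illustrative assumption about
# the real module.
import numpy as np


class PCA:
    def __init__(self, n_components=2):
        self.n_components = n_components

    def fit(self, X):
        self.mean_ = X.mean(axis=0)
        cov = np.cov((X - self.mean_).T)
        eigvals, eigvecs = np.linalg.eigh(cov)   # ascending eigenvalues
        order = np.argsort(eigvals)[::-1][:self.n_components]
        self.components_ = eigvecs[:, order].T   # rows are components
        self.explained_variance_ = eigvals[order]
        return self

    def transform(self, X):
        # project centered data onto the retained components
        return (X - self.mean_).dot(self.components_.T)

    def fit_transform(self, X):
        return self.fit(X).transform(X)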
def main():
    # prepare training data and target variable
    features = ['sepal length (cm)', 'petal length (cm)']
    labels = ['setosa', 'versicolor']
    D = IrisData(features, labels)
    X = D.X
    y = np.where(D.y == 0, -1, 1)

    # standardize training data column by column (iterate over features,
    # not labels -- the two counts only coincide here)
    X_std = np.copy(X)
    for i in range(X.shape[1]):
        X_std[:, i] = (X[:, i] - X[:, i].mean()) / X[:, i].std()

    # fit classifiers
    classifiers = [
        AdalineGD(eta=0.01, n_iter=10).fit(X, y),
        AdalineGD(eta=0.0001, n_iter=10).fit(X, y),
        AdalineGD(eta=0.01, n_iter=15).fit(X_std, y),
        AdalineSGD(eta=0.01, n_iter=15).fit(X_std, y)
    ]

    # show history of costs
    for classifier in classifiers:
        plot_update_history(classifier)

    # show decision regions
    plot_decision_regions(X_std, y, classifier=classifiers[2],
                          xlabel='sepal length [standardized]',
                          ylabel='petal length [standardized]')
    plot_decision_regions(X_std, y, classifier=classifiers[3],
                          xlabel='sepal length [standardized]',
                          ylabel='petal length [standardized]')
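# NOTE: AdalineGD is the custom ADALINE (adaptive linear neuron) trained by
# full-batch gradient descent; AdalineSGD differs only in updating per
# sample in shuffled order. A minimal AdalineGD sketch with the cost_
# history plot_update_history expects -- an assumption, not the repo's
# verbatim code.
import numpy as np


class AdalineGD:
    def __init__(self, eta=0.01, n_iter=10, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=1 + X.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            output = np.dot(X, self.w_[1:]) + self.w_[0]  # linear activation
            errors = y - output
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            self.cost_.append((errors**2).sum() / 2.0)    # SSE cost
        return self

    def predict(self, X):
        return np.where(np.dot(X, self.w_[1:]) + self.w_[0] >= 0.0, 1, -1)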
def main():
    # prepare sample data and target variable
    labels = ['versicolor', 'virginica']
    features = ['sepal width (cm)', 'petal length (cm)']
    D = IrisData(features, labels)
    X = D.X
    y = D.y

    # split sample data into training data and test data and standardize them
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=1, stratify=y)
    sc = StandardScaler().fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # combine training data and test data
    X_combined_std = np.vstack((X_train_std, X_test_std))
    y_combined = np.hstack((y_train, y_test))

    # prepare classifiers
    logistic_regression = LogisticRegression(penalty='l2', solver='liblinear',
                                             C=0.001, random_state=1)
    decision_tree = DecisionTreeClassifier(criterion='entropy', max_depth=1,
                                           random_state=0)
    knn = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')
    majority_vote = MajorityVoteClassifier(
        classifiers=[logistic_regression, decision_tree, knn])
    classifiers = [logistic_regression, decision_tree, knn, majority_vote]
    classifier_names = [
        'logistic regression', 'decision tree', 'KNN', 'majority vote'
    ]

    # compute cross validation score of classifiers
    for classifier, name in zip(classifiers, classifier_names):
        scores = cross_val_score(estimator=classifier, X=X_train_std,
                                 y=y_train, cv=10, scoring='accuracy')
        print('accuracy : {mean:f} +/- {std:f} ({name})'.format(
            mean=np.mean(scores), std=np.std(scores), name=name))

    # execute grid search
    param_grid = {
        'decisiontreeclassifier__max_depth': [1, 2],
        'logisticregression__C': [0.001, 0.1, 100.0]
    }
    grid = GridSearchCV(estimator=majority_vote, param_grid=param_grid,
                        cv=10, scoring='accuracy').fit(X_train_std, y_train)
    for mean, std, params in zip(grid.cv_results_['mean_test_score'],
                                 grid.cv_results_['std_test_score'],
                                 grid.cv_results_['params']):
        print('accuracy: {mean:f} +/- {std:f} with {params}'.format(
            mean=mean, std=std, params=params))

    # plot ROC curves
    pos_label_index = 1
    for classifier, name in zip(classifiers, classifier_names):
        y_pred = classifier.fit(
            X_train_std, y_train).predict_proba(X_test_std)[:, pos_label_index]
        fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=pos_label_index + 1)
        plt.plot(fpr, tpr,
                 label='{name} (AUC = {auc:f})'.format(name=name,
                                                       auc=auc(fpr, tpr)))
    plt.grid(alpha=0.5)
    plt.xlabel('false positive rate')
    plt.ylabel('true positive rate')
    plt.legend(loc='lower right')
    plt.show()

    # plot decision regions
    for classifier in classifiers:
        classifier.fit(X_train_std, y_train)
        plot_decision_regions(X_combined_std, y_combined,
                              classifier=classifier,
                              test_idx=list(range(len(y_train), len(y))))
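# NOTE: MajorityVoteClassifier is the custom soft-voting ensemble whose
# grid-search keys above ('decisiontreeclassifier__max_depth', ...) follow
# auto-generated step names. sklearn's built-in VotingClassifier behaves
# much the same; a hedged equivalent setup is sketched below -- note the
# param-grid keys change to the explicit names given to the estimators.
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV


def build_voting_grid(logistic_regression, decision_tree, knn):
    majority_vote = VotingClassifier(
        estimators=[('lr', logistic_regression),
                    ('dt', decision_tree),
                    ('knn', knn)],
        voting='soft')  # average predict_proba outputs, as the ROC loop needs
    param_grid = {
        'dt__max_depth': [1, 2],
        'lr__C': [0.001, 0.1, 100.0],
    }
    return GridSearchCV(estimator=majority_vote, param_grid=param_grid,
                        cv=10, scoring='accuracy')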