Example No. 1
def gradient_boosting():
    
    X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2, centers = 8, cluster_std = 1.3, random_state = 4)
    y_D2 = y_D2 % 2
    
    X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state = 0)
    fig, subaxes = plt.subplots(1, 1, figsize=(6, 6))

    clf = GradientBoostingClassifier().fit(X_train, y_train)
    title = 'GBDT, complex binary dataset, default settings'
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train, X_test, y_test, title, subaxes)
    plt.show()

    fruits = pd.read_table('fruit_data_with_colors.txt')
    feature_names_fruits = ['height', 'width', 'mass', 'color_score']
    X_fruits = fruits[feature_names_fruits]
    y_fruits = fruits['fruit_label']
    target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']
    
    X_train, X_test, y_train, y_test = train_test_split(X_fruits.to_numpy(), y_fruits.to_numpy(), random_state = 0)
    fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))
    
    pair_list = [[0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]
    for pair, axis in zip(pair_list, subaxes):
        X = X_train[:, pair]
        y = y_train
        clf = GradientBoostingClassifier().fit(X, y)
        title = 'GBDT, Fruit dataset, features {} & {}'.format(
            feature_names_fruits[pair[0]], feature_names_fruits[pair[1]])
        plot_class_regions_for_classifier_subplot(clf, X, y, None, None, title, axis, target_names_fruits)
        axis.set_xlabel(feature_names_fruits[pair[0]])
        axis.set_ylabel(feature_names_fruits[pair[1]])
    
    plt.tight_layout()
    plt.show()
    clf = GradientBoostingClassifier().fit(X_train, y_train)

    print('GBDT, Fruit dataset, default settings')
    print('Accuracy of GBDT classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
    print('Accuracy of GBDT classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))

    cancer = load_breast_cancer()
    (X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)    
    X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)

    clf = GradientBoostingClassifier(random_state = 0)
    clf.fit(X_train, y_train)
    print('Breast cancer dataset (learning_rate=0.1, max_depth=3)')
    print('Accuracy of GBDT classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
    print('Accuracy of GBDT classifier on test set: {:.2f}\n'.format(clf.score(X_test, y_test)))

    clf = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 2, random_state = 0)
    clf.fit(X_train, y_train)

    print('Breast cancer dataset (learning_rate=0.01, max_depth=2)')
    print('Accuracy of GBDT classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
    print('Accuracy of GBDT classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))
Example No. 2
def two_feature_classification():
    
    dataset = load_digits()
    X, y = dataset.data, dataset.target == 1
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Create a two-feature input vector matching the example plot above
    # We jitter the points (add a small amount of random noise) in case there are areas
    # in feature space where many instances have the same features.
    jitter_delta = 0.25
    X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta
    X_twovar_test  = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta

    clf = SVC(kernel = 'linear').fit(X_twovar_train, y_train)
    grid_values = {'class_weight':['balanced', {1:2},{1:3},{1:4},{1:5},{1:10},{1:20},{1:50}]}
    plt.figure(figsize=(9,6))
    for i, eval_metric in enumerate(('precision','recall', 'f1','roc_auc')):
        grid_clf_custom = GridSearchCV(clf, param_grid=grid_values, scoring=eval_metric)
        grid_clf_custom.fit(X_twovar_train, y_train)
        print('Grid best parameter (max. {0}): {1}'.format(eval_metric, grid_clf_custom.best_params_))
        print('Grid best score ({0}): {1}'.format(eval_metric, grid_clf_custom.best_score_))
        plt.subplots_adjust(wspace=0.3, hspace=0.3)
        plot_class_regions_for_classifier_subplot(grid_clf_custom, X_twovar_test, y_test, None, None, None,  plt.subplot(2, 2, i+1))
        plt.title(eval_metric+'-oriented SVC')
        plt.tight_layout()
    plt.show()
    
    y_scores = clf.decision_function(X_twovar_test)
    precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
    closest_zero = np.argmin(np.abs(thresholds))
    closest_zero_p = precision[closest_zero]
    closest_zero_r = recall[closest_zero]

    plot_class_regions_for_classifier(clf, X_twovar_test, y_test)
    plt.title("SVC, class_weight = 'balanced', optimized for accuracy")
    plt.show()

    plt.figure()
    plt.xlim([0.0, 1.01])
    plt.ylim([0.0, 1.01])
    plt.title ("Precision-recall curve: SVC, class_weight = 'balanced'")
    plt.plot(precision, recall, label = 'Precision-Recall Curve')
    plt.plot(closest_zero_p, closest_zero_r, 'o', markersize=12, fillstyle='none', c='r', mew=3)
    plt.xlabel('Precision', fontsize=16)
    plt.ylabel('Recall', fontsize=16)
    plt.gca().set_aspect('equal')
    plt.show()
    print('At zero threshold, precision: {:.2f}, recall: {:.2f}'.format(closest_zero_p, closest_zero_r))
Example No. 3
def svm():

    X_C2, y_C2 = make_classification(n_samples=100,
                                     n_features=2,
                                     n_redundant=0,
                                     n_informative=2,
                                     n_clusters_per_class=1,
                                     flip_y=0.1,
                                     class_sep=0.5,
                                     random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X_C2,
                                                        y_C2,
                                                        random_state=0)

    fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
    this_C = 1.0
    clf = SVC(kernel='linear', C=this_C).fit(X_train, y_train)
    title = 'Linear SVC, C = {:.3f}'.format(this_C)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,
                                              None, title, subaxes)

    X_train, X_test, y_train, y_test = train_test_split(X_C2,
                                                        y_C2,
                                                        random_state=0)
    fig, subaxes = plt.subplots(1, 2, figsize=(8, 4))

    for this_C, subplot in zip([0.00001, 100], subaxes):
        clf = LinearSVC(C=this_C).fit(X_train, y_train)
        title = 'Linear SVC, C = {:.5f}'.format(this_C)
        plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,
                                                  None, title, subplot)
    plt.tight_layout()

    cancer = load_breast_cancer()
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_cancer,
                                                        y_cancer,
                                                        random_state=0)
    clf = LinearSVC().fit(X_train, y_train)
    print('Breast cancer dataset')
    print('Accuracy of Linear SVC classifier on training set: {:.2f}'.format(
        clf.score(X_train, y_train)))
    print('Accuracy of Linear SVC classifier on test set: {:.2f}'.format(
        clf.score(X_test, y_test)))
Example No. 4
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

fig, subaxes = plt.subplots(4, 1, figsize=(6, 23))
for this_alpha, axis in zip([0.01, 0.1, 1.0, 5.0], subaxes):
    nnclf = MLPClassifier(solver='lbfgs',
                          activation='tanh',
                          alpha=this_alpha,
                          hidden_layer_sizes=[100, 100],
                          random_state=0)
    nnclf.fit(X_train, y_train)
    title = 'NN classifier, alpha = {:.3f}'.format(this_alpha)
    plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train, X_test,
                                              y_test, title, axis)
    plt.tight_layout()
Example No. 5
# ## Ensembles of Decision Trees

# ### Random forests

# In[ ]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)
fig, subaxes = plt.subplots(1, 1, figsize=(6, 6))

clf = RandomForestClassifier().fit(X_train, y_train)
title = 'Random Forest Classifier, complex binary dataset, default settings'
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, X_test,
                                          y_test, title, subaxes)

plt.show()

# ### Random forest: Fruit dataset

# In[ ]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X_fruits.to_numpy(),
                                                    y_fruits.to_numpy(),
                                                    random_state=0)
fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))
Example No. 6
#Gamma controls how far the influence of a single training example reaches: higher values of gamma make the kernel more local, so the classification regions follow individual observation points more tightly.
#C controls the amount of regularization of the SVM model. It is inversely proportional to the width of the classifier margin, so as C increases the model regularizes less and tries to fit the training points more closely.
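#The RBF kernel that gamma parameterizes is K(x, x') = exp(-gamma * ||x - x'||^2), so
#larger gamma makes similarity decay faster with distance. A minimal illustrative sketch
#(not part of the original script) checking the formula against scikit-learn's rbf_kernel,
#using the same gamma values as the grid below:
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

a = np.array([[0.0, 0.0]])
b = np.array([[1.0, 1.0]])
for g in (0.01, 1, 5):
    manual = np.exp(-g * np.sum((a - b) ** 2))  #direct evaluation of the kernel formula
    assert np.isclose(manual, rbf_kernel(a, b, gamma=g)[0, 0])
    print('gamma = {}: K(a, b) = {:.5f}'.format(g, manual))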

#Creating the plots for each iteration.
fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)

#Training iteratively over sets of values for the parameters gamma and C.
#For each combination, the classification regions and the test accuracy of the model are plotted.
for this_gamma, this_axis in zip([0.01, 1, 5], subaxes):

    for this_C, subplot in zip([0.1, 1, 15, 250], this_axis):
        title = 'gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_C)
        clf = SVC(kernel='rbf', gamma=this_gamma,
                  C=this_C).fit(X_train, y_train)
        plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                                  X_test, y_test, title,
                                                  subplot)
        plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

#5. Working on a real dataframe.
#Applying kernelized SVM to a real-life dataset.
#df holds the breast cancer dataframe.
df = pd.read_csv('breast_cancer_dataset.csv')

print(df.head(3))
print("df shape :" + str(df.shape))

#5.1. The dataset is downloaded from the scikit-learn repositories. Categorical columns have been removed from the dataframe for demonstration purposes.
#The dataframe is then converted into a NumPy array, with values shown in scientific notation (a sketch of this conversion follows below).
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y=True)
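
#A hedged sketch (not part of the original script) of the conversion described in 5.1:
#keeping the numeric columns of the df read from the CSV above, converting them to a
#NumPy array, and printing the values in scientific notation. The column handling is
#illustrative only and assumes the CSV layout.
import numpy as np

numeric_df = df.select_dtypes(include=[np.number])  #drop any remaining categorical columns
X_array = numeric_df.to_numpy()                     #dataframe -> NumPy array
np.set_printoptions(precision=3, suppress=False)    #allow scientific notation in the output
print(X_array[:3])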
Example No. 7
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    random_state=0)
fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))

pair_list = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
tree_max_depth = 4

for pair, axis in zip(pair_list, subaxes):
    X = X_train[:, pair]
    y = y_train

    clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
    title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)
    plot_class_regions_for_classifier_subplot(clf, X, y, None, None, title,
                                              axis, iris.target_names)

    axis.set_xlabel(iris.feature_names[pair[0]])
    axis.set_ylabel(iris.feature_names[pair[1]])

plt.tight_layout()
plt.show()

# Decision trees on a real-world dataset
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from adspy_shared_utilities import plot_feature_importances

X_train, X_test, y_train, y_test = train_test_split(X_cancer,
                                                    y_cancer,
                                                    random_state=0)
Example No. 8
from sklearn.svm import SVC
from adspy_shared_utilities import plot_class_regions_for_classifier_subplot

X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
this_C = 1.0
clf = SVC(kernel='linear', C=this_C).fit(X_train, y_train)
title = 'Linear SVC, C = {:.3f}'.format(this_C)
plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None, None,
                                          title, subaxes)
Example No. 9
from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import (plot_class_regions_for_classifier_subplot)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
y_fruits_apple = y_fruits_2d == 1  # make into a binary problem: apples vs everything else
X_train, X_test, y_train, y_test = (train_test_split(
    X_fruits_2d.to_numpy(), y_fruits_apple.to_numpy(), random_state=0))

clf = LogisticRegression(C=100).fit(X_train, y_train)
plot_class_regions_for_classifier_subplot(
    clf, X_train, y_train, None, None, 'Logistic regression \
for binary classification\nFruit dataset: Apple vs others', subaxes)

h = 6
w = 8
print('A fruit with height {} and width {} is predicted to be: {}'.format(
    h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))

h = 10
w = 7
print('A fruit with height {} and width {} is predicted to be: {}'.format(
    h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))
subaxes.set_xlabel('height')
subaxes.set_ylabel('width')

print('Accuracy of Logistic regression classifier on training set: {:.2f}'.
      format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'.format(
    clf.score(X_test, y_test)))
#binary classification
plt.figure()
plt.title('Binary Classification')
X_C1, y_C1 = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_clusters_per_class=1, flip_y=0.01, class_sep=0.5, random_state=0)
plt.scatter(X_C1[:, 0], X_C1[:, 1], c=y_C1, marker='o', s=50)  # Number of informative, redundant and repeated features must sum to less than the number of total features
#plt.show()

#Linear Support Vector Machine
#plt.figure()
X_train, X_test, y_train, y_test = train_test_split(X_C1, y_C1, random_state=0)
fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
c=1.0
svc = SVC(C=c, kernel='linear').fit(X_train, y_train)
lsvc = LinearSVC(C=c).fit(X_train, y_train)
title = 'SVC with Linear Kernel and C = {}'.format(c)
plot_class_regions_for_classifier_subplot(svc, X_train, y_train, None, None, title, subaxes)
#plt.show()
#plt.figure()
title = 'Linear SVC with C = {}'.format(c)
plot_class_regions_for_classifier_subplot(lsvc, X_train, y_train, None, None, title, subaxes)
#plt.show()

#Difference between the above two functions
#https://stackoverflow.com/questions/35076586/linearsvc-vs-svckernel-linear-conflicting-arguments
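#A minimal comparison sketch (synthetic data generated here only for the comparison) of why
#the two can disagree: SVC(kernel='linear') wraps libsvm and minimizes the standard hinge
#loss, while LinearSVC wraps liblinear, uses squared hinge loss by default, and also
#regularizes the intercept, so the fitted coefficients usually differ slightly.
from sklearn.svm import SVC, LinearSVC
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=100, n_features=2, n_informative=2,
                                     n_redundant=0, n_clusters_per_class=1,
                                     random_state=0)
svc_linear_kernel = SVC(kernel='linear', C=1.0).fit(X_demo, y_demo)
linear_svc = LinearSVC(C=1.0, max_iter=10000).fit(X_demo, y_demo)
print('SVC(kernel="linear") coef_:', svc_linear_kernel.coef_)
print('LinearSVC coef_:           ', linear_svc.coef_)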

#Multiclass classification
fruits = pd.read_table('fruit_data_with_colors.txt')

feature_names = ['height', 'width', 'mass', 'color_score']
target = ['fruit_label']
target_names_fruits = ['apple', 'mandarin', 'orange', 'lemon']
Example No. 11
print('Coef: ', polyreglin.coef_)
print('Intercept: ', polyreglin.intercept_)
print('Train score: ', polyreglin.score(X_train, y_train))
print('Test score: ', polyreglin.score(X_test, y_test))

print('Ridge Polynomial Regression ')
print('Coef: ', polyregrid.coef_)
print('Intercept: ', polyregrid.intercept_)
print('Train score: ', polyregrid.score(X_train, y_train))
print('Test score: ', polyregrid.score(X_test, y_test))

#logistic regression for fruits data set
fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))

X_fruits_2d = fruits[['height', 'width']]
y_fruits_2d = fruits['fruit_label']
y_apple = y_fruits_2d == 1 #choosing all apples

X_train, X_test, y_train, y_test = train_test_split(X_fruits_2d.to_numpy(), y_apple.to_numpy(), random_state=0)
logistic = LogisticRegression(C=100).fit(X_train, y_train)
plot_class_regions_for_classifier_subplot(logistic, X_train, y_train, None, None, 'Logistic Regression for Apples v/s Other Fruits', subaxes)
print('Fruit with height 6 width 8 is classified as ', ['not an apple', 'apple'][logistic.predict([[6, 8]])[0]])
print('Accuracy on training data {} \nAccuracy on test data {}'.format(logistic.score(X_train,y_train), logistic.score(X_test, y_test)))

#logistic regression on classification simple data set
X_train, X_test, y_train, y_test = train_test_split(X_C1, y_C1, random_state=0)
logistic_simple = LogisticRegression().fit(X_train, y_train)
plot_class_regions_for_classifier_subplot(logistic_simple, X_train, y_train, None, None, 'Logistic Regression for Simple Data Set', subaxes)
print('Accuracy for training {}\nAccuracy for test {}'.format(logistic_simple.score(X_train, y_train), logistic_simple.score(X_test, y_test)))

plt.show()
Example No. 12
def neural_network():
    
    #Synthetic dataset 1: single hidden layer
    X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2, centers = 8, cluster_std = 1.3, random_state = 4)
    y_D2 = y_D2 % 2

    X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)
    fig, subaxes = plt.subplots(3, 1, figsize=(6,18))
    for units, axis in zip([1, 10, 100], subaxes):
        nnclf = MLPClassifier(hidden_layer_sizes = [units], solver='lbfgs', random_state = 0).fit(X_train, y_train)    
        title = 'Dataset 1: Neural net classifier, 1 layer, {} units'.format(units)
        plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train, X_test, y_test, title, axis)
        plt.tight_layout()
    
    # Synthetic dataset 1: two hidden layers
    nnclf = MLPClassifier(hidden_layer_sizes = [10, 10], solver='lbfgs', random_state = 0).fit(X_train, y_train)
    plot_class_regions_for_classifier(nnclf, X_train, y_train, X_test, y_test, 'Dataset 1: Neural net classifier, 2 layers, 10/10 units')
    
    #Regularization parameter: alpha
    fig, subaxes = plt.subplots(4, 1, figsize=(6, 23))
    for this_alpha, axis in zip([0.01, 0.1, 1.0, 5.0], subaxes):
        nnclf = MLPClassifier(solver='lbfgs', activation = 'tanh', alpha = this_alpha, hidden_layer_sizes = [100, 100], random_state = 0).fit(X_train, y_train)
        title = 'Dataset 1: NN classifier, alpha = {:.3f}'.format(this_alpha)
        plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train, X_test, y_test, title, axis)
        plt.tight_layout()
    
    #The effect of different choices of activation function    
    X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)
    fig, subaxes = plt.subplots(3, 1, figsize=(6,18))
    for this_activation, axis in zip(['logistic', 'tanh', 'relu'], subaxes):
        nnclf = MLPClassifier(solver='lbfgs', activation = this_activation, alpha = 0.1, hidden_layer_sizes = [10, 10], random_state = 0).fit(X_train, y_train)
        title = 'Dataset 1: NN classifier, 2 layers 10/10, {} activation function'.format(this_activation)
        plot_class_regions_for_classifier_subplot(nnclf, X_train, y_train, X_test, y_test, title, axis)
        plt.tight_layout()
    
    #Neural networks: Regression
    plt.figure()
    plt.title('Sample regression problem with one input variable')
    X_R1, y_R1 = make_regression(n_samples = 100, n_features=1,n_informative=1, bias = 150.0, noise = 30, random_state=0)
    plt.scatter(X_R1, y_R1, marker= 'o', s=50)
    plt.show()
    
    fig, subaxes = plt.subplots(2, 3, figsize=(11,8), dpi=70)
    X_predict_input = np.linspace(-3, 3, 50).reshape(-1,1)
    X_train, X_test, y_train, y_test = train_test_split(X_R1[0::5], y_R1[0::5], random_state = 0)

    for thisaxisrow, thisactivation in zip(subaxes, ['tanh', 'relu']):
        for thisalpha, thisaxis in zip([0.0001, 1.0, 100], thisaxisrow):
            mlpreg = MLPRegressor(hidden_layer_sizes = [100,100], activation = thisactivation, alpha = thisalpha, solver = 'lbfgs').fit(X_train, y_train)
            y_predict_output = mlpreg.predict(X_predict_input)
            thisaxis.set_xlim([-2.5, 0.75])
            thisaxis.plot(X_predict_input, y_predict_output, '^', markersize = 10)
            thisaxis.plot(X_train, y_train, 'o')
            thisaxis.set_xlabel('Input feature')
            thisaxis.set_ylabel('Target value')
            thisaxis.set_title('MLP regression\nalpha={}, activation={}'.format(thisalpha, thisactivation))
            plt.tight_layout()
    
    #Application to real-world dataset for classification
    cancer = load_breast_cancer()
    (X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)  
    scaler = MinMaxScaler()

    X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = MLPClassifier(hidden_layer_sizes = [100, 100], alpha = 5.0, random_state = 0, solver='lbfgs').fit(X_train_scaled, y_train)
    print('Breast cancer dataset')
    print('Accuracy of NN classifier on training set: {:.2f}'.format(clf.score(X_train_scaled, y_train)))
    print('Accuracy of NN classifier on test set: {:.2f}'.format(clf.score(X_test_scaled, y_test)))
Example No. 13
def decisiontree():

    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=3)
    clf = DecisionTreeClassifier().fit(X_train, y_train)

    print(
        'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
            clf.score(X_train, y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(
        clf.score(X_test, y_test)))

    clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
    print('Accuracy of Decision Tree classifier (max_depth = 3) on training set: {:.2f}'.
          format(clf2.score(X_train, y_train)))
    print('Accuracy of Decision Tree classifier (max_depth = 3) on test set: {:.2f}'.format(
        clf2.score(X_test, y_test)))

    plot_decision_tree(clf, iris.feature_names, iris.target_names)
    plot_decision_tree(clf2, iris.feature_names, iris.target_names)

    plt.figure(figsize=(10, 4), dpi=80)
    plot_feature_importances(clf, iris.feature_names)
    plt.show()

    print('Feature importances: {}'.format(clf.feature_importances_))

    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=0)
    fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))
    pair_list = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
    tree_max_depth = 4

    for pair, axis in zip(pair_list, subaxes):
        X = X_train[:, pair]
        y = y_train
        clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
        title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)
        plot_class_regions_for_classifier_subplot(clf, X, y, None, None, title,
                                                  axis, iris.target_names)
        axis.set_xlabel(iris.feature_names[pair[0]])
        axis.set_ylabel(iris.feature_names[pair[1]])

    plt.tight_layout()
    plt.show()

    cancer = load_breast_cancer()
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_cancer,
                                                        y_cancer,
                                                        random_state=0)
    clf = DecisionTreeClassifier(max_depth=4,
                                 min_samples_leaf=8,
                                 random_state=0).fit(X_train, y_train)
    plot_decision_tree(clf, cancer.feature_names, cancer.target_names)

    print('Breast cancer dataset: decision tree')
    print('Accuracy of DT classifier on training set: {:.2f}'.format(
        clf.score(X_train, y_train)))
    print('Accuracy of DT classifier on test set: {:.2f}'.format(
        clf.score(X_test, y_test)))

    plt.figure(figsize=(10, 6), dpi=80)
    plot_feature_importances(clf, cancer.feature_names)
    plt.tight_layout()
    plt.show()
Example No. 14
def kernal():

    X_D2, y_D2 = make_blobs(n_samples=100,
                            n_features=2,
                            centers=8,
                            cluster_std=1.3,
                            random_state=4)
    X_train, X_test, y_train, y_test = train_test_split(X_D2,
                                                        y_D2,
                                                        random_state=0)
    plot_class_regions_for_classifier(SVC().fit(X_train, y_train), X_train,
                                      y_train, None, None,
                                      'Support Vector Classifier: RBF kernel')
    plot_class_regions_for_classifier(
        SVC(kernel='poly', degree=3).fit(X_train, y_train), X_train, y_train,
        None, None, 'Support Vector Classifier: Polynomial kernel, degree = 3')

    X_train, X_test, y_train, y_test = train_test_split(X_D2,
                                                        y_D2,
                                                        random_state=0)
    fig, subaxes = plt.subplots(3, 1, figsize=(4, 11))

    for this_gamma, subplot in zip([0.01, 1.0, 10.0], subaxes):
        clf = SVC(kernel='rbf', gamma=this_gamma).fit(X_train, y_train)
        title = 'Support Vector Classifier: \nRBF kernel, gamma = {:.2f}'.format(
            this_gamma)
        plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,
                                                  None, title, subplot)
        plt.tight_layout()

    X_train, X_test, y_train, y_test = train_test_split(X_D2,
                                                        y_D2,
                                                        random_state=0)
    fig, subaxes = plt.subplots(3, 4, figsize=(15, 10), dpi=50)

    for this_gamma, this_axis in zip([0.01, 1, 5], subaxes):
        for this_C, subplot in zip([0.1, 1, 15, 250], this_axis):
            title = 'gamma = {:.2f}, C = {:.2f}'.format(this_gamma, this_C)
            clf = SVC(kernel='rbf', gamma=this_gamma,
                      C=this_C).fit(X_train, y_train)
            plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                                      X_test, y_test, title,
                                                      subplot)
            plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)

    cancer = load_breast_cancer()
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_cancer,
                                                        y_cancer,
                                                        random_state=0)
    clf = SVC(C=10).fit(X_train, y_train)

    print('Breast cancer dataset (unnormalized features)')
    print('Accuracy of RBF-kernel SVC on training set: {:.2f}'.format(
        clf.score(X_train, y_train)))
    print('Accuracy of RBF-kernel SVC on test set: {:.2f}'.format(
        clf.score(X_test, y_test)))

    cancer = load_breast_cancer()
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_cancer,
                                                        y_cancer,
                                                        random_state=0)
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = SVC(C=10).fit(X_train_scaled, y_train)
    print('Breast cancer dataset (normalized with MinMaxScaler)')
    print('Accuracy of RBF-kernel SVC on training set: {:.2f}'.format(
        clf.score(X_train_scaled, y_train)))
    print('Accuracy of RBF-kernel SVC on test set: {:.2f}'.format(
        clf.score(X_test_scaled, y_test)))
jitter_delta = 0.25
X_twovar_train = X_train[:,[20,59]]+ np.random.rand(X_train.shape[0], 2) - jitter_delta
X_twovar_test  = X_test[:,[20,59]] + np.random.rand(X_test.shape[0], 2) - jitter_delta

clf = SVC(kernel = 'linear').fit(X_twovar_train, y_train)
grid_values = {'class_weight':['balanced', {1:2},{1:3},{1:4},{1:5},{1:10},{1:20},{1:50}]}
plt.figure(figsize=(9,6))
for i, eval_metric in enumerate(('precision','recall', 'f1','roc_auc')):
    grid_clf_custom = GridSearchCV(clf, param_grid=grid_values, scoring=eval_metric)
    grid_clf_custom.fit(X_twovar_train, y_train)
    print('Grid best parameter (max. {0}): {1}'
          .format(eval_metric, grid_clf_custom.best_params_))
    print('Grid best score ({0}): {1}'
          .format(eval_metric, grid_clf_custom.best_score_))
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    plot_class_regions_for_classifier_subplot(grid_clf_custom, X_twovar_test, y_test, None,
                                             None, None,  plt.subplot(2, 2, i+1))
    
    plt.title(eval_metric+'-oriented SVC')
plt.tight_layout()
plt.show()


# #### Precision-recall curve for the default SVC classifier (with balanced class weights)

# In[32]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from adspy_shared_utilities import plot_class_regions_for_classifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

#clf and grid_values are assumed to match the earlier grid-search block:
#a linear-kernel SVC and a class_weight grid from 'balanced' up to {1: 50}.
clf = SVC(kernel='linear')
grid_values = {'class_weight': ['balanced', {1: 2}, {1: 3}, {1: 4}, {1: 5},
                                {1: 10}, {1: 20}, {1: 50}]}
plt.figure(figsize=(9, 6))
for i, eval_metric in enumerate(('precision', 'recall', 'f1', 'roc_auc')):
    grid_clf_custom = GridSearchCV(clf,
                                   param_grid=grid_values,
                                   scoring=eval_metric)
    grid_clf_custom.fit(X_train, y_train)
    print('Grid best parameter (max. {0}): {1}'.format(
        eval_metric, grid_clf_custom.best_params_))
    print('Grid best score ({0}): {1}'.format(eval_metric,
                                              grid_clf_custom.best_score_))
    plt.subplots_adjust(wspace=0.3, hspace=0.3)
    plot_class_regions_for_classifier_subplot(grid_clf_custom, X_test, y_test,
                                              None, None, None,
                                              plt.subplot(2, 2, i + 1))

    plt.title(eval_metric + '-oriented SVC')
plt.tight_layout()
plt.show()
Example No. 17
# #### Logistic regression for binary classification on fruits dataset using height, width features (positive class: apple, negative class: others)

# In[ ]:

from sklearn.linear_model import LogisticRegression
from adspy_shared_utilities import (plot_class_regions_for_classifier_subplot)

fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
y_fruits_apple = y_fruits_2d == 1  # make into a binary problem: apples vs everything else
X_train, X_test, y_train, y_test = (train_test_split(
    X_fruits_2d.to_numpy(), y_fruits_apple.to_numpy(), random_state=0))

clf = LogisticRegression(C=100).fit(X_train, y_train)
plot_class_regions_for_classifier_subplot(
    clf, X_train, y_train, None, None, 'Logistic regression \
for binary classification\nFruit dataset: Apple vs others', subaxes)

h = 6
w = 8
print('A fruit with height {} and width {} is predicted to be: {}'.format(
    h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))

h = 10
w = 7
print('A fruit with height {} and width {} is predicted to be: {}'.format(
    h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))
subaxes.set_xlabel('height')
subaxes.set_ylabel('width')

print('Accuracy of Logistic regression classifier on training set: {:.2f}'.
      format(clf.score(X_train, y_train)))
Example No. 18
def logistic():

    fruits = pd.read_table('fruit_data_with_colors.txt')
    X_fruits_2d = fruits[['height', 'width']]
    y_fruits_2d = fruits['fruit_label']

    fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
    y_fruits_apple = y_fruits_2d == 1
    X_train, X_test, y_train, y_test = (train_test_split(
        X_fruits_2d.to_numpy(), y_fruits_apple.to_numpy(), random_state=0))
    clf = LogisticRegression(C=100).fit(X_train, y_train)
    plot_class_regions_for_classifier_subplot(
        clf, X_train, y_train, None, None,
        'Logistic regression for binary classification\nFruit dataset: Apple vs others',
        subaxes)

    h = 6
    w = 8
    print('A fruit with height {} and width {} is predicted to be: {}'.format(
        h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))

    h = 10
    w = 7
    print('A fruit with height {} and width {} is predicted to be: {}'.format(
        h, w, ['not an apple', 'an apple'][clf.predict([[h, w]])[0]]))
    subaxes.set_xlabel('height')
    subaxes.set_ylabel('width')

    print('Accuracy of Logistic regression classifier on training set: {:.2f}'.
          format(clf.score(X_train, y_train)))
    print('Accuracy of Logistic regression classifier on test set: {:.2f}'.
          format(clf.score(X_test, y_test)))

    X_C2, y_C2 = make_classification(n_samples=100,
                                     n_features=2,
                                     n_redundant=0,
                                     n_informative=2,
                                     n_clusters_per_class=1,
                                     flip_y=0.1,
                                     class_sep=0.5,
                                     random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X_C2,
                                                        y_C2,
                                                        random_state=0)
    fig, subaxes = plt.subplots(1, 1, figsize=(7, 5))
    clf = LogisticRegression().fit(X_train, y_train)
    title = 'Logistic regression, simple synthetic dataset C = {:.3f}'.format(
        1.0)
    plot_class_regions_for_classifier_subplot(clf, X_train, y_train, None,
                                              None, title, subaxes)

    print('Accuracy of Logistic regression classifier on training set: {:.2f}'.
          format(clf.score(X_train, y_train)))
    print('Accuracy of Logistic regression classifier on test set: {:.2f}'.
          format(clf.score(X_test, y_test)))

    X_train, X_test, y_train, y_test = (train_test_split(
        X_fruits_2d.to_numpy(), y_fruits_apple.to_numpy(), random_state=0))
    fig, subaxes = plt.subplots(3, 1, figsize=(4, 10))

    for this_C, subplot in zip([0.1, 1, 100], subaxes):
        clf = LogisticRegression(C=this_C).fit(X_train, y_train)
        title = 'Logistic regression (apple vs rest), C = {:.3f}'.format(
            this_C)

        plot_class_regions_for_classifier_subplot(clf, X_train, y_train,
                                                  X_test, y_test, title,
                                                  subplot)
    plt.tight_layout()

    cancer = load_breast_cancer()
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_cancer,
                                                        y_cancer,
                                                        random_state=0)
    clf = LogisticRegression().fit(X_train, y_train)
    print('Breast cancer dataset')
    print('Accuracy of Logistic regression classifier on training set: {:.2f}'.
          format(clf.score(X_train, y_train)))
    print('Accuracy of Logistic regression classifier on test set: {:.2f}'.
          format(clf.score(X_test, y_test)))