Example #1
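# These cells are excerpted from a longer notebook: `X_train`, `y_train`, and
# the unpruned classifier `clf` are defined in earlier cells not shown here.
# A minimal sketch of that assumed setup:

# In[ ]:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
import matplotlib.pyplot as plt

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                    random_state=3)
clf = DecisionTreeClassifier().fit(X_train, y_train)  # unpruned baseline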
# #### Setting max decision tree depth to help avoid overfitting

# In[ ]:

clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
    clf2.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(
    clf2.score(X_test, y_test)))

# #### Visualizing decision trees

# In[ ]:

plot_decision_tree(clf, iris.feature_names, iris.target_names)

# #### Pre-pruned version (max_depth = 3)

# In[ ]:

plot_decision_tree(clf2, iris.feature_names, iris.target_names)

# #### Feature importance

# In[ ]:

from adspy_shared_utilities import plot_feature_importances

plt.figure(figsize=(10, 4), dpi=80)
plot_feature_importances(clf, iris.feature_names)
def mess_with_iris():
    #from adspy_shared_utilities import plot_feature_importances
    from adspy_shared_utilities import plot_decision_tree
    from sklearn.model_selection import train_test_split
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.datasets import load_iris
    import pandas as pd
    import numpy as np

    iris = load_iris()
    #iris.feature_names#>> ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

    # separate data
    seplen, sepwid = iris.data[:, 0], iris.data[:, 1]
    petlen, petwid = iris.data[:, 2], iris.data[:, 3]

    # turn sklearn.datasets.base.Bunch data into pandas dataframe
    df = pd.DataFrame(np.c_[iris.data, iris.target],
                      columns=iris['feature_names'] + ['target'])

    # define new variable of (sepal length / sepal width)
    seplenwid = seplen / sepwid
    #seplenwid == (df['sepal length (cm)']/df['sepal width (cm)']).tolist()

    # make list into dataframe
    df_seplenwid = pd.DataFrame({'sepal (length/width)': seplenwid})

    # merge new dataframes to original dataframe, as columns
    df = df.join(df_seplenwid)
    df['petal (length/width)'] = (df['petal length (cm)'] /
                                  df['petal width (cm)'])

    # add more columns, comparing sepal and petal lengths and widths
    df['sep.len/pet.wid'] = df['sepal length (cm)'] / df['petal width (cm)']
    df['sep.wid/pet.len'] = df['sepal width (cm)'] / df['petal length (cm)']

    # separate df into data and target, for ML classifier
    df_no_target = df[[
        'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
        'sepal (length/width)', 'petal (length/width)', 'sep.len/pet.wid',
        'sep.wid/pet.len'
    ]]
    target = np.array(df['target'].tolist())

    # split data for ML classifier
    X_train, X_test, y_train, y_test = train_test_split(df_no_target,
                                                        target,
                                                        random_state=0)

    clf = DecisionTreeClassifier().fit(X_train, y_train)
    #print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
    #print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))
    #>> Accuracy of Decision Tree classifier on training set: 1.00
    #>> Accuracy of Decision Tree classifier on test set: 0.97
    """clf2 = DecisionTreeClassifier(max_depth = 3).fit(X_train, y_train)
    #print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(clf2.score(X_train, y_train)))
    #print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(clf2.score(X_test, y_test)))
    #>> Accuracy of Decision Tree classifier on training set: 0.98
    #>> Accuracy of Decision Tree classifier on test set: 0.95"""

    # list of the engineered feature names, used for the plots below
    feature_names_list = df_no_target.keys().tolist()

    # make bar plot of feature importance
    def feature_importance():
        from adspy_shared_utilities import plot_feature_importances
        import matplotlib.pyplot as plt
        ##################################################
    ### Jupyter notebooks need this line
        #%matplotlib notebook
        ##################################################
        #plt.clf()
        plt.figure(figsize=(10, 4), dpi=80)
        plot_feature_importances(clf, feature_names_list)
        #print('Feature importances: {}'.format(clf.feature_importances_))
        print('Feature importances: ' + ', '.join(
            '{:.3f}'.format(fi) for fi in clf.feature_importances_))
        plt.show()

    # call function to make bar plot
    feature_importance()

    #return iris.feature_names#>> ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
    return plot_decision_tree(clf, feature_names_list, iris.target_names)


#mess_with_iris()
Example #3
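# This snippet assumes the same iris setup sketched in Example #1 (load_iris,
# train_test_split, an unpruned clf fit on X_train/y_train, plus
# matplotlib.pyplot as plt and plot_decision_tree from adspy_shared_utilities).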
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}\n'
     .format(clf.score(X_test, y_test)))

# Setting max decision tree depth to help avoid overfitting:
clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

print('Accuracy of Decision Tree classifier (max_depth = 3) on training set: {:.2f}'
     .format(clf2.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier (max_depth = 3) on test set: {:.2f}\n'
     .format(clf2.score(X_test, y_test)))

# Visualizing decision trees (TODO: make it display):
plt.figure()
plot_decision_tree(clf, iris.feature_names, iris.target_names) # (Figure 22)
# Node color indicates the majority class in each node, with intensity showing
# how pure the node is; the 'value' list shows how many training samples of
# each class reach that node.

# Visualizing the pre-pruned version (max_depth = 3)
plt.figure()
plot_decision_tree(clf2, iris.feature_names, iris.target_names) # (Figure 23)

# Feature importance
from adspy_shared_utilities import plot_feature_importances

plt.figure(figsize=(10,4), dpi=80) # Dots per inch
plot_feature_importances(clf, iris.feature_names) # (Figure 24)
print("Feature importances: {}\n".format(clf.feature_importances_)) # Inherent property of classifier, not user-defined property

Example #4
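# This snippet assumes a breast-cancer setup like the one in Example #5; a
# minimal sketch of that assumed prelude:
from adspy_shared_utilities import plot_decision_tree
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    random_state=0)
clf = DecisionTreeClassifier(max_depth=4, min_samples_leaf=8,
                             random_state=0).fit(X_train, y_train)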
print('Breast cancer dataset: decision tree')
print('Accuracy of DT classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of DT classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

print('test score of the model: ' + str(clf.score(X_test, y_test)))


# 5. Tuning the model.
clf = DecisionTreeClassifier(random_state = 0)
parameters = {
    'max_depth': range(1, 12),
    'criterion': ('gini', 'entropy'),
    'max_features': ('sqrt', 'log2', None),  # 'auto' was removed in scikit-learn 1.3
    'min_samples_leaf': (2, 4, 6, 8, 10, 12),
    'min_samples_split': (2, 4, 6, 8, 10, 12)
}
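# The grid above spans 11 * 2 * 3 * 6 * 6 = 2376 parameter combinations.
# RandomizedSearchCV samples only n_iter of them (10 by default), whereas
# GridSearchCV would fit all 2376 combinations times cv folds.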

DT_grid = RandomizedSearchCV(clf, param_distributions=parameters, cv=5,
                             verbose=True)
DT_grid.fit(X_train, y_train)


# 6. Evaluating the score of the model.
print('train score of the model: ' + str(DT_grid.score(X_train, y_train)))
print('test score of the model: ' + str(DT_grid.score(X_test, y_test)))
print(DT_grid.best_estimator_)

plot_decision_tree(DT_grid.best_estimator_, cancer.feature_names, cancer.target_names)
Example #5
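# A minimal sketch of the imports this function assumes (the plotting helpers
# come from the course's adspy_shared_utilities module):
from adspy_shared_utilities import (plot_decision_tree,
                                    plot_feature_importances,
                                    plot_class_regions_for_classifier_subplot)
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
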
def decisiontree():

    iris = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=3)
    clf = DecisionTreeClassifier().fit(X_train, y_train)

    print(
        'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
            clf.score(X_train, y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(
        clf.score(X_test, y_test)))

    clf2 = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
    print(
        'Accuracy of Decision Tree classifier on training set: {:.2f}'.format(
            clf2.score(X_train, y_train)))
    print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(
        clf2.score(X_test, y_test)))

    plot_decision_tree(clf, iris.feature_names, iris.target_names)
    plot_decision_tree(clf2, iris.feature_names, iris.target_names)

    plt.figure(figsize=(10, 4), dpi=80)
    plot_feature_importances(clf, iris.feature_names)
    plt.show()

    print('Feature importances: {}'.format(clf.feature_importances_))

    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=0)
    fig, subaxes = plt.subplots(6, 1, figsize=(6, 32))
    pair_list = [[0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3]]
    tree_max_depth = 4

    for pair, axis in zip(pair_list, subaxes):
        X = X_train[:, pair]
        y = y_train
        clf = DecisionTreeClassifier(max_depth=tree_max_depth).fit(X, y)
        title = 'Decision Tree, max_depth = {:d}'.format(tree_max_depth)
        plot_class_regions_for_classifier_subplot(clf, X, y, None, None, title,
                                                  axis, iris.target_names)
        axis.set_xlabel(iris.feature_names[pair[0]])
        axis.set_ylabel(iris.feature_names[pair[1]])

    plt.tight_layout()
    plt.show()

    cancer = load_breast_cancer()
    X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X_cancer,
                                                        y_cancer,
                                                        random_state=0)
    clf = DecisionTreeClassifier(max_depth=4,
                                 min_samples_leaf=8,
                                 random_state=0).fit(X_train, y_train)
    plot_decision_tree(clf, cancer.feature_names, cancer.target_names)

    print('Breast cancer dataset: decision tree')
    print('Accuracy of DT classifier on training set: {:.2f}'.format(
        clf.score(X_train, y_train)))
    print('Accuracy of DT classifier on test set: {:.2f}'.format(
        clf.score(X_test, y_test)))

    plt.figure(figsize=(10, 6), dpi=80)
    plot_feature_importances(clf, cancer.feature_names)
    plt.tight_layout()
    plt.show()