import numpy as np
from sklearn import tree
from sklearn.datasets import load_boston


def plot_pruned_path(scores, with_std=True):
    """Plot the cross validated scores versus the number of leaves of trees.

    Parameters
    ----------
    scores : sequence of sequences of float
        One entry per pruning level, each holding the individual scores
        measured for that level. The first entry is drawn at
        ``len(scores) + 1`` leaves and the last one at 2 leaves.
    with_std : bool, default True
        When true, two grey curves are added at the mean plus and minus
        two standard errors.
    """
    import matplotlib.pyplot as plt

    means = np.array([np.mean(s) for s in scores])
    # Standard error of each mean, computed from that entry's own sample
    # count. Indexing a fixed entry (scores[1]) raised IndexError when a
    # single pruning level was passed and was wrong for ragged input.
    std_errs = np.array([np.std(s) / np.sqrt(len(s)) for s in scores])

    # The x axis counts leaves downwards: len(scores) + 1, ..., 3, 2.
    x = range(len(scores) + 1, 1, -1)

    plt.plot(x, means)
    if with_std:
        plt.plot(x, means + 2 * std_errs, lw=1, c='0.7')
        plt.plot(x, means - 2 * std_errs, lw=1, c='0.7')

    plt.xlabel('Number of leaves')
    plt.ylabel('Cross validated score')


# NOTE(review): load_boston was removed from scikit-learn in 1.2, and
# tree.prune_path does not appear to be part of released scikit-learn
# (presumably a pruning development branch) -- confirm the target version.
boston = load_boston()
clf = tree.DecisionTreeRegressor(max_depth=8)

# Compute the cross validated scores
scores = tree.prune_path(clf, boston.data, boston.target,
                         max_n_leaves=20, n_iterations=10, random_state=0)

plot_pruned_path(scores)
# NOTE(review): the four plotting calls below look like the tail of a
# plotting helper whose `def` line (and the definitions of `x`, `means`
# and `stds`) are not part of this snippet. They are kept verbatim.
plt.plot(x, means + 2 * stds, lw=1, c='0.7')
plt.plot(x, means - 2 * stds, lw=1, c='0.7')
plt.xlabel('Number of leaves')
plt.ylabel('Cross validated score')

# Create a random dataset: 80 sorted samples drawn from [0, 5) with a
# sine target, then perturb every fifth target starting at index 1
# (16 values in total).
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[1::5] += 3 * (0.5 - rng.rand(16))

# Cross validated scores along the pruning path of a deep tree.
clf = tree.DecisionTreeRegressor(max_depth=20)
scores = tree.prune_path(
    clf, X, y,
    max_n_leaves=20, n_iterations=100, random_state=0,
)
plot_pruned_path(scores)

# Fit a tree limited to 15 leaves, to be pruned step by step below.
clf = tree.DecisionTreeRegressor(max_depth=20, n_leaves=15)
clf.fit(X, y)
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]

# Prepare the different pruned levels.
# NOTE(review): how the prune() argument relates to the y_<n> suffixes
# depends on prune()'s semantics, which are not visible here -- confirm.
y_15 = clf.predict(X_test)

clf = clf.prune(6)
y_7 = clf.predict(X_test)

clf = clf.prune(2)
y_2 = clf.predict(X_test)
# NOTE(review): the three plotting calls below look like the tail of a
# plotting helper whose `def` line (and the definitions of `x`, `means`
# and `stds`) are not part of this snippet. They are kept verbatim.
plt.plot(x, means - 2 * stds, lw=1, c='0.7')
plt.xlabel('Number of leaves')
plt.ylabel('Cross validated score')

# Create a random dataset: 80 sorted samples drawn from [0, 5) with a
# sine target, then perturb every fifth target starting at index 1
# (16 values in total).
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[1::5] += 3 * (0.5 - rng.rand(16))

# Cross validated scores along the pruning path of a deep tree.
clf = tree.DecisionTreeRegressor(max_depth=20)
scores = tree.prune_path(
    clf, X, y,
    max_n_leaves=20, n_iterations=100, random_state=0,
)
plot_pruned_path(scores)

# Fit a tree limited to 15 leaves, to be pruned step by step below.
clf = tree.DecisionTreeRegressor(max_depth=20, n_leaves=15)
clf.fit(X, y)
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]

# Prepare the different pruned levels.
# NOTE(review): how the prune() argument relates to the y_<n> suffixes
# depends on prune()'s semantics, which are not visible here -- confirm.
y_15 = clf.predict(X_test)

clf = clf.prune(6)
y_7 = clf.predict(X_test)

# NOTE(review): the snippet ends here; no prediction for this last
# pruning level is visible.
clf = clf.prune(2)
# NOTE(review): the statements up to plt.show() look like the body of a
# plotting helper (presumably plot_pruned_path, which is called at the end
# of this snippet) whose `def` line is not part of this snippet. They are
# kept verbatim; whether plt.show() belongs to the helper cannot be told
# from here.
import matplotlib.pyplot as plt
means = np.array([np.mean(s) for s in scores])
stds = np.array([np.std(s) for s in scores]) / np.sqrt(len(scores[1]))
x = range(len(scores) + 1, 1, -1)
plt.plot(x, means)
if with_std:
    plt.plot(x, means + 2 * stds, lw=1, c='0.7')
    plt.plot(x, means - 2 * stds, lw=1, c='0.7')
plt.xlabel('Number of leaves')
plt.ylabel('Cross validated score')
plt.show()

# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
# The files are opened in binary mode (required by pickle on Python 3) and
# through `with` so they are closed once loaded. The single-argument
# print(...) form prints the same text on Python 2 and Python 3.
print("loading training dataset features")
with open("data/traindata-allfeatures.list", "rb") as features_file:
    # Builtin float replaces np.float, which was only an alias for it and
    # has been removed from recent NumPy releases.
    traindata = np.asarray(pickle.load(features_file)).astype(float)

print("loading class labels for training dataset")
with open("data/target.list", "rb") as labels_file:
    target = np.asarray(pickle.load(labels_file)).astype(float)

clf = tree.DecisionTreeClassifier()

# Compute the cross validated scores
scores = tree.prune_path(clf, traindata, target,
                         max_n_leaves=10, n_iterations=5, random_state=0)

plot_pruned_path(scores)
def plot_pruned_path(scores, with_std=True):
    """Plot the cross validated scores versus the number of leaves of trees.

    Parameters
    ----------
    scores : sequence of sequences of float
        One entry per pruning level, each holding the individual scores
        measured for that level. The first entry is drawn at
        ``len(scores) + 1`` leaves and the last one at 2 leaves.
    with_std : bool, default True
        When true, two grey curves are added at the mean plus and minus
        two standard errors.
    """
    import matplotlib.pyplot as plt

    means = np.array([np.mean(s) for s in scores])
    # Standard error of each mean, computed from that entry's own sample
    # count. Indexing a fixed entry (scores[1]) raised IndexError when a
    # single pruning level was passed and was wrong for ragged input.
    std_errs = np.array([np.std(s) / np.sqrt(len(s)) for s in scores])

    # The x axis counts leaves downwards: len(scores) + 1, ..., 3, 2.
    x = range(len(scores) + 1, 1, -1)

    plt.plot(x, means)
    if with_std:
        plt.plot(x, means + 2 * std_errs, lw=1, c='0.7')
        plt.plot(x, means - 2 * std_errs, lw=1, c='0.7')

    plt.xlabel('Number of leaves')
    plt.ylabel('Cross validated score')


# NOTE(review): `np`, `tree` and `load_boston` are not imported anywhere in
# this snippet; they are assumed to be provided by the surrounding file.
boston = load_boston()
clf = tree.DecisionTreeRegressor(max_depth=8)

# Compute the cross validated scores
scores = tree.prune_path(clf, boston.data, boston.target,
                         max_n_leaves=20, n_iterations=10, random_state=0)

plot_pruned_path(scores)