def DecisionTreeRegressor(self):
    """Search the cost-complexity pruning path for the lowest-nescience tree.

    The pruning path is computed on ``self.X_`` / ``self.y_`` and its
    ``ccp_alphas`` are visited in reverse order.  For each alpha a fresh
    tree is fitted; candidates whose node count equals that of the
    previously fitted tree are skipped.  The search stops at the first
    scored candidate whose nescience is not lower than the best seen so far.

    Returns:
        A tuple ``(best_nsc, best_model, None)``.  ``best_nsc`` starts at 1,
        so ``best_model`` is ``None`` when no candidate scores below 1.

    NOTE(review): this definition shares its name with the estimator class
    it instantiates; presumably it is a method, so the calls in the body
    resolve to the module-level class -- confirm.
    """
    pruning_path = DecisionTreeRegressor(
        random_state=self.random_state
    ).cost_complexity_pruning_path(self.X_, self.y_)

    best_nsc, best_model = 1, None
    last_node_count = -1

    # Walk every possible pruning point in reverse order.
    for alpha in reversed(pruning_path.ccp_alphas):
        candidate = DecisionTreeRegressor(
            ccp_alpha=alpha, random_state=self.random_state
        )
        candidate.fit(self.X_, self.y_)

        # Same node count as the previous fit: nothing changed, do not re-score.
        node_count = candidate.tree_.node_count
        if node_count == last_node_count:
            continue
        last_node_count = node_count

        candidate_nsc = self.nescience_.nescience(candidate)
        # Written as "not <" so that a non-comparable score (e.g. NaN) also stops.
        if not candidate_nsc < best_nsc:
            break
        best_nsc, best_model = candidate_nsc, candidate

    return (best_nsc, best_model, None)
def item_E_prunning(train_x, train_y, val_x, val_y, plot_option=False):
    """Sweep cost-complexity pruning alphas, plot the results, and score a fixed tree.

    One ``arbol_decision`` wrapper is trained per ``ccp_alpha`` on the pruning
    path of a ``DecisionTreeRegressor(random_state=0)``.  Train/validation
    error and tree depth are then drawn against alpha on a twin-axis figure.
    Finally a tree with ``max_depth=5, ccp_alpha=0.2`` is trained, used to
    predict ``val_x``, and its errors are reported through ``acc_error``.

    Args:
        train_x, train_y: Training features and targets.
        val_x, val_y: Validation features and targets.
        plot_option: When truthy, also save the prediction plot of the final
            tree to ``"E_tree_optimo.pdf"``.

    Returns:
        None.
    """
    E_tree = DecisionTreeRegressor(random_state=0)
    ccp_alphas = E_tree.cost_complexity_pruning_path(train_x, train_y).ccp_alphas

    # Train one wrapped tree per candidate alpha.
    regressor_forest = []
    for ccp_alpha in ccp_alphas:
        regressor_tree = arbol_decision(
            DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
        )
        regressor_tree.train_tree(train_x, train_y)
        regressor_forest.append(regressor_tree)

    max_depth = [arbol.tree.tree_.max_depth for arbol in regressor_forest]
    train_scores = [arbol.error(train_x, train_y) for arbol in regressor_forest]
    test_scores = [arbol.error(val_x, val_y) for arbol in regressor_forest]

    # The last two entries of the pruning path are left out of every curve.
    fig, ax = plt.subplots()
    ax.set_ylabel("error")
    ax.set_xlabel("ccp $\\alpha$")
    ax.plot(ccp_alphas[:-2], train_scores[:-2], marker='o', label="train",
            drawstyle="steps-post", color=cmap(1), alpha=0.7)
    ax.plot(ccp_alphas[:-2], test_scores[:-2], marker='s', label="val",
            drawstyle="steps-post", color=cmap(2), alpha=0.7)
    ax.legend(loc='center')

    # Tree depth goes on a secondary y-axis sharing the alpha axis.
    ax2 = ax.twinx()
    ax2.plot(ccp_alphas[:-2], max_depth[:-2], marker='^', label="profundidad",
             drawstyle="steps-post", color=cmap(3), alpha=0.7)
    ax2.set_ylabel("Profundidad")
    ax2.legend(loc='center right')

    # Hand-picked "optimal" configuration.
    E_tree_optimo = arbol_decision(DecisionTreeRegressor(max_depth=5, ccp_alpha=0.2))
    E_tree_optimo.train_tree(train_x, train_y)
    y_pred = E_tree_optimo.test_tree(val_x)

    if plot_option:
        E_tree_optimo.plot_save_tree(val_y, y_pred, "E_tree_optimo.pdf")
    # NOTE(review): assumes the figure is always shown (the sweep figure is
    # always built above) -- confirm plt.show() was not meant to be gated by
    # plot_option as well.
    plt.show()

    E_tree_optimo.acc_error(y_pred, train_x, train_y, val_x, val_y)
# Adds a grid to the plot plt.grid() # X-axis label plt.ylabel('RMS error') # Y-axis label plt.xlabel('Depth') # Export the plot plt.savefig('error.png') ## The below code contains pruning the decision tree regr = DecisionTreeRegressor(max_depth=10) X_train, X_test, y_train, y_test = process_input() # This is the function which returns ccp alphas and impurity of leaves path = regr.cost_complexity_pruning_path(X_train, y_train) # This will store the alphas and their corresponding impurities ccp_alphas, impurities = path.ccp_alphas, path.impurities plt.figure(figsize=(10, 6)) plt.plot(ccp_alphas, impurities) plt.xlabel("Effective alpha") plt.ylabel("Total Impurity of Leaves") plt.savefig('AlphavsImpurity.png') regrs = [] # Build trees based on different CCP values for ccp_alpha in ccp_alphas: regr = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha, max_depth=10)
# Evaluate the fitted tree `dt` on the test split
predictions_dt = dt.predict(x_test)
# NOTE(review): the return value of score() is discarded here
dt.score(x_test, y_test)
mse = mean_squared_error(y_test, predictions_dt)
# Root of the mean squared error
rmse = mse**(1 / 2)
print(mse)
print(rmse)

# ---------- Checking the score by changing the complexity parameter
from sklearn import tree

plt.figure(figsize=(7, 4))
tree.plot_tree(dt, filled=True)
# Effective alphas and total leaf impurities along the pruning path of `dt`
path = dt.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# ---- The weakest link is characterized by an effective alpha,
# where the nodes with the smallest effective alpha are pruned first
dts = []
# Fit one tree per alpha; note that this loop rebinds the name `dt`, so the
# tree evaluated above is no longer reachable through it afterwards
for ccp_alpha in ccp_alphas:
    dt = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
    dt.fit(x_train, y_train)
    dts.append(dt)
print("Number of nodes in the last tree is :{} with ccp_alpha : {}".format(
    dts[-1].tree_.node_count, ccp_alphas[-1]))
# Score every tree of the sweep on both splits
train_scores = [dt.score(x_train, y_train) for dt in dts]
test_scores = [dt.score(x_test, y_test) for dt in dts]
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
#select training Y data from Y-996 Y_train = Y_996[index_996] #select the remaining X as testing data X_test = np.delete(X_996, index_996, axis=0) #select the remaining Y as testing data Y_test = np.delete(Y_996, index_996, axis=0) #assign to new variables X_resample = X_train Y_resample = Y_train #call function to find alpha values model = regr.cost_complexity_pruning_path(X_resample, Y_resample.ravel()) #find alpha and impurities ccp_alphas, impurities = model.ccp_alphas, model.impurities for i in range(0, len(ccp_alphas)): if ccp_alphas[i] < 0: #in very unusual cases, the alpha values are negative, find it and make it zero ind_minus_alpha_row.append(m) ind_minus_alpha_column.append(i) minus_alpha.append(ccp_alphas[i]) ccp_alphas[i] = 0 #save alpha, shape = (tree_num, len(seies)) alpha_all.append(ccp_alphas)
#visualize the tree fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 10)) tree.plot_tree(dtr, filled=True) plt.show() # Another alternative is given by pruning the tree which is controlled by setting $\alpha$. Reasonable ranges for $\alpha$ also depend on the data and need to be tested anyway when optimizing the hyperparameter. For instance, a value $\alpha = 0.05$ results in the following structure. # In[5]: #we may use of tree pruning which is controlled by ccp_alpha dtr = DecisionTreeRegressor(ccp_alpha=0.05) #fit the tree dtr.fit(X, y) path = dtr.cost_complexity_pruning_path(X, y) ccp_alphas, impurities = path.ccp_alphas, path.impurities #visualize the tree fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 10)) tree.plot_tree(dtr, filled=True) plt.show() # ## Decision Trees # # So far, we examined a regression problem. How do we derive trees for classification problems? More or less in the same manner, however, we can not use $RSS$ to evaluate model performance. Instead, we need a loss function which is suited for the classification problem. Intuitively, we may want to minimize the classification error. However, it has been found that this does not lead to qualitative tree structures. Instead, at each split either the **Gini index** or **cross-entropy** is used to evaluate the quality of the split. The Gini index $G$ is given by: # # $$ # G = \sum_k \hat{p}_{lk} (1 - \hat{p}_{lk}) # $$ #
# Fit a depth- and leaf-size-limited regression tree and report its error
# (as computed by RMSE) on the held-out split
model = DecisionTreeRegressor(random_state = 1, max_depth = 20, min_samples_split = 6, min_samples_leaf = 2)
model.fit(X_trspl, y_trspl)
y_pred = model.predict(X_tespl)
rmse = RMSE(y_tespl, y_pred)
print(rmse)

# Horizontal bar chart of the 20 largest feature importances
feat_importances = pd.Series(model.feature_importances_, index=X_trspl.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.title('Feature Important based on Decision Tree regressor')
plt.xlabel('feature scores')
plt.ylabel('feature names')
plt.show()

# Total impurity of leaves vs effective alphas of pruned tree
path = model.cost_complexity_pruning_path(X_trspl, y_trspl)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
fig, ax = plt.subplots()
# The last point of the path is left out of the curve
ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")

# Fit one tree per alpha on the pruning path
# NOTE(review): these trees use random_state=0 and default depth/leaf limits,
# unlike the tree the path was computed from (random_state=1, max_depth=20,
# min_samples_split=6, min_samples_leaf=2) -- confirm this is intended.
# The loop also rebinds the name `model`.
models = []
for ccp_alpha in ccp_alphas:
    model = DecisionTreeRegressor(random_state=0, ccp_alpha=ccp_alpha)
    model.fit(X_trspl, y_trspl)
    models.append(model)
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
    models[-1].tree_.node_count, ccp_alphas[-1]))