def get_learning_curve(self, learner):
    """Plot train/validation error of ``learner`` against training-set size.

    For a series of increasing subset sizes, a subset of the training data
    is drawn with ``ms.train_test_split`` (fixed ``random_state=11``), the
    learner is scored with ``sm.kfold_validation(..., k=5)`` and the
    resulting accuracies are converted to errors (``1 - accuracy``).
    The curve is written to ``boosted_dt_learning_curve.png`` inside
    ``self.data.image_dir`` and then displayed.

    Args:
        learner: estimator object forwarded unchanged to
            ``sm.kfold_validation``.

    Returns:
        None. The result is the saved/displayed figure.
    """
    n_samples = self.data.x_train.shape[0]
    # Aim for roughly 25 points on the curve.  Floor division keeps the
    # step an integer whether or not true division is enabled, and the
    # clamp to 1 avoids a zero step (which xrange rejects) when there are
    # fewer than 25 training rows.
    step_size = max(1, n_samples // 25)
    steps = xrange(10, n_samples, step_size)

    train_errors = []
    validation_errors = []
    for i in steps:
        # Progress output: each k-fold evaluation can be slow.
        print(i)
        # Only the "train" side of the split is used; the remainder is
        # discarded.  The fixed seed keeps the subsets reproducible.
        x_temp, x_remainder, y_temp, y_remainder = ms.train_test_split(
            self.data.x_train, self.data.y_train,
            train_size=i, random_state=11)
        train_accuracy, validation_accuracy = sm.kfold_validation(
            learner, x_temp, y_temp, k=5)
        train_errors.append(1 - train_accuracy)
        validation_errors.append(1 - validation_accuracy)

    img_path = self.data.image_dir + 'boosted_dt_learning_curve.png'
    plt.plot(steps, train_errors, label='Train Error')
    plt.plot(steps, validation_errors, label='Validation Error')
    plt.xlabel('Training Examples')
    plt.ylabel('Error')
    plt.title(self.data.name + ' Boosted Decision Tree Learning Curve')
    plt.legend()
    # Save before showing so the figure is written while it still exists.
    plt.savefig(img_path)
    plt.show()
    plt.close()
def find_optimal_num_estimators(self, base_learner, learning_rate):
    """Plot AdaBoost accuracy as a function of the number of estimators.

    Each candidate ensemble size is wrapped around ``base_learner`` in an
    ``ensemble.AdaBoostClassifier`` and scored on the full training data
    with ``sm.kfold_validation(..., k=5)``.  The two resulting curves are
    saved to ``boosted_dt_num_estimators.png`` in ``self.data.image_dir``.

    Args:
        base_learner: estimator passed as the first argument of
            ``AdaBoostClassifier``.
        learning_rate: value forwarded as ``learning_rate`` to
            ``AdaBoostClassifier``.

    Returns:
        None. The result is the saved figure; nothing is displayed.
    """
    estimator_counts = [1, 25, 50, 75, 100, 150, 200]
    print(estimator_counts)

    # One (train accuracy, validation accuracy) pair per ensemble size.
    fold_scores = []
    for count in estimator_counts:
        # Progress output, emitted before the evaluation of each size.
        print(count)
        booster = ensemble.AdaBoostClassifier(
            base_learner, n_estimators=count, learning_rate=learning_rate)
        fold_scores.append(
            sm.kfold_validation(
                booster, self.data.x_train, self.data.y_train, k=5))

    train_curve = [train_score for train_score, _ in fold_scores]
    validation_curve = [valid_score for _, valid_score in fold_scores]

    figure_title = self.data.name + ' Boosted DT Accuracy'
    output_file = self.data.image_dir + 'boosted_dt_num_estimators.png'

    plt.plot(estimator_counts, train_curve, label='Train Accuracy')
    plt.plot(estimator_counts, validation_curve, label='Validation Accuracy')
    plt.xlabel('# Estimators')
    plt.ylabel('Accuracy')
    plt.title(figure_title)
    plt.legend()
    # The figure is only written to disk here; interactive display is
    # intentionally left off for this plot.
    plt.savefig(output_file)
    plt.close()
def get_optimal_leaf_size(self, leaf_sizes):
    """Evaluate decision trees over ``leaf_sizes`` and pick the best one.

    For every candidate a gini ``tree.DecisionTreeClassifier`` with that
    ``min_samples_leaf`` is scored on the training data with
    ``sm.kfold_validation(..., k=5)``.  Train and validation accuracy are
    plotted, saved to ``dt_leaf_size_accuracy.png`` in
    ``self.data.image_dir`` and displayed.

    Args:
        leaf_sizes: iterable of candidate ``min_samples_leaf`` values, in
            the order they should be evaluated and plotted.

    Returns:
        Tuple ``(best_validation_accuracy, optimal_leaf_size)``.  As in the
        earlier implementation, the first candidate is excluded from the
        selection whenever more than one candidate is given (presumably
        because the smallest leaf size overfits -- confirm with callers).

    Raises:
        ValueError: if ``leaf_sizes`` is empty (raised by numpy when
            selecting the best entry of an empty array).
    """
    # Materialise once so the candidates can be iterated, plotted and
    # indexed consistently, whatever kind of iterable was passed.
    leaf_sizes = list(leaf_sizes)

    train_accuracies = []
    validation_accuracies = []
    for leaf_size in leaf_sizes:
        dtl = tree.DecisionTreeClassifier(
            criterion='gini', min_samples_leaf=leaf_size)
        train_accuracy, validation_accuracy = sm.kfold_validation(
            dtl, self.data.x_train, self.data.y_train, k=5)
        train_accuracies.append(train_accuracy)
        validation_accuracies.append(validation_accuracy)

    img_path = self.data.image_dir + 'dt_leaf_size_accuracy.png'
    plt.plot(leaf_sizes, train_accuracies, label='Train Accuracy')
    plt.plot(leaf_sizes, validation_accuracies, label='Validation Accuracy')
    plt.xlabel('Leaf Size')
    plt.ylabel('Accuracy')
    plt.title(self.data.name + ' Decision Tree Accuracy by Leaf Size')
    plt.legend()
    # Save BEFORE show: once the interactive window is closed the figure
    # may be gone, and saving afterwards can write an empty image.
    plt.savefig(img_path)
    plt.show()
    plt.close()

    scores = np.array(validation_accuracies)
    # Skip the first candidate when there is an alternative; with a single
    # candidate there is nothing to skip.
    first = 1 if len(leaf_sizes) > 1 else 0
    best_index = first + scores[first:].argmax()
    # Map the winning position back to the actual candidate value rather
    # than assuming the candidates are exactly 1, 2, 3, ...
    return scores[best_index], leaf_sizes[best_index]
def show_learning_curve(self, learner):
    """Plot train/validation error of ``learner`` versus training-set size.

    Subset sizes start at 10 and grow in steps of 25 up to (but not
    including) the number of training rows.  Each subset is drawn with
    ``ms.train_test_split`` using ``random_state=11`` and scored with
    ``sm.kfold_validation(..., k=5)``; errors are ``1 - accuracy``.  The
    figure is saved as ``dt_learning_curve.png`` in
    ``self.data.image_dir`` and then shown.

    Args:
        learner: estimator object forwarded unchanged to
            ``sm.kfold_validation``.

    Returns:
        None. The result is the saved/displayed figure.
    """
    features = self.data.x_train
    labels = self.data.y_train
    subset_sizes = xrange(10, features.shape[0], 25)

    train_curve = []
    validation_curve = []
    for size in subset_sizes:
        # train_test_split yields (x_kept, x_rest, y_kept, y_rest); only
        # the kept portion takes part in the evaluation.
        x_kept, _x_rest, y_kept, _y_rest = ms.train_test_split(
            features, labels, train_size=size, random_state=11)
        fit_score, held_out_score = sm.kfold_validation(
            learner, x_kept, y_kept, k=5)
        train_curve.append(1 - fit_score)
        validation_curve.append(1 - held_out_score)

    output_file = self.data.image_dir + 'dt_learning_curve.png'
    figure_title = self.data.name + ' Decision Tree Learning Curve'

    plt.plot(subset_sizes, train_curve, label='Train Error')
    plt.plot(subset_sizes, validation_curve, label='Validation Error')
    plt.xlabel('Training Examples')
    plt.ylabel('Error')
    plt.title(figure_title)
    plt.legend()
    plt.savefig(output_file)
    plt.show()
    plt.close()
def find_optimal_k(self):
    """Scan k = 1..149 for a KNN classifier and report the best k.

    For each neighbour count a ``KNeighborsClassifier`` (uniform weights,
    ``p=1``) is scored on the training data with
    ``sm.kfold_validation(..., k=5)``.  Both accuracy curves are plotted,
    saved as ``knn_num_neighbors.png`` in ``self.data.image_dir`` and
    shown.

    Returns:
        Tuple ``(best_validation_accuracy, optimal_num_neighbors)`` where
        the neighbour count is the first one reaching the highest
        validation accuracy.
    """
    neighbor_counts = xrange(1, 150)

    # One (train accuracy, validation accuracy) pair per neighbour count.
    fold_scores = []
    for count in neighbor_counts:
        classifier = skl.neighbors.KNeighborsClassifier(
            n_neighbors=count, weights='uniform', p=1)
        fold_scores.append(
            sm.kfold_validation(
                classifier, self.data.x_train, self.data.y_train, k=5))

    train_curve = [train_score for train_score, _ in fold_scores]
    validation_curve = [valid_score for _, valid_score in fold_scores]

    output_file = self.data.image_dir + 'knn_num_neighbors.png'
    figure_title = self.data.name + ' KNN Accuracy by # Neighbors'

    plt.plot(neighbor_counts, train_curve, label='Train Accuracy')
    plt.plot(neighbor_counts, validation_curve, label='Validation Accuracy')
    plt.xlabel('# Neighbors')
    plt.ylabel('Accuracy')
    plt.title(figure_title)
    plt.legend()
    plt.savefig(output_file)
    plt.show()
    plt.close()

    validation_array = np.array(validation_curve)
    best_accuracy = validation_array.max()
    # Candidates start at 1, so position p corresponds to k = p + 1.
    best_k = validation_array.argmax() + 1
    return best_accuracy, best_k