def bagging(X, Y, N, M, F):
    """Train N bootstrapped decision trees and return the best-scoring ones.

    Args:
        X: feature rows of the dataset.
        Y: labels parallel to X.
        N: number of bootstrap samples to draw; one tree is built per sample.
        M: passed to ``best_M`` as the number of trees to select.
        F: forwarded unchanged as the last argument of each tree's ``fit``.

    Returns:
        A list of the fitted trees whose keys ``best_M`` returned, in the
        iteration order of that result.
    """
    # Step 1: set a test partition aside; only the remainder is used here.
    x_remainder, _x_test, y_r, _y_test = myevaluation.train_test_split(X, Y)

    # Step 2: build one classifier per bootstrap sample and score it on the
    # remainder rows that are not (by value) part of that sample.
    forest = []
    accuracies = {}  # str(tree position in forest) -> validation accuracy
    for tree_index in range(N):
        x_train, y_train = compute_bootstrapped_sample(x_remainder, y_r)

        tree = my_class.MyDecisionTreeClassifier()
        tree.fit(x_train, y_train, True, F)

        held_out = [
            j for j, row in enumerate(x_remainder) if row not in x_train
        ]
        x_v = [x_remainder[j] for j in held_out]
        y_v = [y_r[j] for j in held_out]

        accuracies[str(tree_index)] = get_accuracy(y_v, tree.predict(x_v))
        forest.append(tree)

    # Step 3: keep the trees that best_M selects from the accuracy table.
    return [forest[int(key)] for key in best_M(M, accuracies)]
def tune_parameters(M, N, F, dataset):
    """Print random-forest accuracy for five random train/test splits.

    Args:
        M: forwarded as the second argument of ``MyRandomForestClassifier.fit``.
        N: forwarded as the third argument of ``MyRandomForestClassifier.fit``.
        F: passed to ``select_random_attributes`` together with
            ``dataset.data`` to produce the attribute-reduced dataset.
        dataset: object exposing the raw rows as ``dataset.data``.

    Returns:
        None. The parameter values and, per repetition, the accuracy and
        error rate are written to stdout.
    """
    print("M =", M, "N =", N, "F =", F)
    adjusted_dataset = select_random_attributes(F, dataset.data)
    for trial in range(5):
        X, y = split_x_y_train(adjusted_dataset)
        x_train, x_test, y_train, y_test = myevaluation.train_test_split(
            X, y, shuffle=True)

        # Build the label-terminated rows as NEW lists. Appending to the rows
        # of x_train in place would also alter any structure sharing those
        # row objects (e.g. the dataset reused by the next repetition).
        remainder = [
            list(row) + [label] for row, label in zip(x_train, y_train)
        ]

        myRF = MyRandomForestClassifier()
        myRF.fit(remainder, M, N)
        y_predict_rf = myRF.predict(x_test)

        # Predictions and true labels are compared after both have been
        # mapped through get_useful_bin.
        count = sum(
            1
            for predicted, actual in zip(y_predict_rf, y_test)
            if get_useful_bin(predicted) == get_useful_bin(actual)
        )
        total = len(y_predict_rf)
        accuracy = count / total
        error = (total - count) / total
        print(trial, "-- accuracy =", accuracy, "error =", error)
def fit(self, X_train, y_train):
    """Fits a random forest to X_train and y_train: builds self.N trees from
        bootstrap samples and keeps the self.M most accurate ones.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples

    Notes:
        Reads self.N (trees to build), self.M (trees to keep) and self.F
            (forwarded to myutils.tdidt_forest).
        Stores X_train and y_train on the instance and replaces self.trees
            with the selected trees, best first. Trees left over from an
            earlier call to fit are discarded, not re-evaluated.
        Attribute indexes are used to construct default attribute names
            (e.g. "att0", "att1", ...).
        If self.M exceeds the number of trees built, all built trees are kept.
    """
    header = ['att' + str(i) for i in range(len(X_train[0]))]
    attribute_domains = {
        name: myutils.unique_index(X_train, column)
        for column, name in enumerate(header)
    }

    self.X_train = X_train
    self.y_train = y_train

    # Hold out a third of the data to score every candidate tree.
    (sample_X_train, sample_x_test,
     sample_y_train, sample_y_test) = myevaluation.train_test_split(
        X_train, y_train, test_size=0.33, shuffle=True)
    train = [
        row + [label] for row, label in zip(sample_X_train, sample_y_train)
    ]

    # Collect candidates in a fresh list so that refitting never mixes in
    # trees produced by a previous fit.
    candidates = []
    for _ in range(self.N):
        candidates.append(
            myutils.tdidt_forest(
                myutils.compute_bootstrapped_sample(train),
                header.copy(), attribute_domains, header, self.F))

    # The validation header is the same for every tree; build it once.
    if sample_x_test:
        validation_header = [
            'att' + str(i) for i in range(len(sample_x_test[0]))
        ]
    else:
        validation_header = header

    scored = []
    for index, tree in enumerate(candidates):
        hits = sum(
            1
            for row, actual in zip(sample_x_test, sample_y_test)
            if myutils.tdidt_predict(validation_header, tree, row) == actual
        )
        # Without validation rows no accuracy can be measured; score 0.0
        # instead of dividing by zero.
        accuracy = hits / len(sample_y_test) if sample_y_test else 0.0
        scored.append((accuracy, index))

    # Highest accuracy first; ties go to the tree that was built later.
    scored.sort(reverse=True)
    self.trees = [candidates[index] for _, index in scored[:max(self.M, 0)]]
def bagging(X, Y, N, M, F):
    """Bag N decision trees, keep the best ones and score their majority vote.

    Args:
        X: feature rows of the dataset.
        Y: labels parallel to X.
        N: number of bootstrap samples to draw; one tree is built per sample.
        M: passed to ``best_M`` as the number of trees to select.
        F: forwarded unchanged as the last argument of each tree's ``fit``.

    Returns:
        A 3-tuple ``(best_trees, voted_predictions, forest_accuracy)``: the
        selected trees, the voted prediction for each test instance, and the
        value of ``get_accuracy`` for those predictions against the test
        labels.
    """
    # Step 1: split into a test set and the remainder used for training.
    x_remainder, x_test, y_r, y_test = myevaluation.train_test_split(X, Y)

    # Step 2: one tree per bootstrap sample, validated on the remainder rows
    # that are not (by value) contained in that sample.
    forest = []
    accuracies = {}  # str(tree position in forest) -> validation accuracy
    for tree_index in range(N):
        x_train, y_train = compute_bootstrapped_sample(x_remainder, y_r)

        tree = my_class.MyDecisionTreeClassifier()
        tree.fit(x_train, y_train, True, F)

        held_out = [
            j for j, row in enumerate(x_remainder) if row not in x_train
        ]
        x_v = [x_remainder[j] for j in held_out]
        y_v = [y_r[j] for j in held_out]

        accuracies[str(tree_index)] = get_accuracy(y_v, tree.predict(x_v))
        forest.append(tree)

    # Step 3: keep the trees that best_M selects from the accuracy table.
    best_trees = [forest[int(key)] for key in best_M(M, accuracies)]

    # Step 4: every selected tree predicts the whole test set; the lists are
    # loaded into a MyPyTable and read back one column index per test
    # instance to collect that instance's votes.
    all_predictions = [tree.predict(x_test) for tree in best_trees]
    pred_mypy = MyPyTable(build_header(all_predictions), all_predictions)

    voted_predictions = []
    for instance_index in range(len(all_predictions[0])):
        vals, counts = get_freq_str(pred_mypy.get_column(instance_index))
        # First value carrying the highest count wins the vote.
        voted_predictions.append(vals[counts.index(max(counts))])

    forest_accuracy = get_accuracy(y_test, voted_predictions)
    return best_trees, voted_predictions, forest_accuracy
def test_random_forest_fit():
    """Smoke test: a forest fitted on the interview data predicts one label
    per test row and gets at least one of them right."""
    # interview dataset
    table = [
        ["Senior", "Java", "no", "no", "False"],
        ["Senior", "Java", "no", "yes", "False"],
        ["Mid", "Python", "no", "no", "True"],
        ["Junior", "Python", "no", "no", "True"],
        ["Junior", "R", "yes", "no", "True"],
        ["Junior", "R", "yes", "yes", "False"],
        ["Mid", "R", "yes", "yes", "True"],
        ["Senior", "Python", "no", "no", "False"],
        ["Senior", "R", "yes", "no", "True"],
        ["Junior", "Python", "yes", "no", "True"],
        ["Senior", "Python", "yes", "yes", "True"],
        ["Mid", "Python", "no", "yes", "True"],
        ["Mid", "Java", "yes", "no", "True"],
        ["Junior", "Python", "no", "yes", "False"],
    ]
    X, y = myutils.split_x_y_train(table)
    holdout = math.floor(len(table) * 0.33)
    x_train, x_test, y_train, y_test = myevaluation.train_test_split(
        X, y, holdout, shuffle=True)

    # The forest is fitted on rows that carry their label as the last
    # element, so each training row gets its label appended in place.
    for features, label in zip(x_train, y_train):
        features.append(label)
    remainder = list(x_train)
    print(remainder)

    myRF = MyRandomForestClassifier()
    myRF.fit(remainder, 10, 100)
    y_predicted = myRF.predict(x_test)

    assert len(y_predicted) == len(y_test)
    matches = sum(
        1 for predicted, actual in zip(y_predicted, y_test)
        if predicted == actual
    )
    assert matches != 0
import pickle

from mysklearn.myclassifiers import MyDecisionTreeClassifier
from mysklearn.mypytable import MyPyTable
import mysklearn.myutils as myutils
import mysklearn.myevaluation as myevaluation

# Train a decision tree on the Stars dataset, report its accuracy on a
# held-out split and pickle the fitted classifier to decision_tree.p.

stars_table = myutils.load_data("Stars.csv")

# One derived feature list per source column, produced by myutils helpers.
temperature = myutils.temp_bins(stars_table.get_column('Temperature'))
L = myutils.luminosity_bins(stars_table.get_column('L'))
R = myutils.get_radius(stars_table.get_column('R'))
a_m = myutils.get_magnitude(stars_table.get_column('A_M'))
color = myutils.categorize_colors(stars_table.get_column('Color'))
spectral_class = myutils.get_spectral_class(
    stars_table.get_column('Spectral_Class'))
star_type = stars_table.get_column('Type')

# L, R and a_m are converted to strings when the feature rows are assembled.
x_vals = [
    [temperature[i], str(L[i]), str(R[i]), str(a_m[i]), color[i],
     spectral_class[i]]
    for i in range(len(stars_table.data))
]
y_vals = star_type

xtr, xts, ytr, yts = myevaluation.train_test_split(x_vals, y_vals)
my_tree = MyDecisionTreeClassifier()
my_tree.fit(xtr, ytr)

predicted = my_tree.predict(xts)
accuracy = myutils.compute_accuracy(predicted, yts)
print('My Decision Tree: Accuracy =', round(accuracy * 100, 3),
      'Error Rate = ', round((1-accuracy) * 100, 3))

# pickle classifier (the whole fitted object, not only its tree attribute)
with open("decision_tree.p", "wb") as fout:
    pickle.dump(my_tree, fout)
def fit(self, X_train, y_train, N, M, F):
    '''
    Fits a random forest to the training data

    Args:
        X_train: the data to train the random forest (list of feature rows)
        y_train: the labels parallel to X_train
        N: number of trees to be generated
        M: "best M" trees to keep
        F: forwarded as the last argument of each decision tree's fit

    Returns:
        The held-out test set produced by the initial split, as rows of
        features followed by the label.

    Notes:
        Stores N, M and F on the instance and the selected trees, most
        accurate first, in self.bestM. If M is larger than N, all N trees
        are kept.
    '''
    self.N = N
    self.M = M
    self.F = F

    # split into remainder and test sets
    xRemainder, xTest, yRemainder, yTest = myeval.train_test_split(X_train, y_train)

    # piece together test and remainder sets
    testSet = [xTest[i] + [yTest[i]] for i in range(len(xTest))]
    # remainder row layout: features..., label, position; the position makes
    # every row unique so the membership test below works per row
    remainderSet = [
        xRemainder[i] + [yRemainder[i]] + [i] for i in range(len(xRemainder))
    ]

    allTrees = []
    allAccuracies = []
    for _ in range(N):
        # fresh deep copy per tree keeps remainderSet untouched
        copySet = copy.deepcopy(remainderSet)
        bootstrapSample = myutils.computeBootstrappedSample(copySet)

        # rows that were not drawn into the bootstrap sample validate the tree
        validationRows = [row for row in copySet if row not in bootstrapSample]

        # strip the trailing label and position from every row
        validationSet = [row[:-2] for row in validationRows]
        validationLabels = [row[-2] for row in validationRows]
        trainSet = [row[:-2] for row in bootstrapSample]
        trainLabels = [row[-2] for row in bootstrapSample]

        decisionTree = MyDecisionTreeClassifier()
        decisionTree.fit(trainSet, trainLabels, F)
        predictions = decisionTree.predict(validationSet)

        allAccuracies.append(
            myutils.determineAccuracy(predictions, validationLabels))
        allTrees.append(decisionTree)

    # Rank tree positions by accuracy. The previous pop()-based selection
    # removed the LAST accuracy instead of the selected one, so accuracies
    # and trees drifted out of step after the first pick. The sort is stable,
    # so equally accurate trees keep their build order.
    ranking = sorted(
        range(len(allTrees)), key=allAccuracies.__getitem__, reverse=True)
    self.bestM = [allTrees[position] for position in ranking[:max(M, 0)]]

    return testSet
def fit(self, X_train, y_train, M=7, N=20, F=2):
    """Fits a random forest classifier to X_train and y_train by building N
        trees from bootstrap samples and keeping the M most accurate ones.

    Args:
        X_train(list of list of obj): The list of training instances (samples).
            The shape of X_train is (n_train_samples, n_features)
        y_train(list of obj): The target y values (parallel to X_train)
            The shape of y_train is n_train_samples
        M(int): Number of best trees to keep
        N(int): Number of trees to build
        F(int): Forwarded as the last argument of myutils.tdidt_random_forest

    Notes:
        Stores deep copies of X_train and y_train on the instance.
        self.trees is set to a list of dicts with the keys "accuracy" and
            "tree", sorted by accuracy in descending order.
        A tree whose validation set turns out empty is scored 0.0.
    """
    self.X_train = copy.deepcopy(X_train)
    self.y_train = copy.deepcopy(y_train)

    # Split off a test partition (not used by fit); the split works on deep
    # copies so the label appended below never reaches the caller's rows.
    X_remainder, _X_test, y_remainder, _y_test = myevaluation.train_test_split(
        copy.deepcopy(X_train), copy.deepcopy(y_train))
    for row, label in zip(X_remainder, y_remainder):
        row.append(label)

    # generate N random decision trees using bagging
    trees = []
    for _ in range(N):
        # bootstrap sample plus the remainder rows that are not (by value)
        # part of it, which serve as this tree's validation set
        sample = myutils.compute_bootstrapped_sample(X_remainder)
        validation_set = [row for row in X_remainder if row not in sample]

        # every row ends with its label, so attributes are all but the last
        attribute_count = len(sample[0]) - 1
        available_attributes = myutils.get_available_attributes(sample)
        tree = myutils.tdidt_random_forest(
            sample, list(range(attribute_count)), available_attributes, F)

        # score the tree on its validation set
        header = ["att" + str(k) for k in range(attribute_count)]
        hits = sum(
            1
            for row in validation_set
            if myutils.tdidt_predict(header, tree, row[:-1]) == row[-1]
        )
        # The validation set is empty when the sample covers every distinct
        # remainder row; score such a tree 0.0 instead of failing.
        accuracy = hits / len(validation_set) if validation_set else 0.0
        trees.append({"accuracy": accuracy, "tree": tree})

    # get the best M of N trees
    trees.sort(key=lambda entry: entry["accuracy"], reverse=True)
    self.trees = trees[:M]
def test_decision_tree_classifier_fit():
    """After fitting on a train split, best_M_trees must hold M trees."""
    train_X, _test_X, train_y, _test_y = myevaluation.train_test_split(
        interviewData, interviewClasses)

    interviewTest.fit(train_X, train_y)

    kept_trees = interviewTest.best_M_trees
    assert len(kept_trees) == M