def train(self, data, labels):
    self._data = data
    self._labels = labels
    numAttributes = len(data[0])  # should be the same across all data points
    pAttr = list(range(numAttributes))  # list of attributes to possibly split on
    rlist = list(range(len(data)))  # initially all points remain in the data set
    self._root = Tree.Tree()
    self._createTree(self._root, pAttr, rlist)
    # no longer need to remember the training data
    self._data = []
    self._labels = []
def _createTree(self, tree, pAttr, rlist):
    # check if all members of the subset are classified the same;
    # the for/else runs the else branch only when no differing label is found
    first_label = self._labels[rlist[0]]
    for i in rlist[1:]:  # i is the index of a data point
        if self._labels[i] != first_label:
            break
    else:
        tree.final_label = first_label
        return
    # when there are no attributes left to split on
    if len(pAttr) == 0:
        tree.chooseBest(self._getLabelCount(rlist))
        return
    gains = [0 for _ in range(len(pAttr))]
    for i in range(len(pAttr)):  # iterate over the attribute array
        a = pAttr[i]  # current attribute
        gains[i] = self._gain(a, rlist)
    maxGain = 0  # index of the max gain
    for i in range(len(gains)):
        if gains[i] > gains[maxGain]:
            maxGain = i
    # if all gains are 0, stop branching and use the most popular label
    # (some data sets contain duplicate vectors with different classifications)
    if gains[maxGain] == 0:
        tree.chooseBest(self._getLabelCount(rlist))
        return
    tree.attr = pAttr[maxGain]  # attribute to split on
    del pAttr[maxGain]  # remove the attribute we're using from the list
    # possible (remaining) values for this attribute to take on
    vals_dist = self._valDistribution(tree.attr, rlist)
    tree.vals = list(vals_dist)
    tree.subTrees = [Tree.Tree() for _ in range(len(tree.vals))]
    # iterate over each value to branch off of -- use deep copies of pAttr!
    # (requires `from copy import deepcopy` at the top of the module)
    for i in range(len(tree.vals)):
        # recursively create the tree
        self._createTree(tree.subTrees[i], deepcopy(pAttr), vals_dist[tree.vals[i]])
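# Neither the Tree.Tree node class nor the helpers _gain, _getLabelCount
# and _valDistribution appear in this section. The sketches below are
# inferred from the call sites above -- assumptions for illustration,
# not the repo's actual implementations.
from collections import Counter, defaultdict
from math import log2

# A minimal node class matching how _createTree reads and writes it
# (would live in a module Tree.py, matching the Tree.Tree() calls):
class Tree:
    def __init__(self):
        self.final_label = None  # set when the node is a pure leaf
        self.attr = None         # attribute index this node splits on
        self.vals = []           # attribute values, one per subtree
        self.subTrees = []       # child nodes, parallel to vals

    def chooseBest(self, label_count):
        # label_count is assumed to map label -> occurrence count;
        # pick the most popular label as this leaf's classification
        self.final_label = max(label_count, key=label_count.get)

# Possible helper methods on the same classifier class as train/_createTree,
# implementing ID3's entropy-based information gain:
def _getLabelCount(self, rlist):
    # map label -> number of occurrences among the remaining points
    return Counter(self._labels[i] for i in rlist)

def _valDistribution(self, attr, rlist):
    # map each value of `attr` -> indices of points taking that value;
    # matches the use above, where vals_dist[value] is passed as rlist
    dist = defaultdict(list)
    for i in rlist:
        dist[self._data[i][attr]].append(i)
    return dist

def _entropy(self, rlist):
    counts = self._getLabelCount(rlist)
    total = len(rlist)
    return -sum((c / total) * log2(c / total) for c in counts.values())

def _gain(self, attr, rlist):
    # information gain = entropy before the split - weighted entropy after
    before = self._entropy(rlist)
    after = sum(
        (len(sub) / len(rlist)) * self._entropy(sub)
        for sub in self._valDistribution(attr, rlist).values()
    )
    return before - after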
X = iris.data
y = iris.target

def getAccuracy(pred_y, actu_y):
    if len(pred_y) != len(actu_y):
        raise ValueError("prediction and label lists differ in length")
    correct = 0
    for i in range(0, len(pred_y)):
        if pred_y[i] == actu_y[i]:
            correct += 1
    return correct / len(pred_y)

t = Tree(max_depth=DEPTH, PFSRT=True, omega=1.5)
t.train(X, y)
t.printTree()
t.updatePFSRT()
acc = []
for i in range(0, NR_TREES):
    t.train()  # train with the same data
    t.updatePFSRT()
    acc.append((i, t._cur_accuracy))
acc = sorted(acc, key=lambda kv: kv[1])
for tup in acc:
    print("Tree " + str(tup[0]) + " acc:", tup[1])
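# One way to visualise how accuracy evolves over the PFSRT retraining
# loop above -- an illustrative addition using matplotlib, not part of
# the original script.
import matplotlib.pyplot as plt

iters, accs = zip(*sorted(acc))  # re-sort by iteration index for plotting
plt.plot(iters, accs)
plt.xlabel("Retraining iteration")
plt.ylabel("Training accuracy")
plt.title("PFSRT accuracy per iteration")
plt.show()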
y = iris.target

def getAccuracy(pred_y, actu_y):
    if len(pred_y) != len(actu_y):
        raise ValueError("prediction and label lists differ in length")
    correct = 0
    for i in range(0, len(pred_y)):
        if pred_y[i] == actu_y[i]:
            correct += 1
    return correct / len(pred_y)

st1 = time()
t = Tree(max_depth=DEPTH)
t.train(X, y)
en1 = time()
t.printTree()
print(y)
st1p = time()
y_pred = t.predict(X)
en1p = time()
#print(y_pred)
print(getAccuracy(y, y_pred))

clf = DecisionTreeClassifier(criterion='entropy', max_depth=DEPTH)
st2 = time()
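# The script breaks off right after constructing the scikit-learn
# classifier. A plausible continuation -- timing sklearn's fit/predict
# the same way the custom tree is timed above -- might look like the
# sketch below (an assumption, not the original code).
clf.fit(X, y)
en2 = time()
st2p = time()
y_pred_sk = clf.predict(X)
en2p = time()
print(getAccuracy(y, y_pred_sk))
print("sklearn train time:", en2 - st2, "test time:", en2p - st2p)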
def getAccuracy(pred_y, actu_y):
    if len(pred_y) != len(actu_y):
        raise ValueError("prediction and label lists differ in length")
    correct = 0
    for i in range(0, len(pred_y)):
        if pred_y[i] == actu_y[i]:
            correct += 1
    return correct / len(pred_y)

st1 = time()
t = []
# generate 10 random trees and look at the distribution
for i in range(0, NR_TREES):
    t.append(Tree(max_depth=DEPTH, random_feat=True))
    t[i].train(X, y)
en1 = time()
# for i in range(0, NR_TREES):
#     print("Tree", i)
#     t[i].printTree()
print(y)
st1p = time()
y_pred = []
for i in range(0, NR_TREES):
    y_pred.append(t[i].predict(X))
en1p = time()
acc_tuples = []
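# With per-tree predictions collected in y_pred, a natural follow-up is
# a simple majority vote across the NR_TREES random trees, in the
# spirit of a random forest. This is an illustrative sketch, not part
# of the original script.
from collections import Counter

def majorityVote(y_pred):
    # y_pred[i][j] is tree i's prediction for sample j;
    # return the most common prediction per sample
    n_samples = len(y_pred[0])
    return [
        Counter(y_pred[i][j] for i in range(len(y_pred))).most_common(1)[0][0]
        for j in range(n_samples)
    ]

# e.g. print(getAccuracy(y, majorityVote(y_pred)))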
def do_experiments(X, y, depth, nr_rand_trees, data_label):
    # BASIC TREE
    print("\n------ BASIC TREE ------\n")
    print("Depth: ", depth)
    t = Tree(max_depth=depth)
    st1 = time()
    t.train(X, y)
    en1 = time()
    st1p = time()
    y_pred = t.predict(X)
    en1p = time()
    basic_acc = accuracy_score(y, y_pred)
    print(Get_ConfusionMatrix(y, y_pred))
    print("F-Score: ", Get_F_Score(y, y_pred))
    print("Accuracy: ", basic_acc)
    print("Time to train:", en1 - st1)
    print("Time to test:", en1p - st1p)
    # 10-fold cross-validation
    cv_arr = cross_validate(t, X, y, cv=10)
    print("Accuracy after 10-fold CV:", float(sum(cv_arr)) / max(len(cv_arr), 1), "(", stdev(cv_arr), ")")
    if len(np.unique(y)) == 2:
        Generate_ROC_Curve(y, t.getClassProb(X), "ID3", label_text=data_label)
    # BASIC TREE END

    # RANDOM TREES
    print("\n------ RANDOM TREE ------\n")
    print("Depth: ", depth)
    print("Nr trees: ", nr_rand_trees)
    t2_max = None
    y2_pred = None
    acc_max = 0
    iterations_taken = nr_rand_trees
    acc_list = []
    max_accs = []
    st2 = time()
    for i in range(0, nr_rand_trees):
        # build a fresh tree each iteration so t2_max keeps the best tree,
        # instead of a reference to one object that gets retrained
        t2 = Tree(max_depth=depth, random_feat=True)
        t2.train(X, y)
        y2_pred = t2.predict(X)
        acc = accuracy_score(y, y2_pred)
        acc_list.append(acc)
        if acc >= basic_acc:
            iterations_taken = i + 1
            t2_max = t2
            acc_max = acc
            break
        if acc > acc_max:
            acc_max = acc
            t2_max = t2
        max_accs.append(acc_max)
    en2 = time()
    st2p = time()
    y2_pred = t2_max.predict(X)
    en2p = time()
    print(Get_ConfusionMatrix(y, y2_pred))
    print("F-Score: ", Get_F_Score(y, y2_pred))
    print("Accuracy: ", acc_max)
    print("Time to train:", en2 - st2)
    print("Iterations taken:", iterations_taken)
    print("Time to test:", en2p - st2p)
    # 10-fold cross-validation
    cv_arr = cross_validate(t2_max, X, y, cv=10)
    print("Accuracy after 10-fold CV:", float(sum(cv_arr)) / max(len(cv_arr), 1), "(", stdev(cv_arr), ")")
    random_decision_tree_accuracy(acc_list, label_text=data_label)
    accuracyRiseForRandomTrees(max_accs, label_text=data_label)
    if len(np.unique(y)) == 2:
        Generate_ROC_Curve(y, t2_max.getClassProb(X), "Random Forest", label_text=data_label)
    # RANDOM TREES END

    # LOOKAHEAD TREE
    print("\n------ LOOKAHEAD TREE ------\n")
    print("Depth: ", depth)
    t3 = Tree(max_depth=depth, lookahead=True)
    st3 = time()
    t3.train(X, y)
    en3 = time()
    st3p = time()
    y3_pred = t3.predict(X)
    en3p = time()
    lookahead_acc = accuracy_score(y, y3_pred)
    print(Get_ConfusionMatrix(y, y3_pred))
    print("F-Score: ", Get_F_Score(y, y3_pred))
    print("Accuracy: ", lookahead_acc)
    print("Time to train:", en3 - st3)
    print("Time to test:", en3p - st3p)
    # 5-fold cross-validation
    cv_arr = cross_validate(t3, X, y, cv=5)
    print("Accuracy after 5-fold CV:", float(sum(cv_arr)) / max(len(cv_arr), 1), "(", stdev(cv_arr), ")")
    if len(np.unique(y)) == 2:
        Generate_ROC_Curve(y, t3.getClassProb(X), "Lookahead DT", label_text=data_label)
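# The cross_validate used above is not sklearn's (whose return value is
# a dict): summing it and passing it to stdev implies a project helper
# that returns one accuracy per fold. A minimal sketch of such a helper,
# assuming sklearn's KFold for the splits and the Tree's train/predict
# API from this repo:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def cross_validate(model, X, y, cv=10):
    # train on cv-1 folds, score on the held-out fold, repeat cv times
    scores = []
    for train_idx, test_idx in KFold(n_splits=cv, shuffle=True).split(X):
        model.train(X[train_idx], y[train_idx])
        scores.append(accuracy_score(y[test_idx], model.predict(X[test_idx])))
    return scores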
X = iris.data
y = iris.target

def getAccuracy(pred_y, actu_y):
    if len(pred_y) != len(actu_y):
        raise ValueError("prediction and label lists differ in length")
    correct = 0
    for i in range(0, len(pred_y)):
        if pred_y[i] == actu_y[i]:
            correct += 1
    return correct / len(pred_y)

print(X)
st1 = time()
t = Tree(max_depth=DEPTH, lookahead=True)
t.train(X, y)
en1 = time()
t.printTree()
print(y)
st1p = time()
y_pred = t.predict(X)
en1p = time()
print(y_pred)
print(getAccuracy(y, y_pred))

clf = DecisionTreeClassifier(criterion='entropy', max_depth=DEPTH)
st2 = time()