def part2():
    """Plot test accuracy versus training-sample percentage for two datasets.

    For each dataset (diabetes, heart) and each sampling rate (5%, 10%, 20%,
    50%, 100%), one tree is grown with m = 4 on every training subset yielded
    by ``selectSample`` and scored on the full test set.  The average, maximum
    and minimum accuracy per rate are drawn in side-by-side subplots and the
    figure is written to ``../part2.pdf``.
    """
    datasets = [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart'),
    ]
    mapping = {'diabetes': 1, 'heart': 2}
    m = 4

    plt.figure()
    for trainFileName, testFileName, key in datasets:
        attribute, trainset = data_provider(trainFileName)
        testAttribute, testset = data_provider(testFileName)

        percentages = []
        averages = []
        maxima = []
        minima = []
        for rate in (0.05, 0.1, 0.2, 0.5, 1):
            scores = []
            for subset in selectSample(trainset, rate):
                root = TreeNode(subset, attribute)
                curTree = DecisionTree(root)
                curTree.createTree(root, m)
                # One boolean per test instance: was the label predicted?
                verdicts = [
                    bool(curTree.predict(root, instance) == instance[-1])
                    for instance in testset
                ]
                scores.append(float(sum(verdicts)) / len(verdicts))

            mean_score = float(sum(scores)) / len(scores)
            percentages.append(int(rate * 100))
            averages.append(mean_score)
            maxima.append(max(scores))
            minima.append(min(scores))

        ax = plt.subplot(1, 2, mapping[key])
        ax.set_xlim(0, 105)
        ax.set_ylim(0.45, 0.9)
        ax.set_ylabel('accuracy')
        ax.set_title(key)
        ax.plot(list(percentages), averages, label='average')
        ax.plot(list(percentages), maxima, label='maximum')
        ax.plot(list(percentages), minima, label='minimum')
        ax.legend()

    plt.xlabel('dataset sample percentage')
    plt.savefig('../part2.pdf')
def __init__(self, n_trees, tree_config=None):
    """Create ``n_trees`` decision trees sharing one configuration.

    ``tree_config`` entries override the default ``cut_dim="random_best"``,
    which gives the individual trees some randomness.
    """
    settings = {"cut_dim": "random_best"}
    if tree_config:
        settings = {**settings, **tree_config}

    forest = []
    for _ in range(n_trees):
        forest.append(DecisionTree(**settings))
    self.trees = forest
def part1(t_data, v_data):
    """Grow a single tree and plot training/validation accuracy for 0..9.

    The tree is built with ``create_dt_classifier(t_data, 9, tree, 0)``.
    ``check_accuracy_with_trees`` is then called on both datasets with a third
    argument running from 0 to 9 (presumably the depth used for evaluation,
    given the plot title), and the two accuracy curves are passed to ``plot``.
    """
    root = DecisionTree()
    forest = [root]

    ts = time.time()
    create_dt_classifier(t_data, 9, root, 0)
    print("Completed ", " Time : ", (time.time() - ts))

    levels = list(range(10))
    train_scores = []
    val_scores = []
    for level in levels:
        train_scores.append(check_accuracy_with_trees(t_data, forest, level))
        val_scores.append(check_accuracy_with_trees(v_data, forest, level))

    accuracy = [train_scores, val_scores]
    print("Completed ", " Time : ", (time.time() - ts))
    print(accuracy)

    plot(
        [levels, levels],
        accuracy,
        "Accuracy Vs Depth",
        ["Training", "Validation"],
        ["Accuracy in %", "Depth"],
    )
def create_tree(_dataset):
    """Wrap ``_dataset`` in a new ``DecisionTree`` without training it.

    :param _dataset: dataset handed to the ``DecisionTree`` constructor
    :return: the freshly constructed ``DecisionTree``
    """
    untrained_tree = DecisionTree(_dataset)
    return untrained_tree
def __init__(self, x, y, depth=2):
    """Build the decision tree over ``(x, y)`` and the per-node linear layers.

    Args:
        x: inputs forwarded to ``DecisionTree``.
        y: targets forwarded to ``DecisionTree``; ``len(y)`` sets the
            initial index range.
        depth: tree depth forwarded to ``DecisionTree`` (default 2).
    """
    H = 100
    D_out = 1
    # nn.Module must be initialised before any attribute assignment that
    # could register a sub-module.
    super(treeNet, self).__init__()
    # Forward the caller's depth instead of a fixed value.
    self.dTree = DecisionTree(x, y, idxs=range(len(y)), depth=depth)
    self.linear1 = torch.nn.Linear(self.dTree.D_in, H).cuda()
    # One linear map per tree node.
    self.theta = torch.nn.ModuleList([
        torch.nn.Linear(self.dTree.D_in, D_out)
        for _ in range(self.dTree.nNodes)
    ]).cuda()
    self.sigmoid = torch.nn.Sigmoid().cuda()
def create_dt_classifier(data, depth, tree):
    """Recursively grow ``tree`` in place from the 2-D array ``data``.

    Columns 0-1 of ``data`` are passed to ``ada_gini_function``; columns 2 and
    up are scanned as candidate split features.  A node becomes a leaf when
    ``depth`` reaches 0 or when no candidate yields a positive gain.
    Otherwise the rows are split at the best threshold, with rows at or above
    it going to ``tree.left`` and the rest to ``tree.right``.
    """

    def _finish_as_leaf():
        tree.insert(None, None, ada_get_leaf_prediction_value(data), True)

    if depth == 0:
        _finish_as_leaf()
        return

    u_root = ada_gini_function(data[:, 0:2])

    best_gain, best_feature, best_threshold = 0, 0, 0
    for column in range(2, data.shape[1]):
        cand_feature, cand_gain, cand_threshold = get_feature_gain(
            data, column, u_root)
        if cand_gain > best_gain:
            best_gain = cand_gain
            best_feature = cand_feature
            best_threshold = cand_threshold

    if best_gain == 0:
        _finish_as_leaf()
        return

    remaining_depth = depth - 1

    # Sort rows by the chosen feature and cut at the first row whose value
    # reaches the threshold.
    ordered = data[np.argsort(data[:, best_feature])]
    cut_at = np.where(ordered[:, best_feature] >= best_threshold)[0][:1]
    halves = np.split(ordered, cut_at)
    true_space = halves[1]
    false_space = halves[0]

    tree.insert(best_threshold, best_feature,
                ada_get_leaf_prediction_value(data), False)
    tree.left = DecisionTree()
    tree.right = DecisionTree()
    create_dt_classifier(true_space, remaining_depth, tree.left)
    create_dt_classifier(false_space, remaining_depth, tree.right)
def compare_algorithm():
    """Compare the local ``DecisionTree`` with scikit-learn's on the voting data.

    Both classifiers are fitted on the same training split, used to predict
    the test split, and their accuracies are printed side by side.
    """
    data, targets, headers = get_voting()
    train_data, test_data, train_target, test_target = split_data(data, targets)

    # Re-number every frame from zero so that positional lookups below work.
    for frame in (train_data, test_data, train_target, test_target):
        frame.reset_index(inplace=True, drop=True)

    samClassifier = DecisionTree()
    skClassifier = tree.DecisionTreeClassifier()

    samModel = samClassifier.fit(train_data, train_target, headers)
    skModel = skClassifier.fit(train_data, train_target)

    samPredicted = samModel.predict(test_data)
    skPredicted = skModel.predict(test_data)

    # The last header names the target column.
    truth = test_target[headers[-1]]

    total = len(test_data)
    skCount = sum(1 for row in range(total) if skPredicted[row] == truth[row])
    samCount = sum(1 for row in range(total) if samPredicted[row] == truth[row])

    samAccuracy = get_accuracy(samCount, len(test_data))
    skAccuracy = get_accuracy(skCount, len(test_data))
    print("Sam's ID3 Accuracy: {:.2f}%. \nSK's ID3 Accuracy: {:.2f}%.".format(samAccuracy, skAccuracy))
def create_dt_classifier(data, depth, tree, m):
    """Recursively grow ``tree`` in place, sampling features at every node.

    Column 0 of ``data`` is passed to ``gini_function``.  Candidate features
    come from ``get_sampled_features(data, m)``; when that call also returns
    an index table, the winning sampled index is mapped back to a column of
    ``data``.  A node becomes a leaf when ``depth`` reaches 0 or no candidate
    has positive gain.  Rows at or above the threshold go to ``tree.left``,
    the rest to ``tree.right``.
    """

    def _finish_as_leaf():
        tree.insert(None, None, get_leaf_prediction_value(data), True)

    if depth == 0:
        _finish_as_leaf()
        return

    u_root = gini_function(data[:, 0])

    best_gain, best_feature, best_threshold = 0, 0, 0
    sampled, random_indexes = get_sampled_features(data, m)
    for column in range(1, sampled.shape[1]):
        cand_feature, cand_gain, cand_threshold = get_feature_gain(
            sampled, column, u_root)
        if cand_gain > best_gain:
            best_gain = cand_gain
            best_feature = cand_feature
            best_threshold = cand_threshold

    if best_gain == 0:
        _finish_as_leaf()
        return

    remaining_depth = depth - 1
    if random_indexes is not None:
        # Translate the index within the sample back to a column of ``data``.
        best_feature = random_indexes[best_feature]

    # Sort rows by the chosen feature and cut at the first row whose value
    # reaches the threshold.
    ordered = data[np.argsort(data[:, best_feature])]
    cut_at = np.where(ordered[:, best_feature] >= best_threshold)[0][:1]
    halves = np.split(ordered, cut_at)
    true_space = halves[1]
    false_space = halves[0]

    tree.insert(best_threshold, best_feature,
                get_leaf_prediction_value(data), False)
    tree.left = DecisionTree()
    tree.right = DecisionTree()
    create_dt_classifier(true_space, remaining_depth, tree.left, m)
    create_dt_classifier(false_space, remaining_depth, tree.right, m)
def main():
    """Score every row of ``car_data.csv`` and plot the score distribution.

    Each row is converted with ``convertCarData``, fed to the tree loaded
    from ``jsonTrees/<testName>.json`` and scored with ``run()``.  The three
    highest-scoring rows are pretty-printed, and a normalised histogram of
    all scores is saved to ``graphs/<testName><wValue>.png`` and shown.
    """
    # import the data from a csv
    car_data = np.genfromtxt('car_data.csv', delimiter=',')
    # call the tree creator module and pass the name of the json file to it
    dTree = DecisionTree('jsonTrees/' + testName + '.json')

    scores = []
    ranked = []
    for data in car_data:
        # change the inputs for each of the cars in the tree
        dTree.changeInputs(convertCarData(data))
        # get the score for that car
        score = dTree.run()
        scores.append(score)
        ranked.append([score, data.tolist()])

    # Highest score first; the top three are reported.
    ranked.sort(key=lambda entry: entry[0], reverse=True)
    pprint(ranked[0:3])

    # create a normalized histogram of the scores
    # (``density`` is the supported spelling of the removed ``normed`` flag)
    plt.hist(scores, density=True, facecolor='green', alpha=0.75)
    plt.title(testName + wValue)
    # save the image to a file
    plt.savefig("graphs/" + testName + wValue + ".png", bbox_inches='tight')
    # show the image
    plt.show()
def execute_algorithm(dataset):
    """Train and evaluate the ``DecisionTree`` on one of the bundled datasets.

    Args:
        dataset: ``1`` selects the loans data (``get_loans``), ``2`` the
            voting data (``get_voting``).

    Raises:
        ValueError: if ``dataset`` is neither 1 nor 2.

    The data is split, the tree is fitted on the training part and displayed
    with ``print_id3``, and the accuracy on the test part is printed.
    """
    # determine which dataset to retrieve; reject anything else up front so
    # the failure names the real problem instead of an unbound local.
    if dataset == 1:
        data, targets, headers = get_loans()
    elif dataset == 2:
        data, targets, headers = get_voting()
    else:
        raise ValueError(
            "dataset must be 1 (loans) or 2 (voting), got {!r}".format(dataset))

    classifier = DecisionTree()
    count = 0

    # split dataset into random parts
    train_data, test_data, train_target, test_target = split_data(data, targets)

    # reset the indexes so the dataframe can be properly parsed.
    train_data.reset_index(inplace=True, drop=True)
    test_data.reset_index(inplace=True, drop=True)
    train_target.reset_index(inplace=True, drop=True)
    test_target.reset_index(inplace=True, drop=True)

    # build the tree!
    model = classifier.fit(train_data, train_target, headers)

    # display the tree
    print_id3(model)

    # target_predicted holds one prediction per test row
    target_predicted = model.predict(test_data)

    # the last header names the target column
    test_target = test_target[headers[-1]]

    # count the predictions that match the expected target
    for index in range(len(target_predicted)):
        if target_predicted[index] == test_target[index]:
            count += 1

    accuracy = get_accuracy(count, len(test_data))

    # report to the user
    print("Accuracy: {:.2f}%".format(accuracy))
def etrims_tree(n_hidden = [1000], coef = [1000.], size=6):
    """Train and score a DecisionTree and an ExtremeDecisionTree on eTRIMS.

    Data is loaded with ``load_etrims(size=size)``.  Both models use 100
    functions; the extreme tree's hidden size is ``(2*size+1)**2 * 2``.
    Progress and scores are reported through ``print_time``.  ``n_hidden``
    and ``coef`` are accepted but not used by this function.
    """
    print_time('tree2etrims test size is %d' % size)

    print_time('load_etrims')
    train_data, train_signal, test_data, test_signal = load_etrims(size=size)

    num_function = 100

    def _score(model, heading):
        # Announce the test, score the model and report the result.
        print_time(heading)
        score = model.score(test_data, test_signal)
        print_time('score is %f' % score)

    print_time('train_DecisionTree num function is %d' % num_function)
    dt = DecisionTree(num_function=num_function)
    dt.fit(train_data, train_signal)
    _score(dt, 'test_DecisionTree')

    print_time('DecisionTree info')
    dt.info()

    elm_hidden = [(2*size+1)*(2*size+1)*2]
    print_time('train_ExtremeDecisionTree elm_hidden is %d, num function is %d' % (elm_hidden[0], num_function))
    edt = ExtremeDecisionTree(elm_hidden=elm_hidden, elm_coef=None, num_function=num_function)
    edt.fit(train_data, train_signal)

    # The extreme tree is scored twice, as in the established output.
    _score(edt, 'test_ExtremeDecisionTree')
    _score(edt, 'test_ExtremeDecisionTree')

    print_time('ExtremeDecisionTree info')
    edt.info()

    print_time('tree2etrims test is finished !')
class DecisionTreeC45TestCase(unittest.TestCase):
    """Unit tests for ``DecisionTree`` constructed with the "c4.5" argument."""

    def setUp(self):
        self.decision_tree = DecisionTree("c4.5")

    def tearDown(self):
        self.decision_tree = None

    def test_fit(self):
        """Fitting lists and fitting arrays must produce the same nested dict."""
        X = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        y = ["yes", "yes", "no", "no", "no"]
        expected = {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }

        # First pass: plain lists.  Second pass: numpy arrays.
        for as_input in (lambda values: values, np.asarray):
            # A fresh name list per fit, exactly as each case needs.
            feat_names = ['no surfacing', 'flippers']
            self.decision_tree.fit(as_input(X), as_input(y), feat_names)
            self.assertEqual(self.decision_tree.tree, expected)

    def test_predict(self):
        # Prediction does not depend on the split criterion, so there is
        # nothing criterion-specific to check here.
        pass
def part3():
    """Plot test accuracy for m in (2, 5, 10, 20) on both datasets.

    For each dataset (diabetes, heart) a single root/tree pair is created and
    ``createTree(root, m)`` is invoked once per value of ``m``; after each
    call the tree is scored on the test set.  Every (m, accuracy) point is
    drawn and annotated in the dataset's own subplot, and the figure is saved
    to ``../part3.pdf``.
    """
    datasets = [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart'),
    ]
    mapping = {'diabetes': 1, 'heart': 2}

    plt.figure()
    for trainFileName, testFileName, key in datasets:
        attribute, trainset = data_provider(trainFileName)
        testAttribute, testset = data_provider(testFileName)
        root = TreeNode(trainset, attribute)
        curTree = DecisionTree(root)

        points = []
        for m in (2, 5, 10, 20):
            # NOTE(review): the same root is reused for every m; confirm that
            # createTree rebuilds the tree rather than extending it.
            curTree.createTree(root, m)
            verdicts = [
                bool(curTree.predict(root, instance) == instance[-1])
                for instance in testset
            ]
            points.append([m, float(sum(verdicts)) / len(verdicts)])

        # Axis setup does not depend on the individual point, so do it once.
        ax = plt.subplot(2, 1, mapping[key])
        ax.set_xlim(0, 22)
        ax.set_ylim(0.6, 0.8)
        ax.set_ylabel('accuracy')
        ax.set_title(key)
        for x, y in points:
            plt.annotate('%.3f' % y, xy=(x - 0.02, y + 0.02))
            plt.annotate('m=%d' % x, xy=(x - 0.02, y - 0.07))
            # Each point is plotted on its own, one marker per call.
            ax.plot(x, y, 'o-')

    plt.xlabel('tree number m')
    plt.savefig('../part3.pdf')
def mnist_mlelm(n_hidden=[1000]): print "hidden:", n_hidden # initialize train_set, valid_set, test_set = load_mnist() train_data, train_target = train_set valid_data, valid_target = valid_set test_data, test_target = test_set # size train_size = 500 # max 50000 valid_size = 10 # max 10000 test_size = 10 # max 10000 train_data, train_target = train_data[:train_size], train_target[:train_size] valid_data, valid_target = valid_data[:valid_size], valid_target[:valid_size] test_data, test_target = test_data[:test_size], test_target[:test_size] # add valid_data/target to train_data/target """ train_data = train_data + valid_data train_target = train_target + valid_target """ # model dt = DecisionTree() #""" edt1 = ExtremeDecisionTree(elm_hidden=n_hidden) edt2 = ExtremeDecisionTree(elm_hidden=n_hidden, elm_coef=[1000., 100., 1000.]) #""" # fit #print "fitting ..." dt.fit(train_data, train_target) #""" edt1.fit(train_data, train_target) edt2.fit(train_data, train_target) #""" # test print "test score is ", score_dt = dt.score(test_data, test_target) #""" score_edt1 = edt1.score(test_data, test_target) score_edt2 = edt2.score(test_data, test_target) print score_dt, score_edt1, score_edt2 #""" #print score_dt print "dt" dt.info() #""" print "edt1" edt1.info() print "edt2" edt2.info()
class treeNet(torch.nn.Module):
    """Torch module pairing a ``DecisionTree`` with one linear layer per node."""

    def __init__(self, x, y, depth=2):
        """Build the decision tree over ``(x, y)`` and the per-node layers.

        Args:
            x: inputs forwarded to ``DecisionTree``.
            y: targets forwarded to ``DecisionTree``; ``len(y)`` sets the
                initial index range.
            depth: tree depth forwarded to ``DecisionTree`` (default 2).
        """
        H = 100
        D_out = 1
        # nn.Module must be initialised before any attribute assignment that
        # could register a sub-module.
        super(treeNet, self).__init__()
        # Forward the caller's depth instead of a fixed value.
        self.dTree = DecisionTree(x, y, idxs=range(len(y)), depth=depth)
        self.linear1 = torch.nn.Linear(self.dTree.D_in, H).cuda()
        # One linear map per tree node.
        self.theta = torch.nn.ModuleList([
            torch.nn.Linear(self.dTree.D_in, D_out)
            for _ in range(self.dTree.nNodes)
        ]).cuda()
        self.sigmoid = torch.nn.Sigmoid().cuda()

    def forward(self, x, idxs=None):
        """Return the tree's prediction for ``x``.

        Args:
            x: input data handed to ``self.dTree.plant``.
            idxs: indices to evaluate; defaults to ``range(len(x))``.

        Returns:
            Whatever ``self.dTree.plant(x, self.theta, idxs=idxs)`` returns.
        """
        if idxs is None:
            idxs = range(len(x))
        y_pred = self.dTree.plant(x, self.theta, idxs=idxs)
        return y_pred
def calc_misclassification_rate(training_dataframe, validation_dataframe, criterion):
    """Fit, prune and score a ``DecisionTree`` built with ``criterion``.

    The tree is fitted on ``training_dataframe[categorical_columns]`` against
    the ``'num'`` column, pruned with the validation frame, and then evaluated
    row by row on the validation frame.

    Returns:
        ``(err, dt)``: the fraction of validation rows whose prediction
        differs from ``'num'``, and the fitted tree.
    """
    err = 0
    x = training_dataframe[categorical_columns]
    y = training_dataframe['num']
    dt = DecisionTree(criterion)
    dt.fit(x, y)
    # Prune using every validation column except the target, plus the target.
    dt.prune(
        validation_dataframe.loc[:, validation_dataframe.columns != "num"],
        validation_dataframe.loc[:, "num"])
    # Count the validation rows the tree gets wrong.
    for i in validation_dataframe.index:
        if (dt.root.evaluate(validation_dataframe.loc[
                i, validation_dataframe.columns != "num"]) !=
                validation_dataframe.loc[i, "num"]):
            err += 1
    # NOTE(review): an empty validation frame divides by zero here.
    err = err / len(validation_dataframe)
    print((err, dt))
    return (err, dt)


# NOTE(review): the calls below pass only ``criterion`` although the function
# also requires ``training_dataframe`` and ``validation_dataframe``.
# NOTE(review): the function returns one ``(err, dt)`` tuple, yet the result
# is consumed as a collection of such pairs -- confirm the intended data flow.
# NOTE(review): ``max`` keyed on ``x[0]`` would pick the entry with the
# highest error; confirm whether the lowest was intended for the "best" tree.
gini_trees = calc_misclassification_rate(criterion="gini")
gtree = max(gini_trees, key=lambda x: x[0])[1]
print("best gini tree = {}".format(gtree))
Gg = Digraph("", filename="tree_gini.pdf")
gtree.plot(Gg)
Gg.view()

entropy_trees = calc_misclassification_rate(criterion="entropy")
etree = max(entropy_trees, key=lambda x: x[0])[1]
print("best entropy tree = {}".format(etree))
Ge = Digraph("", filename="tree_entropy.pdf")
etree.plot(Ge)
Ge.view()

# Reference trees from scikit-learn, one per criterion, for comparison.
fig, ax = plt.subplots(nrows=1, ncols=1)
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(categorical_features, df.num)
tree.plot_tree(clf, ax=ax)
plt.savefig("sklearn_entropy")
plt.show()

fig, ax = plt.subplots(nrows=1, ncols=1)
clf = tree.DecisionTreeClassifier(criterion="gini")
clf = clf.fit(categorical_features, df.num)
tree.plot_tree(clf, ax=ax)
plt.savefig("sklearn_gini")
plt.show()
class DecisionTreeTestCase(unittest.TestCase):
    """Unit tests for ``tree.DecisionTree`` with its default construction."""

    @staticmethod
    def _reference_tree():
        """Return a fresh copy of the nested-dict tree used by the tests."""
        return {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }

    def setUp(self):
        self.decision_tree = DecisionTree()

    def tearDown(self):
        self.decision_tree = None

    def test_fit(self):
        """Fitting lists and fitting arrays must produce the same nested dict."""
        X = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        y = ["yes", "yes", "no", "no", "no"]
        expected = self._reference_tree()

        # First pass: plain lists.  Second pass: numpy arrays.
        for as_input in (lambda values: values, np.asarray):
            # A fresh name list per fit, exactly as each case needs.
            feat_names = ['no surfacing', 'flippers']
            self.decision_tree.fit(as_input(X), as_input(y), feat_names)
            self.assertEqual(self.decision_tree.tree, expected)

    def test_predict(self):
        """Predict a single item, then a batch with reordered feature names."""
        # test 1: a single training item
        self.decision_tree.tree = self._reference_tree()
        single_item = [1, 0]
        names_in_tree_order = ['no surfacing', 'flippers']
        self.assertEqual(
            'no',
            self.decision_tree.predict(single_item, names_in_tree_order))

        # test 2: a batch whose columns are listed in a different order
        self.decision_tree.tree = self._reference_tree()
        batch = [[0, 1], [0, 0]]
        names_swapped = ['flippers', 'no surfacing']
        self.assertEqual(
            ["no", "no"],
            self.decision_tree.predict(batch, names_swapped))
from tree import DecisionTree

# Train dataset
X = np.loadtxt('train_data')
y = np.loadtxt('train_labels')
X, y = shuffle(X, y)

# Data normalization: shift to a zero minimum, then scale by the maximum
X -= X.min()
X /= X.max()

# Instantiation
tree = DecisionTree()

# Training: fit on the arrays loaded and normalised above
tree.train(X, y)

# Test dataset (X and y are rebound to the test arrays from here on)
X = np.loadtxt('test_data')
y = np.loadtxt('test_labels')
X, y = shuffle(X, y)

# Data normalization
# NOTE(review): the test data is scaled by its own min/max rather than the
# training set's -- confirm that this is intended.
X -= X.min()
X /= X.max()
from tree import DecisionTree training_data = [ ['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Red', 1, 'Grape'], ['Red', 1, 'Grape'], ['Yellow', 3, 'Lemon'], ] decison_tree = DecisionTree() tree = decison_tree.build_tree(training_data) decison_tree.print_tree(tree) def pretty_print_leaf_predictions(counts): total = sum(counts.values()) * 1.0 probabilities = {} for label in counts.keys(): probabilities[label] = str(int(counts[label] / total * 100)) + "%" return probabilities pretty_print_leaf_predictions(decison_tree.classify(training_data[0], tree)) testing_data = [ ['Green', 3, 'Apple'], ['Yellow', 4, 'Apple'], ['Red', 2, 'Grape'],
ts = time.time() D = np.full((len(t_data)), 1 / len(t_data)) print(D) ada_test_data = np.insert(t_data, 1, D, axis=1) ada_val_data = np.insert(v_data, 1, np.full(len(v_data), 1), axis=1) # print(ada_test_data) for j in range(0, len(l_list)): print("L : ", l_list[j]) tree_list = [] alpha_list = [] ada_test_data = np.insert(t_data, 1, D, axis=1) # np.zeros((2, 1)) for l in range(0, l_list[j]): tree = DecisionTree() create_dt_classifier(ada_test_data, d, tree) tree_list.append(tree) alpha = get_params(ada_test_data, tree, d) alpha_list.append(alpha) print(ada_test_data[:, 1]) # print(ada_test_data) train_accuracy = check_accuracy_with_trees(ada_test_data, tree_list, d, alpha_list) train_accuracy_list.append(train_accuracy) val_accuracy = check_accuracy_with_trees(ada_val_data, tree_list, d, alpha_list) val_accuracy_list.append(val_accuracy) iterations.append(l_list[j]) # print(train_accuracy)
def main():
    """Run the first entry of ``inputs2`` through three trees and plot them.

    The trees are loaded from ``jsonTrees/<testA|testN|testE>.json``.  Their
    scores are printed and drawn on one figure: the first as a trapezoidal
    membership function, the third as a curve, the second as a vertical line.
    The loop stops after the first input.
    """
    fmemFile = File("fmemFile.csv")
    # import the data from a csv
    car_data = np.genfromtxt('car_data.csv', delimiter=',')

    # one tree per JSON description, in the order: alpha, crisp, extension
    trees = [
        DecisionTree('jsonTrees/' + name + '.json')
        for name in (testA, testN, testE)
    ]

    iterator = inputs2
    for inputs in iterator:
        # change the inputs in every tree, then collect the scores
        for current_tree in trees:
            current_tree.changeInputs(inputs)
        aScore, nScore, eScore = [current_tree.run() for current_tree in trees]

        print("Inputs:", inputs)
        print("ASCORE #######:", aScore)
        print("NSCORE #######:", nScore)
        print("ESCORE #######:", eScore)

        extension_points = np.array(eScore)
        trapezoid = MemFunc('trap', aScore)
        grid = np.arange(0, 1, .05)
        memberships = [trapezoid.memFunc(i) for i in grid]

        l1, = plt.plot(grid, memberships, c='r', linewidth=2.0,
                       label="AlphaCuts")
        l2, = plt.plot(extension_points[:, 0], extension_points[:, 1], c='b',
                       linewidth=2.0, label="Extention Principle")
        l3 = plt.axvline(nScore, c='g', linewidth=2.0, label="Crisp")

        plt.legend(handles=[l1, l2, l3])
        plt.title("Regular Title")
        plt.xlabel("Output Score")
        plt.ylabel("Membership Value")
        plt.show()
        # Only the first input is processed.
        break
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from pprint import pprint

# Load the data
data = pd.read_table('Font_dataset.txt', header=None, sep=',')

# Feature data and labels (column 4 holds the label)
X = data.drop(4, axis=1)
y = data[4]

from tree import DecisionTree
clf = DecisionTree()

print(u"*****在自己的决策树上进行10折交叉验证*****")
test_accuracy = []
L = X.shape[0]
# Ten consecutive, unshuffled folds.
kf = KFold(n_splits=10)
count = 0
for train_index, test_index in kf.split(X.values):
    count += 1
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    # Train on this fold's training part only, so the held-out rows stay
    # unseen until they are scored.
    clf.fit(X_train, y_train)
    # Test
    test_pre = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_pre)
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
from tree import DecisionTree

# load data
X = np.loadtxt('../feature/5grams_count_mc_features')
y = np.loadtxt('../data/tag_mc')

# scale the features: shift to a zero minimum, then divide by the maximum
X -= X.min()
X /= X.max()

X_train, X_test, y_train, y_test = train_test_split(X, y)

tree = DecisionTree()
tree.train(X_train, y_train)

expected = y_test
predicted = tree.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
def printInputs():
    """Load the tree described by ``jsonTrees/<testName>.json`` and print its inputs."""
    tree_path = 'jsonTrees/' + testName + '.json'
    DecisionTree(tree_path).printInputs()
from tree import DecisionTree
from iris_dataset import vectors, labels

# The first 80% of the samples train the tree; the remainder is held out.
N = int(len(vectors)*0.8)
training_vectors, test_vectors = vectors[:N], vectors[N:]
training_labels, test_labels = labels[:N], labels[N:]

tree = DecisionTree(leaf_size=1, n_trials=1)
tree.fit(training_vectors, training_labels)
results = tree.predict(test_vectors)
tree.show()

# Predictions first, then the expected labels for comparison.
print("results:{}".format(results))
print("answers:{}".format(test_labels))
def main():
    """Grow, evaluate, prune and re-evaluate a decision tree.

    The shuffled dataset's first 32000 rows train an entropy-based tree and
    the next 8000 rows test it.  Leaf count and accuracy are printed and the
    tree is plotted both before and after top-down pessimistic pruning.
    """

    def report_leaves(template):
        print(template % DecisionTree.count_leaves(dt))

    def report_accuracy(template):
        err = DecisionTree.evaluate(dt, test_data)
        total = len(test_data)
        print(template % (total - err, total, (total - err) / total))

    # 1. Prepare the dataset and split it into training and test data.
    headings, dataset = utils.load_dataset()
    random.shuffle(dataset)
    train_data, test_data = dataset[:32000], dataset[32000:40000]

    # 2. Grow the tree (DecisionTree.gini is the alternative criterion).
    dt = DecisionTree.build_tree(train_data, DecisionTree.entropy)

    # 3. Visualize the unpruned tree.
    DecisionTree.plot_tree(dt, headings, conf.org_tree_filepath)
    report_leaves('Leaves count before pruning: %d')

    # 4. Run the test data through the tree.
    report_accuracy('Accuracy before pruning: %d/%d = %f')

    # 5. Prune top-down with pessimistic pruning.  Other strategies offered by
    #    DecisionTree are bottom-up pessimistic pruning and minimum error
    #    pruning; reduced error pruning would need a separate pruning set.
    DecisionTree.top_down_pessimistic_pruning(dt)

    # 6. Visualize the pruned tree.
    DecisionTree.plot_tree(dt, headings, conf.prn_tree_filepath)
    report_leaves('Leaves count after pruning: %d')

    # 7. Check whether classification improved after pruning.
    report_accuracy('Accuracy after pruning: %d/%d = %f')
def setUp(self):
    """Create a fresh ``DecisionTree("c4.5")`` fixture before each test."""
    fixture = DecisionTree("c4.5")
    self.decision_tree = fixture
sys.exit() trainFileName = sys.argv[1] testFileName = sys.argv[2] try: m = int(sys.argv[3]) except: print >> sys.stderr, "[ERROR] [m] should be in integer!" sys.exit() attribute, trainset = data_provider(trainFileName) testAttribute, testset = data_provider(testFileName) try: assert (testAttribute == attribute) except AssertionError: print >> sys.stderr, "[ERROR] pls check the attributes of test data." sys.exit() # train root = TreeNode(trainset, attribute) curTree = DecisionTree(root) curTree.createTree(root, m) curTree.printTree(root, 0) # test print '<Predictions for the Test Set Instances>' index = 1 for instance in testset: print '{}: Actual: {} Predicted: {}'.format( index, instance[-1], curTree.predict(root, instance)) index += 1
from sklearn.preprocessing import LabelEncoder
from tree import DecisionTree
import pandas as pd
import numpy as np

if __name__ == '__main__':
    train_df = pd.read_csv('/app/data/train.csv')

    # Encode the 'Sex' column as integers; it is the only feature used.
    le_sex = LabelEncoder()
    le_sex.fit(train_df['Sex'])
    train_df.loc[:, 'SexInt'] = le_sex.transform(train_df['Sex'])

    X = np.array(train_df[['SexInt']])
    y = train_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=71)
    splits = ((X_train, y_train), (X_test, y_test))

    # Local implementation: report on the training split, then the test split.
    tree = DecisionTree(max_depth=3)
    tree.fit(X_train, y_train)
    for features, target in splits:
        print(classification_report(target, tree.predict(features)))

    # scikit-learn reference with the same depth limit.
    s_tree = DecisionTreeClassifier(max_depth=3)
    s_tree.fit(X_train, y_train)
    for features, target in splits:
        print(classification_report(target, s_tree.predict(features)))

    s_tree.predict_proba(X_test)
def main(): X, y = read_data('crx.data.txt') n_samples = X.shape[0] n_folds = 3 n_samples_per_fold = n_samples / n_folds cum_accuracy = 0.0 cum_p = 0.0 cum_r = 0.0 fold = 0 """ clf = DecisionTree(maxdepth=3) clf.fit(X, y) clf.print_tree() y_pred = clf.predict(X) print y.astype(np.int32) return """ for train_idx, test_idx in kfold(n_samples, n_folds): print "Fold", fold fold += 1 X_train, X_test = X[train_idx], X[test_idx] y_train, y_test = y[train_idx], y[test_idx] clf = DecisionTree(maxdepth=3) clf.fit(X_train, y_train) #clf.print_tree() y_pred = clf.predict(X_test) # TP, FP, TN and FN tp = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 1 and y_test[i] == 1]) tn = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 0 and y_test[i] == 0]) fp = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 1 and y_test[i] == 0]) fn = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 0 and y_test[i] == 1]) # accuracy for this fold acc = float(tp + tn)/(tp + tn + fp + fn) cum_accuracy += acc print "\tAccuracy:", acc # precision, recall try: p = float(tp) / (tp + fp) r = float(tp) / (tp + fn) cum_p += p cum_r += r f1 = 2 * p * r / (p + r) print "\tPrecision:", p print "\tRecall:", r print "\tF1:", f1 except: # divide by zero pass print print "Average accuracy:", cum_accuracy/n_folds print "Average precision:", cum_p/n_folds print "Average recall:", cum_r/n_folds """