def __init__(self, n_trees, tree_config=None):
    """Create ``n_trees`` DecisionTree instances that share one configuration.

    :param n_trees: number of DecisionTree objects to build
    :param tree_config: optional mapping of keyword arguments passed to every
        DecisionTree; its entries override the defaults below
    """
    # The default asks for the "random_best" cut so the trees get some randomness.
    defaults = {"cut_dim": "random_best"}
    overrides = tree_config or {}
    merged = {**defaults, **overrides}

    forest = []
    for _ in range(n_trees):
        forest.append(DecisionTree(**merged))
    self.trees = forest
def create_tree(_dataset):
    """Wrap a dataset in a new DecisionTree.

    :param _dataset: dataset passed to the DecisionTree constructor (per the
        original note it becomes the data assigned to the root of the tree)
    :return: the newly constructed DecisionTree; no training is done here
    """
    untrained_tree = DecisionTree(_dataset)
    return untrained_tree
def part1(t_data, v_data):
    """Build a single tree on the training data and plot accuracy curves.

    The tree is built by ``create_dt_classifier(t_data, 9, tree, 0)``.  It is
    then scored on the training and validation sets for each value 0..9 passed
    as the last argument of ``check_accuracy_with_trees`` (presumably the
    evaluation depth, going by the plot title -- confirm against that helper).

    :param t_data: training data
    :param v_data: validation data
    """
    root = DecisionTree()
    forest = [root]
    started = time.time()
    create_dt_classifier(t_data, 9, root, 0)
    print("Completed ", " Time : ", (time.time() - started))

    train_scores = []
    val_scores = []
    steps = []
    for step in range(10):
        # Training score first, then validation, exactly once per step.
        train_scores.append(check_accuracy_with_trees(t_data, forest, step))
        val_scores.append(check_accuracy_with_trees(v_data, forest, step))
        steps.append(step)

    accuracy = [train_scores, val_scores]
    print("Completed ", " Time : ", (time.time() - started))
    print(accuracy)

    # Both curves share the same x values, so the same list is passed twice.
    plot(
        [steps, steps],
        accuracy,
        "Accuracy Vs Depth",
        ["Training", "Validation"],
        ["Accuracy in %", "Depth"],
    )
def create_dt_classifier(data, depth, tree, m):
    """Recursively grow ``tree`` in place from ``data``.

    :param data: 2-D array; column 0 is handed to ``gini_function`` and the
        remaining columns are searched for a split (presumably column 0 holds
        the labels -- confirm against the caller)
    :param depth: remaining number of levels that may still be split
    :param tree: DecisionTree node to fill; children are attached to it
    :param m: forwarded to ``get_sampled_features`` (feature-sampling setting)
    :return: None; the result is the mutated ``tree``
    """

    def _close_as_leaf():
        # A leaf stores only the prediction computed from the rows that reached it.
        tree.insert(None, None, get_leaf_prediction_value(data), True)

    if depth == 0:
        _close_as_leaf()
        return

    parent_impurity = gini_function(data[:, 0])
    candidates, sampled_columns = get_sampled_features(data, m)

    # Keep the candidate with the strictly largest gain; ties keep the earlier one.
    best_gain, best_feature, best_threshold = 0, 0, 0
    for column in range(1, candidates.shape[1]):
        found_feature, found_gain, found_threshold = get_feature_gain(
            candidates, column, parent_impurity)
        if found_gain > best_gain:
            best_gain, best_feature, best_threshold = (
                found_gain, found_feature, found_threshold)

    if best_gain == 0:
        # No candidate improved on the parent, so stop splitting here.
        _close_as_leaf()
        return

    if sampled_columns is not None:
        # Translate the index within the sampled view back to a column of ``data``.
        best_feature = sampled_columns[best_feature]

    # Sort rows by the chosen feature and cut at the first row whose value
    # reaches the threshold: pieces[0] is below it, pieces[1] is at or above it.
    ordered = data[np.argsort(data[:, best_feature])]
    cut_at = np.where(ordered[:, best_feature] >= best_threshold)[0][:1]
    pieces = np.split(ordered, cut_at)
    at_or_above = pieces[1]
    below = pieces[0]

    tree.insert(best_threshold, best_feature,
                get_leaf_prediction_value(data), False)
    tree.left = DecisionTree()
    tree.right = DecisionTree()

    remaining = depth - 1
    create_dt_classifier(at_or_above, remaining, tree.left, m)
    create_dt_classifier(below, remaining, tree.right, m)
def create_dt_classifier(data, depth, tree):
    """Recursively grow ``tree`` in place from ``data`` (``ada_*`` helper variant).

    :param data: 2-D array; columns 0-1 are handed to ``ada_gini_function`` and
        columns 2 onward are searched for a split (presumably label and sample
        weight followed by the features -- confirm against the caller)
    :param depth: remaining number of levels that may still be split
    :param tree: DecisionTree node to fill; children are attached to it
    :return: None; the result is the mutated ``tree``
    """

    def _close_as_leaf():
        # A leaf stores only the prediction computed from the rows that reached it.
        tree.insert(None, None, ada_get_leaf_prediction_value(data), True)

    if depth == 0:
        _close_as_leaf()
        return

    parent_impurity = ada_gini_function(data[:, 0:2])

    # Keep the candidate with the strictly largest gain; ties keep the earlier one.
    best_gain, best_feature, best_threshold = 0, 0, 0
    for column in range(2, data.shape[1]):
        found_feature, found_gain, found_threshold = get_feature_gain(
            data, column, parent_impurity)
        if found_gain > best_gain:
            best_gain, best_feature, best_threshold = (
                found_gain, found_feature, found_threshold)

    if best_gain == 0:
        # No candidate improved on the parent, so stop splitting here.
        _close_as_leaf()
        return

    # Sort rows by the chosen feature and cut at the first row whose value
    # reaches the threshold: pieces[0] is below it, pieces[1] is at or above it.
    ordered = data[np.argsort(data[:, best_feature])]
    cut_at = np.where(ordered[:, best_feature] >= best_threshold)[0][:1]
    pieces = np.split(ordered, cut_at)
    at_or_above = pieces[1]
    below = pieces[0]

    tree.insert(best_threshold, best_feature,
                ada_get_leaf_prediction_value(data), False)
    tree.left = DecisionTree()
    tree.right = DecisionTree()

    remaining = depth - 1
    create_dt_classifier(at_or_above, remaining, tree.left)
    create_dt_classifier(below, remaining, tree.right)
def part2():
    """Plot test accuracy against training-sample percentage for both data sets.

    For each data set (diabetes, heart) and each sampling rate in
    5%, 10%, 20%, 50%, 100%, a tree is built with m = 4 on every training
    subset yielded by ``selectSample`` (the original note says 10 subsets per
    rate) and scored on the test set.  The average, maximum and minimum
    accuracy per rate are drawn in that data set's subplot and the figure is
    saved to '../part2.pdf'.
    """
    panel_of = {'diabetes': 1, 'heart': 2}
    datasets = [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart'),
    ]

    plt.figure()
    for trainFileName, testFileName, key in datasets:
        attribute, trainset = data_provider(trainFileName)
        # Only the instances of the test file are used; its attributes are ignored.
        _test_attribute, testset = data_provider(testFileName)
        m = 4

        avgPoints, maxPoints, minPoints = [], [], []
        for rate in (0.05, 0.1, 0.2, 0.5, 1):
            percent = int(rate * 100)
            accuracys = []
            for newTrainset in selectSample(trainset, rate):
                root = TreeNode(newTrainset, attribute)
                curTree = DecisionTree(root)
                curTree.createTree(root, m)
                # One verdict per test instance: does the prediction equal
                # the instance's last field?
                verdicts = [
                    curTree.predict(root, instance) == instance[-1]
                    for instance in testset
                ]
                hits = sum(1 for verdict in verdicts if verdict)
                accuracys.append(float(hits) / len(verdicts))

            avgPoints.append([percent, float(sum(accuracys)) / len(accuracys)])
            maxPoints.append([percent, max(accuracys)])
            minPoints.append([percent, min(accuracys)])

        ax = plt.subplot(1, 2, panel_of[key])
        ax.set_xlim(0, 105)
        ax.set_ylim(0.45, 0.9)
        ax.set_ylabel('accuracy')
        ax.set_title(key)
        for series, caption in ((avgPoints, 'average'),
                                (maxPoints, 'maximum'),
                                (minPoints, 'minimum')):
            ax.plot([point[0] for point in series],
                    [point[1] for point in series],
                    label=caption)
        ax.legend()

    plt.xlabel('dataset sample percentage')
    plt.savefig('../part2.pdf')
def __init__(self, x, y, depth=2):
    """Build the decision tree and the layers attached to it.

    :param x: input data passed to DecisionTree
    :param y: targets passed to DecisionTree; ``len(y)`` gives the number of
        samples whose indexes are handed over as ``idxs``
    :param depth: depth passed to DecisionTree (default 2).  It used to be
        ignored because the call hard-coded ``depth=2``.
    """
    # Initialise nn.Module bookkeeping before any attribute is assigned.
    super(treeNet, self).__init__()

    H = 100      # width of the hidden linear layer
    D_out = 1    # one output per node-level linear model

    self.dTree = DecisionTree(x, y, idxs=range(len(y)), depth=depth)

    # All layers are moved to the GPU at construction time.
    self.linear1 = torch.nn.Linear(self.dTree.D_in, H).cuda()
    # One linear model per tree node, as counted by dTree.nNodes.
    self.theta = torch.nn.ModuleList([
        torch.nn.Linear(self.dTree.D_in, D_out)
        for _ in range(self.dTree.nNodes)
    ]).cuda()
    self.sigmoid = torch.nn.Sigmoid().cuda()
# calc_misclassification_rate(training_dataframe, validation_dataframe, criterion):
#   Fits DecisionTree(criterion) on training_dataframe[categorical_columns]
#   against the 'num' column, prunes it with the validation frame (every column
#   except "num" as features, "num" as target), then counts the validation rows
#   for which dt.root.evaluate(row) differs from the row's "num" value.
#   Prints and returns (err, dt), where err is that count divided by
#   len(validation_dataframe).  categorical_columns is read from module scope.
#
# The statements after the function pick a gini tree and an entropy tree, render
# each through a graphviz Digraph ("tree_gini.pdf" / "tree_entropy.pdf"), and
# then fit, plot and save scikit-learn trees for both criteria.
#
# NOTE(review): the calls below pass only criterion=..., while the visible
#   signature also requires training_dataframe and validation_dataframe; as
#   written they would raise TypeError.  A wrapper or decorator that supplies
#   the frames may exist outside this view -- confirm.
# NOTE(review): max(..., key=lambda x: x[0]) expects a sequence of (err, tree)
#   pairs and selects the pair with the LARGEST error, yet the result is printed
#   as the "best" tree.  Confirm whether min() was intended.
def calc_misclassification_rate(training_dataframe, validation_dataframe, criterion): err = 0 x = training_dataframe[categorical_columns] y = training_dataframe['num'] dt = DecisionTree(criterion) dt.fit(x, y) dt.prune( validation_dataframe.loc[:, validation_dataframe.columns != "num"], validation_dataframe.loc[:, "num"]) for i in validation_dataframe.index: if (dt.root.evaluate(validation_dataframe.loc[ i, validation_dataframe.columns != "num"]) != validation_dataframe.loc[i, "num"]): err += 1 err = err / len(validation_dataframe) print((err, dt)) return (err, dt) gini_trees = calc_misclassification_rate(criterion="gini") gtree = max(gini_trees, key=lambda x: x[0])[1] print("best gini tree = {}".format(gtree)) Gg = Digraph("", filename="tree_gini.pdf") gtree.plot(Gg) Gg.view() entropy_trees = calc_misclassification_rate(criterion="entropy") etree = max(entropy_trees, key=lambda x: x[0])[1] print("best entropy tree = {}".format(etree)) Ge = Digraph("", filename="tree_entropy.pdf") etree.plot(Ge) Ge.view() fig, ax = plt.subplots(nrows=1, ncols=1) clf = tree.DecisionTreeClassifier(criterion="entropy") clf = clf.fit(categorical_features, df.num) tree.plot_tree(clf, ax=ax) plt.savefig("sklearn_entropy") plt.show() fig, ax = plt.subplots(nrows=1, ncols=1) clf = tree.DecisionTreeClassifier(criterion="gini") clf = clf.fit(categorical_features, df.num) tree.plot_tree(clf, ax=ax) plt.savefig("sklearn_gini") plt.show()
def compare_algorithm():
    """Compare the local DecisionTree with scikit-learn's on the voting data.

    Both classifiers are fitted on the same random training split, their
    predictions on the test split are compared with the target column named by
    ``headers[-1]``, and the two accuracies are printed.
    """
    data, targets, headers = get_voting()

    # Random train/test partition.
    train_data, test_data, train_target, test_target = split_data(data, targets)

    # Re-number every frame from 0 so rows can be addressed by position below.
    for frame in (train_data, test_data, train_target, test_target):
        frame.reset_index(inplace=True, drop=True)

    samClassifier = DecisionTree()
    skClassifer = tree.DecisionTreeClassifier()

    samModel = samClassifier.fit(train_data, train_target, headers)
    skModel = skClassifer.fit(train_data, train_target)

    samPredicted = samModel.predict(test_data)
    skPredicted = skModel.predict(test_data)

    # The last header names the target column, which holds the expected labels.
    expected = test_target[headers[-1]]

    n_test = len(test_data)
    skCount = sum(1 for row in range(n_test)
                  if skPredicted[row] == expected[row])
    samCount = sum(1 for row in range(n_test)
                   if samPredicted[row] == expected[row])

    samAccuracy = get_accuracy(samCount, n_test)
    skAccuracy = get_accuracy(skCount, n_test)
    print("Sam's ID3 Accuracy: {:.2f}%. \nSK's ID3 Accuracy: {:.2f}%.".format(samAccuracy, skAccuracy))
def execute_algorithm(dataset):
    """Train the DecisionTree on one data set and print its test accuracy.

    :param dataset: selector for the data to load: 1 calls ``get_loans()``,
        2 calls ``get_voting()``
    :raises ValueError: if ``dataset`` is neither 1 nor 2.  Such values used to
        fall through both branches and fail later with UnboundLocalError.
    """
    # Validate first so a bad selector fails with a clear message before any
    # work is done.
    if dataset not in (1, 2):
        raise ValueError(
            "Unknown dataset selector {!r}; expected 1 (loans) or 2 (voting)"
            .format(dataset))

    classifier = DecisionTree()

    if dataset == 1:
        data, targets, headers = get_loans()
    else:
        data, targets, headers = get_voting()

    # Random train/test partition.
    train_data, test_data, train_target, test_target = split_data(data, targets)

    # Re-number every frame from 0 so rows can be addressed by position below.
    for frame in (train_data, test_data, train_target, test_target):
        frame.reset_index(inplace=True, drop=True)

    model = classifier.fit(train_data, train_target, headers)

    # Per the original comment, print_id3 asks the user whether to display the tree.
    print_id3(model)

    target_predicted = model.predict(test_data)

    # The last header names the target column, which holds the expected labels.
    expected = test_target[headers[-1]]

    count = sum(1 for index in range(len(target_predicted))
                if target_predicted[index] == expected[index])

    accuracy = get_accuracy(count, len(test_data))
    print("Accuracy: {:.2f}%".format(accuracy))
def main():
    """Score every car in 'car_data.csv' with the JSON-defined tree and plot the scores.

    The tree is built from ``jsonTrees/<testName>.json``.  Each CSV row is
    converted with ``convertCarData``, fed to the tree and scored.  The three
    highest-scoring rows are pretty-printed, and a normalised histogram of all
    scores is saved to ``graphs/<testName><wValue>.png`` and then shown.
    ``testName`` and ``wValue`` are read from module scope.
    """
    # Import the data from a csv file.
    car_data = np.genfromtxt('car_data.csv', delimiter=',')

    # The tree creator takes the path of the json file describing the tree.
    dTree = DecisionTree('jsonTrees/' + testName + '.json')

    scores = []
    # Running best [score, row]; kept only for ad-hoc debugging output.
    best = [0, 0]
    ranked = []
    for data in car_data:
        # Load this car's inputs into the tree, then evaluate it.
        dTree.changeInputs(convertCarData(data))
        score = dTree.run()
        if score > best[0]:
            best[0] = score
            best[1] = data
        scores.append(score)
        ranked.append([score, data.tolist()])

    # Highest score first; show the top three.
    ranked.sort(key=lambda entry: entry[0], reverse=True)
    pprint(ranked[0:3])

    # Normalised histogram of the scores.  ``density=True`` replaces the
    # ``normed`` keyword, which current matplotlib releases no longer accept.
    plt.hist(scores, density=True, facecolor='green', alpha=0.75)
    plt.title(testName + wValue)
    # Save the image to a file, then show it.
    plt.savefig("graphs/" + testName + wValue + ".png", bbox_inches='tight')
    plt.show()
def part3():
    """Plot test accuracy for m in (2, 5, 10, 20) on both data sets.

    For each data set (diabetes, heart) one TreeNode/DecisionTree pair is
    created from the full training set; ``createTree(root, m)`` is then called
    for every m and the tree is scored on the test set.  Each (m, accuracy)
    point is drawn and annotated in that data set's subplot and the figure is
    saved to '../part3.pdf'.
    """
    panel_of = {'diabetes': 1, 'heart': 2}
    datasets = [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart'),
    ]

    plt.figure()
    for trainFileName, testFileName, key in datasets:
        attribute, trainset = data_provider(trainFileName)
        # Only the instances of the test file are used; its attributes are ignored.
        _test_attribute, testset = data_provider(testFileName)

        root = TreeNode(trainset, attribute)
        curTree = DecisionTree(root)

        points = []
        for m in (2, 5, 10, 20):
            curTree.createTree(root, m)
            # One verdict per test instance: does the prediction equal the
            # instance's last field?
            verdicts = [
                curTree.predict(root, instance) == instance[-1]
                for instance in testset
            ]
            hits = sum(1 for verdict in verdicts if verdict)
            points.append([m, float(hits) / len(verdicts)])

        for x, y in points:
            ax = plt.subplot(2, 1, panel_of[key])
            ax.set_xlim(0, 22)
            ax.set_ylim(0.6, 0.8)
            ax.set_ylabel('accuracy')
            ax.set_title(key)
            plt.annotate('%.3f' % y, xy=(x - 0.02, y + 0.02))
            plt.annotate('m=%d' % x, xy=(x - 0.02, y - 0.07))
            ax.plot(x, y, 'o-')

    plt.xlabel('tree number m')
    plt.savefig('../part3.pdf')
def setUp(self):
    """Create the DecisionTree used by the tests, constructed with "c4.5"."""
    tree_under_test = DecisionTree("c4.5")
    self.decision_tree = tree_under_test
def main():
    """Run the first entry of ``inputs2`` through three trees and plot the outputs.

    Trees are built from ``jsonTrees/<testA|testN|testE>.json``.  Only the
    first input set is processed because the loop ends with ``break``.  The
    three scores are printed and then drawn in one figure: a 'trap' MemFunc
    built from the A score, the E score as a two-column curve, and the N score
    as a vertical line.  ``inputs2``, ``testA``, ``testN`` and ``testE`` are
    read from module scope.
    """

    def _tree_for(test_name):
        # The tree creator takes the path of the json file describing the tree.
        return DecisionTree('jsonTrees/' + test_name + '.json')

    # Neither of these two objects is used further down in this function.
    fmemFile = File("fmemFile.csv")
    car_data = np.genfromtxt('car_data.csv', delimiter=',')

    aTree = _tree_for(testA)
    nTree = _tree_for(testN)
    eTree = _tree_for(testE)
    trees = (aTree, nTree, eTree)

    for inputs in inputs2:
        # Load the same inputs into every tree, then evaluate them in order.
        for each_tree in trees:
            each_tree.changeInputs(inputs)
        aScore, nScore, eScore = [each_tree.run() for each_tree in trees]

        print("Inputs:", inputs)
        print("ASCORE #######:", aScore)
        print("NSCORE #######:", nScore)
        print("ESCORE #######:", eScore)

        # Column 0 is plotted as x and column 1 as y below.
        e_curve = np.array(eScore)
        membership = MemFunc('trap', aScore)
        grid = np.arange(0, 1, .05)

        (l1,) = plt.plot(grid, [membership.memFunc(point) for point in grid],
                         c='r', linewidth=2.0, label="AlphaCuts")
        (l2,) = plt.plot(e_curve[:, 0], e_curve[:, 1],
                         c='b', linewidth=2.0, label="Extention Principle")
        l3 = plt.axvline(nScore, c='g', linewidth=2.0, label="Crisp")

        plt.legend(handles=[l1, l2, l3])
        plt.title("Regular Title")
        plt.xlabel("Output Score")
        plt.ylabel("Membership Value")
        plt.show()
        # Stop after the first input set.
        break
from tree import DecisionTree training_data = [ ['Green', 3, 'Apple'], ['Yellow', 3, 'Apple'], ['Red', 1, 'Grape'], ['Red', 1, 'Grape'], ['Yellow', 3, 'Lemon'], ] decison_tree = DecisionTree() tree = decison_tree.build_tree(training_data) decison_tree.print_tree(tree) def pretty_print_leaf_predictions(counts): total = sum(counts.values()) * 1.0 probabilities = {} for label in counts.keys(): probabilities[label] = str(int(counts[label] / total * 100)) + "%" return probabilities pretty_print_leaf_predictions(decison_tree.classify(training_data[0], tree)) testing_data = [ ['Green', 3, 'Apple'], ['Yellow', 4, 'Apple'], ['Red', 2, 'Grape'],
# Compare the local DecisionTree with scikit-learn's DecisionTreeClassifier on
# the single encoded 'Sex' feature of /app/data/train.csv, target 'Survived'.
import numpy as np
import pandas as pd
# classification_report, train_test_split and DecisionTreeClassifier are used
# below but were never imported, so the script stopped with NameError.
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

from tree import DecisionTree

if __name__ == '__main__':
    train_df = pd.read_csv('/app/data/train.csv')

    # Encode the 'Sex' column as integers in a new 'SexInt' column.
    le_sex = LabelEncoder()
    le_sex.fit(train_df['Sex'])
    train_df.loc[:, 'SexInt'] = le_sex.transform(train_df['Sex'])

    X = np.array(train_df[['SexInt']])
    y = train_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=71)

    # Local implementation.
    tree = DecisionTree(max_depth=3)
    tree.fit(X_train, y_train)
    print(classification_report(y_train, tree.predict(X_train)))
    print(classification_report(y_test, tree.predict(X_test)))
    # tree.make_graph()

    # scikit-learn reference with the same depth limit.
    s_tree = DecisionTreeClassifier(max_depth=3)
    s_tree.fit(X_train, y_train)
    print(classification_report(y_train, s_tree.predict(X_train)))
    print(classification_report(y_test, s_tree.predict(X_test)))
    # The result is discarded; kept as in the original.
    s_tree.predict_proba(X_test)
sys.exit() trainFileName = sys.argv[1] testFileName = sys.argv[2] try: m = int(sys.argv[3]) except: print >> sys.stderr, "[ERROR] [m] should be in integer!" sys.exit() attribute, trainset = data_provider(trainFileName) testAttribute, testset = data_provider(testFileName) try: assert (testAttribute == attribute) except AssertionError: print >> sys.stderr, "[ERROR] pls check the attributes of test data." sys.exit() # train root = TreeNode(trainset, attribute) curTree = DecisionTree(root) curTree.createTree(root, m) curTree.printTree(root, 0) # test print '<Predictions for the Test Set Instances>' index = 1 for instance in testset: print '{}: Actual: {} Predicted: {}'.format( index, instance[-1], curTree.predict(root, instance)) index += 1
# 10-fold cross-validation of the local DecisionTree on Font_dataset.txt.
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import accuracy_score
from pprint import pprint

from tree import DecisionTree

# Load the data.
data = pd.read_table('Font_dataset.txt', header=None, sep=',')
# Feature data and labels: column 4 is the label, the other columns are features.
X = data.drop(4, axis=1)
y = data[4]

clf = DecisionTree()
print(u"*****在自己的决策树上进行10折交叉验证*****")
test_accuracy = []
L = X.shape[0]
kf = KFold(L, n_folds=10, random_state=2018)
count = 0
for train_index, test_index in kf:
    count += 1
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    # Train on this fold's training split only.  Fitting on all of X and y, as
    # before, let the model see the test fold and inflated the accuracy.
    clf.fit(X_train, y_train)
    # Test on the held-out fold.
    test_pre = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_pre)
def printInputs():
    """Build a DecisionTree from ``jsonTrees/<testName>.json`` and call its printInputs().

    ``testName`` is read from module scope.
    """
    json_path = 'jsonTrees/' + testName + '.json'
    DecisionTree(json_path).printInputs()
# Fit the local DecisionTree on the first 80% of the iris samples and print its
# predictions for the remaining 20% next to the expected labels.
from tree import DecisionTree
from iris_dataset import vectors, labels

# Index of the first held-out sample.
N = int(len(vectors)*0.8)
training_vectors, test_vectors = vectors[:N], vectors[N:]
training_labels, test_labels = labels[:N], labels[N:]

tree = DecisionTree(leaf_size=1, n_trials=1)
tree.fit(training_vectors, training_labels)
results = tree.predict(test_vectors)
tree.show()

print("results:{}".format(results))
print("answers:{}".format(test_labels))
ts = time.time() D = np.full((len(t_data)), 1 / len(t_data)) print(D) ada_test_data = np.insert(t_data, 1, D, axis=1) ada_val_data = np.insert(v_data, 1, np.full(len(v_data), 1), axis=1) # print(ada_test_data) for j in range(0, len(l_list)): print("L : ", l_list[j]) tree_list = [] alpha_list = [] ada_test_data = np.insert(t_data, 1, D, axis=1) # np.zeros((2, 1)) for l in range(0, l_list[j]): tree = DecisionTree() create_dt_classifier(ada_test_data, d, tree) tree_list.append(tree) alpha = get_params(ada_test_data, tree, d) alpha_list.append(alpha) print(ada_test_data[:, 1]) # print(ada_test_data) train_accuracy = check_accuracy_with_trees(ada_test_data, tree_list, d, alpha_list) train_accuracy_list.append(train_accuracy) val_accuracy = check_accuracy_with_trees(ada_val_data, tree_list, d, alpha_list) val_accuracy_list.append(val_accuracy) iterations.append(l_list[j]) # print(train_accuracy)