def etrims_tree(n_hidden=[1000], coef=[1000.], size=6):
    """Train and score a DecisionTree and an ExtremeDecisionTree on eTRIMS.

    size -- half-width of the image patch; the patch is (2*size+1)^2 pixels.
    NOTE(review): n_hidden and coef are accepted for interface compatibility
    but are not used by the body (elm_hidden is derived from size instead).
    """
    print_time('tree2etrims test size is %d' % size)

    print_time('load_etrims')
    train_data, train_signal, test_data, test_signal = load_etrims(size=size)

    num_function = 100

    print_time('train_DecisionTree num function is %d' % num_function)
    dt = DecisionTree(num_function=num_function)
    dt.fit(train_data, train_signal)

    print_time('test_DecisionTree')
    score = dt.score(test_data, test_signal)
    print_time('score is %f' % score)

    print_time('DecisionTree info')
    dt.info()

    # hidden layer sized to the flattened (2*size+1)^2 patch times 2 channels
    elm_hidden = [(2 * size + 1) * (2 * size + 1) * 2]
    print_time('train_ExtremeDecisionTree elm_hidden is %d, num function is %d'
               % (elm_hidden[0], num_function))
    edt = ExtremeDecisionTree(elm_hidden=elm_hidden, elm_coef=None,
                              num_function=num_function)
    edt.fit(train_data, train_signal)

    # BUG FIX: this test/score block appeared twice verbatim in the original,
    # scoring the same model on the same data a second time for no effect.
    print_time('test_ExtremeDecisionTree')
    score = edt.score(test_data, test_signal)
    print_time('score is %f' % score)

    print_time('ExtremeDecisionTree info')
    edt.info()

    print_time('tree2etrims test is finished !')
def mnist_mlelm(n_hidden=[1000]): print "hidden:", n_hidden # initialize train_set, valid_set, test_set = load_mnist() train_data, train_target = train_set valid_data, valid_target = valid_set test_data, test_target = test_set # size train_size = 500 # max 50000 valid_size = 10 # max 10000 test_size = 10 # max 10000 train_data, train_target = train_data[:train_size], train_target[:train_size] valid_data, valid_target = valid_data[:valid_size], valid_target[:valid_size] test_data, test_target = test_data[:test_size], test_target[:test_size] # add valid_data/target to train_data/target """ train_data = train_data + valid_data train_target = train_target + valid_target """ # model dt = DecisionTree() #""" edt1 = ExtremeDecisionTree(elm_hidden=n_hidden) edt2 = ExtremeDecisionTree(elm_hidden=n_hidden, elm_coef=[1000., 100., 1000.]) #""" # fit #print "fitting ..." dt.fit(train_data, train_target) #""" edt1.fit(train_data, train_target) edt2.fit(train_data, train_target) #""" # test print "test score is ", score_dt = dt.score(test_data, test_target) #""" score_edt1 = edt1.score(test_data, test_target) score_edt2 = edt2.score(test_data, test_target) print score_dt, score_edt1, score_edt2 #""" #print score_dt print "dt" dt.info() #""" print "edt1" edt1.info() print "edt2" edt2.info()
def calc_misclassification_rate(training_dataframe, validation_dataframe, criterion):
    """Fit a DecisionTree on the training frame, prune it against the
    validation frame, and return (misclassification_rate, tree).

    Relies on the module-level global `categorical_columns`; the target
    column is named "num".
    """
    err = 0
    x = training_dataframe[categorical_columns]
    y = training_dataframe['num']
    dt = DecisionTree(criterion)
    dt.fit(x, y)
    # prune using all non-target columns of the validation frame
    dt.prune(
        validation_dataframe.loc[:, validation_dataframe.columns != "num"],
        validation_dataframe.loc[:, "num"])
    # count validation rows the pruned tree misclassifies
    for i in validation_dataframe.index:
        if (dt.root.evaluate(validation_dataframe.loc[
                i, validation_dataframe.columns != "num"]) !=
                validation_dataframe.loc[i, "num"]):
            err += 1
    err = err / len(validation_dataframe)
    print((err, dt))
    return (err, dt)


# NOTE(review): the calls below pass only `criterion`, but the function
# requires training_dataframe and validation_dataframe -- this raises
# TypeError as written. Also the function returns a single (err, tree)
# tuple, yet the result is treated as an iterable of tuples; and `max`
# on the error rate selects the WORST tree despite the "best" label --
# `min` is presumably intended. Confirm the missing CV-driver code.
gini_trees = calc_misclassification_rate(criterion="gini")
gtree = max(gini_trees, key=lambda x: x[0])[1]
print("best gini tree = {}".format(gtree))
Gg = Digraph("", filename="tree_gini.pdf")
gtree.plot(Gg)
Gg.view()

entropy_trees = calc_misclassification_rate(criterion="entropy")
etree = max(entropy_trees, key=lambda x: x[0])[1]
print("best entropy tree = {}".format(etree))
Ge = Digraph("", filename="tree_entropy.pdf")
etree.plot(Ge)
Ge.view()

# sklearn reference trees on the same features, saved as images
fig, ax = plt.subplots(nrows=1, ncols=1)
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(categorical_features, df.num)
tree.plot_tree(clf, ax=ax)
plt.savefig("sklearn_entropy")
plt.show()

fig, ax = plt.subplots(nrows=1, ncols=1)
clf = tree.DecisionTreeClassifier(criterion="gini")
clf = clf.fit(categorical_features, df.num)
tree.plot_tree(clf, ax=ax)
plt.savefig("sklearn_gini")
plt.show()
class DecisionTreeC45TestCase(unittest.TestCase):
    """Unit tests for DecisionTree constructed with the C4.5 criterion."""

    def setUp(self):
        self.decision_tree = DecisionTree("c4.5")

    def tearDown(self):
        self.decision_tree = None

    def test_fit(self):
        samples = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        labels = ["yes", "yes", "no", "no", "no"]
        feat_names = ['no surfacing', 'flippers']
        expected = {
            'no surfacing': {
                0: 'no',
                1: {'flippers': {0: 'no', 1: 'yes'}},
            }
        }

        # fit accepts plain Python lists ...
        self.decision_tree.fit(samples, labels, feat_names)
        self.assertEqual(self.decision_tree.tree, expected)

        # ... and numpy arrays, producing the identical tree
        self.decision_tree.fit(np.asarray(samples), np.asarray(labels), feat_names)
        self.assertEqual(self.decision_tree.tree, expected)

    def test_predict(self):
        # Prediction is independent of the split criterion, so it is
        # covered by the base DecisionTree test case.
        pass
def compare_algorithm():
    """Fit Sam's ID3 tree and sklearn's classifier on the voting data
    and print both accuracy percentages."""
    data, targets, headers = get_voting()

    # random train/test split
    train_data, test_data, train_target, test_target = split_data(data, targets)

    # re-index every split frame so positional access works below
    for frame in (train_data, test_data, train_target, test_target):
        frame.reset_index(inplace=True, drop=True)

    # build both models
    samModel = DecisionTree().fit(train_data, train_target, headers)
    skModel = tree.DecisionTreeClassifier().fit(train_data, train_target)

    # predictions for the held-out rows
    samPredicted = samModel.predict(test_data)
    skPredicted = skModel.predict(test_data)

    # the last header names the target column -- needed to score accuracy
    test_target = test_target[headers[-1]]

    # tally correct predictions for each model
    samCount = 0
    skCount = 0
    for index in range(len(test_data)):
        if skPredicted[index] == test_target[index]:
            skCount += 1
        if samPredicted[index] == test_target[index]:
            samCount += 1

    samAccuracy = get_accuracy(samCount, len(test_data))
    skAccuracy = get_accuracy(skCount, len(test_data))
    print("Sam's ID3 Accuracy: {:.2f}%. \nSK's ID3 Accuracy: {:.2f}%.".format(samAccuracy, skAccuracy))
def execute_algorithm(dataset):
    """Run the DecisionTree (ID3) workflow on the chosen dataset and
    print the resulting accuracy.

    dataset -- 1 for the loans data, 2 for the voting data.
    Raises ValueError for any other value.
    """
    classifier = DecisionTree()

    # determine which dataset to retrieve
    if dataset == 1:
        data, targets, headers = get_loans()
    elif dataset == 2:
        data, targets, headers = get_voting()
    else:
        # BUG FIX: unknown ids previously fell through, leaving `data`
        # undefined and crashing later with NameError
        raise ValueError("dataset must be 1 (loans) or 2 (voting), got %r" % (dataset,))

    count = 0
    # split dataset into random parts
    train_data, test_data, train_target, test_target = split_data(data, targets)

    # reset the indexes so the dataframes can be properly parsed
    train_data.reset_index(inplace=True, drop=True)
    test_data.reset_index(inplace=True, drop=True)
    train_target.reset_index(inplace=True, drop=True)
    test_target.reset_index(inplace=True, drop=True)

    # build the tree
    model = classifier.fit(train_data, train_target, headers)

    # prompt the user if he/she wants to display the tree
    print_id3(model)

    # array of predictions, one per test row
    target_predicted = model.predict(test_data)

    # the last header names the target column
    test_target = test_target[headers[-1]]

    # count up the correct predictions
    for index in range(len(target_predicted)):
        if target_predicted[index] == test_target[index]:
            count += 1

    accuracy = get_accuracy(count, len(test_data))
    print("Accuracy: {:.2f}%".format(accuracy))
class DecisionTreeTestCase(unittest.TestCase):
    """Unittest for tree.DecisionTree (default criterion)."""

    def setUp(self):
        self.decision_tree = DecisionTree()

    def tearDown(self):
        self.decision_tree = None

    def test_fit(self):
        samples = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        labels = ["yes", "yes", "no", "no", "no"]
        feat_names = ['no surfacing', 'flippers']
        expected = {
            'no surfacing': {
                0: 'no',
                1: {'flippers': {0: 'no', 1: 'yes'}},
            }
        }

        # list inputs
        self.decision_tree.fit(samples, labels, feat_names)
        self.assertEqual(self.decision_tree.tree, expected)

        # numpy-array inputs yield the identical tree
        self.decision_tree.fit(np.asarray(samples), np.asarray(labels), feat_names)
        self.assertEqual(self.decision_tree.tree, expected)

    def test_predict(self):
        tree_fixture = {
            'no surfacing': {
                0: 'no',
                1: {'flippers': {0: 'no', 1: 'yes'}},
            }
        }

        # test 1: a single training item, features in tree order
        self.decision_tree.tree = tree_fixture
        self.assertEqual(
            'no',
            self.decision_tree.predict([1, 0], ['no surfacing', 'flippers']))

        # test 2: a batch of items with a permuted feature order
        self.decision_tree.tree = tree_fixture
        self.assertEqual(
            ["no", "no"],
            self.decision_tree.predict([[0, 1], [0, 0]],
                                       ['flippers', 'no surfacing']))
from sklearn.preprocessing import LabelEncoder
# BUG FIX: train_test_split, classification_report and DecisionTreeClassifier
# were used below without being imported (NameError at runtime).
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

from tree import DecisionTree

import pandas as pd
import numpy as np

if __name__ == '__main__':
    # Encode 'Sex' as an integer feature and predict 'Survived' from it alone.
    train_df = pd.read_csv('/app/data/train.csv')
    le_sex = LabelEncoder()
    le_sex.fit(train_df['Sex'])
    train_df.loc[:, 'SexInt'] = le_sex.transform(train_df['Sex'])

    X = np.array(train_df[['SexInt']])
    y = train_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=71)

    # custom tree: report train and test metrics
    tree = DecisionTree(max_depth=3)
    tree.fit(X_train, y_train)
    print(classification_report(y_train, tree.predict(X_train)))
    print(classification_report(y_test, tree.predict(X_test)))
    # tree.make_graph()

    # sklearn baseline with the same depth for comparison
    s_tree = DecisionTreeClassifier(max_depth=3)
    s_tree.fit(X_train, y_train)
    print(classification_report(y_train, s_tree.predict(X_train)))
    print(classification_report(y_test, s_tree.predict(X_test)))
    # NOTE(review): predict_proba result is discarded -- confirm intent
    s_tree.predict_proba(X_test)
from tree import DecisionTree
from iris_dataset import vectors, labels

# 80/20 train/test split of the iris data
split_at = int(len(vectors) * 0.8)
training_vectors = vectors[:split_at]
training_labels = labels[:split_at]
test_vectors = vectors[split_at:]
test_labels = labels[split_at:]

# fit a single decision tree and show its structure
tree = DecisionTree(leaf_size=1, n_trials=1)
tree.fit(training_vectors, training_labels)
results = tree.predict(test_vectors)
tree.show()

# predictions next to the ground truth for eyeballing
print("results:{}".format(results))
print("answers:{}".format(test_labels))
def main(): X, y = read_data('crx.data.txt') n_samples = X.shape[0] n_folds = 3 n_samples_per_fold = n_samples / n_folds cum_accuracy = 0.0 cum_p = 0.0 cum_r = 0.0 fold = 0 """ clf = DecisionTree(maxdepth=3) clf.fit(X, y) clf.print_tree() y_pred = clf.predict(X) print y.astype(np.int32) return """ for train_idx, test_idx in kfold(n_samples, n_folds): print "Fold", fold fold += 1 X_train, X_test = X[train_idx], X[test_idx] y_train, y_test = y[train_idx], y[test_idx] clf = DecisionTree(maxdepth=3) clf.fit(X_train, y_train) #clf.print_tree() y_pred = clf.predict(X_test) # TP, FP, TN and FN tp = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 1 and y_test[i] == 1]) tn = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 0 and y_test[i] == 0]) fp = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 1 and y_test[i] == 0]) fn = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 0 and y_test[i] == 1]) # accuracy for this fold acc = float(tp + tn)/(tp + tn + fp + fn) cum_accuracy += acc print "\tAccuracy:", acc # precision, recall try: p = float(tp) / (tp + fp) r = float(tp) / (tp + fn) cum_p += p cum_r += r f1 = 2 * p * r / (p + r) print "\tPrecision:", p print "\tRecall:", r print "\tF1:", f1 except: # divide by zero pass print print "Average accuracy:", cum_accuracy/n_folds print "Average precision:", cum_p/n_folds print "Average recall:", cum_r/n_folds """
from pprint import pprint

# load the dataset (comma-separated, no header row)
data = pd.read_table('Font_dataset.txt', header=None, sep=',')

# feature columns and the label column (column 4)
X = data.drop(4, axis=1)
y = data[4]

from tree import DecisionTree

clf = DecisionTree()

print(u"*****在自己的决策树上进行10折交叉验证*****")
test_accuracy = []
L = X.shape[0]
kf = KFold(L, n_folds=10, random_state=2018)
count = 0
for train_index, test_index in kf:
    count += 1
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    # BUG FIX: the original fitted on the FULL dataset (X.values, y.values)
    # inside the CV loop, leaking every test fold into training and
    # inflating the reported accuracy; fit on the training fold only.
    clf.fit(X_train, y_train)
    # evaluate on the held-out fold
    test_pre = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_pre)
    test_accuracy.append(test_acc)
    print('%d test accuracy_score :%.4f' % (count, test_acc))

print('mean test accuracy_score :%.4f' % np.mean(test_accuracy))