def part2():
    """Randomly choose 5%, 10%, 20%, 50%, 100% samples to train, and choose 10 sets each time."""
    plt.figure()
    # Subplot slot for each dataset (1 row x 2 columns).
    subplot_index = {'diabetes': 1, 'heart': 2}
    datasets = [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart'),
    ]
    for train_file, test_file, name in datasets:
        attribute, trainset = data_provider(train_file)
        _, testset = data_provider(test_file)
        m = 4
        avg_points = []
        max_points = []
        min_points = []
        for rate in (0.05, 0.1, 0.2, 0.5, 1):
            # One accuracy score per random sample drawn at this rate.
            scores = []
            for sampled in selectSample(trainset, rate):
                root = TreeNode(sampled, attribute)
                model = DecisionTree(root)
                model.createTree(root, m)
                hits = 0
                total = 0
                for instance in testset:
                    total += 1
                    if model.predict(root, instance) == instance[-1]:
                        hits += 1
                scores.append(float(hits) / total)
            percent = int(rate * 100)
            avg_points.append([percent, float(sum(scores)) / len(scores)])
            max_points.append([percent, max(scores)])
            min_points.append([percent, min(scores)])
        ax = plt.subplot(1, 2, subplot_index[name])
        ax.set_xlim(0, 105)
        ax.set_ylim(0.45, 0.9)
        ax.set_ylabel('accuracy')
        ax.set_title(name)
        for series, label in ((avg_points, 'average'),
                              (max_points, 'maximum'),
                              (min_points, 'minimum')):
            ax.plot([p[0] for p in series], [p[1] for p in series], label=label)
        ax.legend()
        plt.xlabel('dataset sample percentage')
    plt.savefig('../part2.pdf')
def part3():
    """Plot test-set accuracy for tree parameter m in (2, 5, 10, 20).

    One subplot per dataset (diabetes, heart); each point is annotated with
    its accuracy and the m that produced it. Saves the figure to ../part3.pdf.
    """
    plt.figure()
    subplot_index = {'diabetes': 1, 'heart': 2}
    for trainFileName, testFileName, key in [
            ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
            ('../heart_train.arff', '../heart_test.arff', 'heart')]:
        attribute, trainset = data_provider(trainFileName)
        testAttribute, testset = data_provider(testFileName)
        root = TreeNode(trainset, attribute)
        curTree = DecisionTree(root)
        # (m, accuracy) pairs.  The original initialised `points = {}` before
        # the loop and immediately shadowed it with a list — dead code, removed.
        points = []
        for m in (2, 5, 10, 20):
            # NOTE(review): createTree is re-run on the SAME root for each m;
            # this assumes createTree rebuilds from scratch — confirm in tree.py.
            curTree.createTree(root, m)
            trueSamples = 0
            falseSamples = 0
            for instance in testset:
                if curTree.predict(root, instance) == instance[-1]:
                    trueSamples += 1
                else:
                    falseSamples += 1
            points.append(
                [m, float(trueSamples) / (trueSamples + falseSamples)])
        for x, y in points:
            ax = plt.subplot(2, 1, subplot_index[key])
            ax.set_xlim(0, 22)
            ax.set_ylim(0.6, 0.8)
            ax.set_ylabel('accuracy')
            ax.set_title(key)
            plt.annotate('%.3f' % y, xy=(x - 0.02, y + 0.02))
            plt.annotate('m=%d' % x, xy=(x - 0.02, y - 0.07))
            ax.plot(x, y, 'o-')
        plt.xlabel('tree number m')
    plt.savefig('../part3.pdf')
class DecisionTreeTestCase(unittest.TestCase):
    """Unittest for tree.DecisionTree."""

    @staticmethod
    def _make_tree():
        # Fresh copy of the reference tree used by every test case
        # (returned anew each call so tests cannot share mutable state).
        return {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes',
                    }
                },
            }
        }

    def setUp(self):
        self.decision_tree = DecisionTree()

    def tearDown(self):
        self.decision_tree = None

    def test_fit(self):
        samples = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        targets = ["yes", "yes", "no", "no", "no"]
        feat_names = ['no surfacing', 'flippers']

        # Case 1: X and y are plain Python lists.
        self.decision_tree.fit(samples, targets, feat_names)
        self.assertEqual(self.decision_tree.tree, self._make_tree())

        # Case 2: X and y are numpy arrays.
        feat_names = ['no surfacing', 'flippers']
        self.decision_tree.fit(
            np.asarray(samples), np.asarray(targets), feat_names)
        self.assertEqual(self.decision_tree.tree, self._make_tree())

    def test_predict(self):
        # Case 1: a single training item.
        self.decision_tree.tree = self._make_tree()
        self.assertEqual(
            'no',
            self.decision_tree.predict([1, 0], ['no surfacing', 'flippers']))

        # Case 2: a batch of items with the feature columns swapped.
        self.decision_tree.tree = self._make_tree()
        self.assertEqual(
            ["no", "no"],
            self.decision_tree.predict(
                [[0, 1], [0, 0]], ['flippers', 'no surfacing']))
#Data normalization X -= X.min() X /= X.max() #Instanciation tree = DecisionTree() #Training tree.train(X_train, y_train) #Test dataset X = np.loadtxt('test_data') y = np.loadtxt('test_labels') X, y = shuffle(X, y) #Data normalization X -= X.min() X /= X.max() #Predictions predictions = tree.predict(X_test) #Report print classification_report(y, predictions) print 'Accuracy: ' + str(accuracy_score(tags, preds))
from sklearn.preprocessing import LabelEncoder
# The following three were used but never imported (NameError at runtime).
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

from tree import DecisionTree
import pandas as pd
import numpy as np

if __name__ == '__main__':
    # Load the training data and integer-encode the 'Sex' column.
    train_df = pd.read_csv('/app/data/train.csv')
    le_sex = LabelEncoder()
    le_sex.fit(train_df['Sex'])
    train_df.loc[:, 'SexInt'] = le_sex.transform(train_df['Sex'])

    # Single-feature model: encoded sex -> survival.
    X = np.array(train_df[['SexInt']])
    y = train_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=71)

    # Our own implementation, depth-limited to 3.
    tree = DecisionTree(max_depth=3)
    tree.fit(X_train, y_train)
    print(classification_report(y_train, tree.predict(X_train)))
    print(classification_report(y_test, tree.predict(X_test)))
    # tree.make_graph()

    # scikit-learn baseline with the same depth limit, for comparison.
    s_tree = DecisionTreeClassifier(max_depth=3)
    s_tree.fit(X_train, y_train)
    print(classification_report(y_train, s_tree.predict(X_train)))
    print(classification_report(y_test, s_tree.predict(X_test)))
    # Result unused in the original; kept for parity.
    s_tree.predict_proba(X_test)
sys.exit() trainFileName = sys.argv[1] testFileName = sys.argv[2] try: m = int(sys.argv[3]) except: print >> sys.stderr, "[ERROR] [m] should be in integer!" sys.exit() attribute, trainset = data_provider(trainFileName) testAttribute, testset = data_provider(testFileName) try: assert (testAttribute == attribute) except AssertionError: print >> sys.stderr, "[ERROR] pls check the attributes of test data." sys.exit() # train root = TreeNode(trainset, attribute) curTree = DecisionTree(root) curTree.createTree(root, m) curTree.printTree(root, 0) # test print '<Predictions for the Test Set Instances>' index = 1 for instance in testset: print '{}: Actual: {} Predicted: {}'.format( index, instance[-1], curTree.predict(root, instance)) index += 1
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

from tree import DecisionTree

# load data
X = np.loadtxt('../feature/5grams_count_mc_features')
y = np.loadtxt('../data/tag_mc')

# Global min/max scaling (over the whole matrix, not per feature).
X -= X.min()
X /= X.max()

X_train, X_test, y_train, y_test = train_test_split(X, y)

tree = DecisionTree()
tree.train(X_train, y_train)

expected = y_test
predicted = tree.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
from tree import DecisionTree
from iris_dataset import vectors, labels

# Hold out the last 20% of the dataset for evaluation.
split = int(len(vectors) * 0.8)
training_vectors = vectors[:split]
training_labels = labels[:split]
test_vectors = vectors[split:]
test_labels = labels[split:]

# Fit a single-trial tree that grows to pure leaves, then inspect it.
tree = DecisionTree(leaf_size=1, n_trials=1)
tree.fit(training_vectors, training_labels)
results = tree.predict(test_vectors)
tree.show()

print("results:{}".format(results))
print("answers:{}".format(test_labels))
def main():
    # Evaluate a depth-3 decision tree on crx.data.txt with 3-fold
    # cross-validation, printing per-fold and average accuracy,
    # precision, recall and F1 (Python 2 code: print statements, xrange).
    X, y = read_data('crx.data.txt')
    n_samples = X.shape[0]
    n_folds = 3
    # NOTE(review): computed but never used below (Python 2 integer division).
    n_samples_per_fold = n_samples / n_folds
    # Running totals across folds for the final averages.
    cum_accuracy = 0.0
    cum_p = 0.0
    cum_r = 0.0
    fold = 0
    """
    clf = DecisionTree(maxdepth=3)
    clf.fit(X, y)
    clf.print_tree()
    y_pred = clf.predict(X)
    print y.astype(np.int32)
    return
    """
    for train_idx, test_idx in kfold(n_samples, n_folds):
        print "Fold", fold
        fold += 1
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        clf = DecisionTree(maxdepth=3)
        clf.fit(X_train, y_train)
        #clf.print_tree()
        y_pred = clf.predict(X_test)
        # TP, FP, TN and FN (labels assumed binary 0/1 from the comparisons below)
        tp = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 1 and y_test[i] == 1])
        tn = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 0 and y_test[i] == 0])
        fp = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 1 and y_test[i] == 0])
        fn = sum([1 for i in xrange(len(y_pred)) if y_pred[i] == 0 and y_test[i] == 1])
        # accuracy for this fold
        acc = float(tp + tn)/(tp + tn + fp + fn)
        cum_accuracy += acc
        print "\tAccuracy:", acc
        # precision, recall
        try:
            p = float(tp) / (tp + fp)
            r = float(tp) / (tp + fn)
            cum_p += p
            cum_r += r
            f1 = 2 * p * r / (p + r)
            print "\tPrecision:", p
            print "\tRecall:", r
            print "\tF1:", f1
        except:
            # divide by zero
            pass
    print
    print "Average accuracy:", cum_accuracy/n_folds
    print "Average precision:", cum_p/n_folds
    print "Average recall:", cum_r/n_folds


# NOTE(review): this opens a triple-quoted string that continues past this
# chunk -- presumably commenting out code below; confirm in the full file.
"""
from pprint import pprint

# Load the dataset (comma-separated, no header row).
data = pd.read_table('Font_dataset.txt', header=None, sep=',')
# Feature columns are 0-3; column 4 holds the label.
X = data.drop(4, axis=1)
y = data[4]

from tree import DecisionTree
clf = DecisionTree()

print(u"*****在自己的决策树上进行10折交叉验证*****")
test_accuracy = []
L = X.shape[0]
kf = KFold(L, n_folds=10, random_state=2018)
count = 0
for train_index, test_index in kf:
    count += 1
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    # Train on the training fold only.  The original called
    # clf.fit(X.values, y.values) -- fitting on the FULL dataset leaks the
    # held-out fold into training and inflates every test score.
    clf.fit(X_train, y_train)
    # Evaluate on the held-out fold.
    test_pre = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_pre)
    test_accuracy.append(test_acc)
    print('%d test accuracy_score :%.4f' % (count, test_acc))

print('mean test accuracy_score :%.4f' % np.mean(test_accuracy))