예제 #1
0
def part2(m=4):
    """Learning-curve experiment for the decision tree.

    Randomly chooses 5%, 10%, 20%, 50% and 100% samples of the training
    set (several sampled sets per rate, via ``selectSample``), trains a
    tree on each, and plots average/maximum/minimum test-set accuracy
    against the sampling percentage for the diabetes and heart datasets,
    saving the figure to ../part2.pdf.

    Args:
        m: stopping threshold forwarded to ``DecisionTree.createTree``
           (default 4, the value previously hard-coded).
    """
    plt.figure()
    subplot_index = {'diabetes': 1, 'heart': 2}
    for train_file, test_file, key in [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart')
    ]:
        attribute, trainset = data_provider(train_file)
        _, testset = data_provider(test_file)  # test attributes unused here
        avg_points, max_points, min_points = [], [], []
        for rate in (0.05, 0.1, 0.2, 0.5, 1):
            accuracies = []
            for sampled_trainset in selectSample(trainset, rate):
                root = TreeNode(sampled_trainset, attribute)
                cur_tree = DecisionTree(root)
                cur_tree.createTree(root, m)
                # Accuracy = fraction of test instances whose predicted
                # label matches the true label (last column).
                correct = sum(1 for instance in testset
                              if cur_tree.predict(root, instance)
                              == instance[-1])
                accuracies.append(float(correct) / len(testset))
            percent = int(rate * 100)
            avg_points.append([percent,
                               float(sum(accuracies)) / len(accuracies)])
            max_points.append([percent, max(accuracies)])
            min_points.append([percent, min(accuracies)])

        ax = plt.subplot(1, 2, subplot_index[key])
        ax.set_xlim(0, 105)
        ax.set_ylim(0.45, 0.9)
        ax.set_ylabel('accuracy')
        ax.set_title(key)
        # Keep the average/maximum/minimum order so line colors match
        # the original figure.
        for points, label in ((avg_points, 'average'),
                              (max_points, 'maximum'),
                              (min_points, 'minimum')):
            ax.plot([p[0] for p in points], [p[1] for p in points],
                    label=label)
        ax.legend()
    plt.xlabel('dataset sample percentage')
    plt.savefig('../part2.pdf')
예제 #2
0
def part3():
    """Accuracy-vs-stopping-threshold experiment.

    For each dataset, trains on the full training set with
    m in (2, 5, 10, 20), then plots each (m, test accuracy) point,
    annotated with the accuracy value and the m value, saving the
    figure to ../part3.pdf.

    Fix: the original's initial ``points = {}`` was dead code
    (immediately shadowed by ``points = []``) and has been removed.
    """
    plt.figure()
    subplot_index = {'diabetes': 1, 'heart': 2}
    for trainFileName, testFileName, key in [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart')
    ]:
        attribute, trainset = data_provider(trainFileName)
        _, testset = data_provider(testFileName)  # test attributes unused
        root = TreeNode(trainset, attribute)
        curTree = DecisionTree(root)

        points = []
        for m in (2, 5, 10, 20):
            # NOTE(review): createTree is re-invoked on the same root for
            # each m — assumes it rebuilds rather than accumulates; confirm.
            curTree.createTree(root, m)
            correct = sum(1 for instance in testset
                          if curTree.predict(root, instance) == instance[-1])
            points.append([m, float(correct) / len(testset)])

        for x, y in points:
            # plt.subplot with identical args reuses the same axes, so the
            # per-point call mirrors the original behavior.
            ax = plt.subplot(2, 1, subplot_index[key])
            ax.set_xlim(0, 22)
            ax.set_ylim(0.6, 0.8)
            ax.set_ylabel('accuracy')
            ax.set_title(key)
            plt.annotate('%.3f' % y, xy=(x - 0.02, y + 0.02))
            plt.annotate('m=%d' % x, xy=(x - 0.02, y - 0.07))
            ax.plot(x, y, 'o-')

    plt.xlabel('tree number m')
    plt.savefig('../part3.pdf')
예제 #3
0
class DecisionTreeTestCase(unittest.TestCase):
    """Unittest for tree.DecisionTree."""

    def setUp(self):
        # Fresh tree instance for every test.
        self.decision_tree = DecisionTree()

    def tearDown(self):
        # Drop the reference so tests stay independent.
        self.decision_tree = None

    @staticmethod
    def _expected_tree():
        """Tree learned from the toy 'no surfacing'/'flippers' dataset."""
        return {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }

    def test_fit(self):
        # Toy training data.
        X = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        y = ["yes", "yes", "no", "no", "no"]
        expected = self._expected_tree()

        # Case 1: X and y are plain list objects.
        self.decision_tree.fit(X, y, ['no surfacing', 'flippers'])
        self.assertEqual(self.decision_tree.tree, expected)

        # Case 2: X and y are numpy arrays (fresh feat_names each call).
        self.decision_tree.fit(np.asarray(X), np.asarray(y),
                               ['no surfacing', 'flippers'])
        self.assertEqual(self.decision_tree.tree, expected)

    def test_predict(self):
        # Test 1: a single item with the canonical feature order.
        self.decision_tree.tree = self._expected_tree()
        self.assertEqual(
            'no',
            self.decision_tree.predict([1, 0], ['no surfacing', 'flippers']))

        # Test 2: a batch of items with the feature columns swapped.
        self.decision_tree.tree = self._expected_tree()
        self.assertEqual(
            ["no", "no"],
            self.decision_tree.predict([[0, 1], [0, 0]],
                                       ['flippers', 'no surfacing']))
예제 #4
0
# Data normalization: min-max scale features to [0, 1].
X -= X.min()
X /= X.max()


# Instantiation
tree = DecisionTree()


# Training
tree.train(X_train, y_train)


# Test dataset
X = np.loadtxt('test_data')
y = np.loadtxt('test_labels')
X, y = shuffle(X, y)


# Data normalization (same scaling as applied to the training features)
X -= X.min()
X /= X.max()


# Predictions on the freshly loaded, normalized test set.
# BUG FIX: the original predicted on `X_test`, ignoring the normalized
# test data loaded just above.
predictions = tree.predict(X)


# Report
# BUG FIX: the original scored undefined names (tags, preds); use the
# test labels and predictions computed above. Parenthesized print works
# under both Python 2 and 3.
print(classification_report(y, predictions))
print('Accuracy: ' + str(accuracy_score(y, predictions)))
# Third-party imports, then the local decision-tree module.
# FIX: train_test_split, classification_report and DecisionTreeClassifier
# were used below but never imported.
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

from tree import DecisionTree

if __name__ == '__main__':
    # Encode the 'Sex' column into integers as the single feature.
    train_df = pd.read_csv('/app/data/train.csv')
    le_sex = LabelEncoder()
    le_sex.fit(train_df['Sex'])
    train_df.loc[:, 'SexInt'] = le_sex.transform(train_df['Sex'])

    X = np.array(train_df[['SexInt']])
    y = train_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=71)

    # Our implementation.
    tree = DecisionTree(max_depth=3)
    tree.fit(X_train, y_train)
    print(classification_report(y_train, tree.predict(X_train)))
    print(classification_report(y_test, tree.predict(X_test)))

    # tree.make_graph()

    # Reference: scikit-learn's tree with the same depth limit.
    s_tree = DecisionTreeClassifier(max_depth=3)
    s_tree.fit(X_train, y_train)
    print(classification_report(y_train, s_tree.predict(X_train)))
    print(classification_report(y_test, s_tree.predict(X_test)))
    # NOTE(review): result discarded in the original; kept for parity.
    s_tree.predict_proba(X_test)
예제 #6
0
        sys.exit()
    # Command-line arguments: <train arff> <test arff> <m>.
    trainFileName = sys.argv[1]
    testFileName = sys.argv[2]
    try:
        m = int(sys.argv[3])
    except:
        # Python 2 print-to-stderr; bare except preserved from original.
        print >> sys.stderr, "[ERROR] [m] should be in integer!"
        sys.exit()

    attribute, trainset = data_provider(trainFileName)
    testAttribute, testset = data_provider(testFileName)
    try:
        # Train and test attribute schemas must match exactly.
        assert (testAttribute == attribute)
    except AssertionError:
        print >> sys.stderr, "[ERROR] pls check the attributes of test data."
        sys.exit()

    # train: build the tree with stopping threshold m and dump it.
    root = TreeNode(trainset, attribute)
    curTree = DecisionTree(root)
    curTree.createTree(root, m)
    curTree.printTree(root, 0)

    # test: print actual vs. predicted label for each test instance;
    # instance[-1] is the ground-truth label (last ARFF column).
    print '<Predictions for the Test Set Instances>'
    index = 1
    for instance in testset:
        print '{}: Actual: {} Predicted: {}'.format(
            index, instance[-1], curTree.predict(root, instance))
        index += 1
예제 #7
0
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import numpy as np
from tree import DecisionTree

# Load the 5-gram count features and their tags, then min-max scale
# the features to [0, 1].
X = np.loadtxt('../feature/5grams_count_mc_features')
y = np.loadtxt('../data/tag_mc')
X -= X.min()
X /= X.max()

# Hold out a random test split and fit the tree on the rest.
X_train, X_test, y_train, y_test = train_test_split(X, y)
tree = DecisionTree()
tree.train(X_train, y_train)

# Evaluate the fit of the model on the held-out split.
expected = y_test
predicted = tree.predict(X_test)
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
예제 #8
0
from tree import DecisionTree
from iris_dataset import vectors, labels

# Sequential 80/20 train/test split of the iris data.
split = int(len(vectors) * 0.8)
training_vectors, test_vectors = vectors[:split], vectors[split:]
training_labels, test_labels = labels[:split], labels[split:]

# Fit a single-trial tree down to leaves of size 1, predict, and dump it.
tree = DecisionTree(leaf_size=1, n_trials=1)
tree.fit(training_vectors, training_labels)
results = tree.predict(test_vectors)
tree.show()

# Compare predictions against the held-out labels by eye.
print("results:{}".format(results))
print("answers:{}".format(test_labels))
예제 #9
0
def main():
    """3-fold cross-validated evaluation of DecisionTree on the crx data.

    Prints per-fold accuracy/precision/recall/F1 and the fold averages.
    (Python 2 code: print statements, xrange, integer division.)
    """
    X, y = read_data('crx.data.txt')
    n_samples = X.shape[0]
    n_folds = 3
    # NOTE(review): unused below; Python 2 integer division here anyway.
    n_samples_per_fold = n_samples / n_folds

    # Running sums across folds for the final averages.
    cum_accuracy =  0.0
    cum_p = 0.0
    cum_r = 0.0
    fold = 0

    """
    clf = DecisionTree(maxdepth=3)
    clf.fit(X, y)
    clf.print_tree()
    y_pred = clf.predict(X)
    print y.astype(np.int32)
    return
    """

    for train_idx, test_idx in kfold(n_samples, n_folds):
        print "Fold", fold
        fold += 1

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = DecisionTree(maxdepth=3)
        clf.fit(X_train, y_train)
        #clf.print_tree()

        y_pred = clf.predict(X_test)

        # TP, FP, TN and FN
        # (assumes labels are encoded as 0/1 — TODO confirm in read_data)
        tp = sum([1 for i in xrange(len(y_pred))
                  if y_pred[i] == 1 and y_test[i] == 1])
        tn = sum([1 for i in xrange(len(y_pred))
                  if y_pred[i] == 0 and y_test[i] == 0])
        fp = sum([1 for i in xrange(len(y_pred))
                  if y_pred[i] == 1 and y_test[i] == 0])
        fn = sum([1 for i in xrange(len(y_pred))
                  if y_pred[i] == 0 and y_test[i] == 1])

        # accuracy for this fold
        acc = float(tp + tn)/(tp + tn + fp + fn)
        cum_accuracy += acc
        print "\tAccuracy:", acc

        # precision, recall
        try:
            p = float(tp) / (tp + fp)
            r = float(tp) / (tp + fn)
            cum_p += p
            cum_r += r
            f1 = 2 * p * r / (p + r) 
            print "\tPrecision:", p 
            print "\tRecall:", r
            print "\tF1:", f1
        except:
            # divide by zero
            pass

    print
    print "Average accuracy:", cum_accuracy/n_folds
    print "Average precision:", cum_p/n_folds
    print "Average recall:", cum_r/n_folds

    # NOTE(review): the triple-quote below opens a string that continues
    # past this excerpt; left untouched.
    """
예제 #10
0
from pprint import pprint

# Load the font dataset: comma-separated, no header row.
data = pd.read_table('Font_dataset.txt', header=None, sep=',')

# Feature columns and the label column (column 4 is the label).
X = data.drop(4, axis=1)
y = data[4]

from tree import DecisionTree
clf = DecisionTree()

# (Printed banner kept verbatim: "10-fold CV on our own decision tree".)
print(u"*****在自己的决策树上进行10折交叉验证*****")
test_accuracy = []
L = X.shape[0]
kf = KFold(L, n_folds=10, random_state=2018)
count = 0
for train_index, test_index in kf:
    count += 1
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    # Train on the training fold only.
    # BUG FIX: the original called clf.fit(X.values, y.values), training
    # on the FULL dataset and leaking the test fold into training.
    clf.fit(X_train, y_train)
    # Evaluate on the held-out fold.
    test_pre = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_pre)
    test_accuracy.append(test_acc)
    print('%d test accuracy_score :%.4f' % (count, test_acc))

print('mean test accuracy_score :%.4f' % np.mean(test_accuracy))