Example #1
def etrims_tree(n_hidden=[1000], coef=[1000.], size=6):
    # note: n_hidden and coef are accepted but unused in this example
    print_time('tree2etrims test size is %d' % size)
    print_time('load_etrims')
    train_data, train_signal, test_data, test_signal = load_etrims(size=size)

    num_function = 100
    print_time('train_DecisionTree num function is %d' % num_function)
    dt = DecisionTree(num_function=num_function)
    dt.fit(train_data, train_signal)

    print_time('test_DecisionTree')
    score = dt.score(test_data, test_signal)
    print_time('score is %f' % score)

    print_time('DecisionTree info')
    dt.info()


    # ELM hidden layer sized to twice the (2*size+1) x (2*size+1) patch area
    elm_hidden = [(2*size+1)*(2*size+1)*2]
    print_time('train_ExtremeDecisionTree elm_hidden is %d, num function is %d' % (elm_hidden[0], num_function))
    edt = ExtremeDecisionTree(elm_hidden=elm_hidden, elm_coef=None, num_function=num_function)
    edt.fit(train_data, train_signal)

    print_time('test_ExtremeDecisionTree')
    score = edt.score(test_data, test_signal)
    print_time('score is %f' % score)

    print_time('ExtremeDecisionTree info')
    edt.info()

    print_time('tree2etrims test is finished!')
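print_time is used throughout but not defined in the snippet; a minimal sketch of what it could be, assuming it simply prefixes each message with a wall-clock timestamp:

from datetime import datetime

def print_time(message):
    # hypothetical helper: timestamped logging for each experiment step
    print('[%s] %s' % (datetime.now().strftime('%H:%M:%S'), message))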
Example #2
def mnist_mlelm(n_hidden=[1000]):
    print "hidden:", n_hidden

    # initialize
    train_set, valid_set, test_set = load_mnist()
    train_data, train_target = train_set
    valid_data, valid_target = valid_set
    test_data, test_target = test_set
    
    # size
    train_size = 500 # max 50000
    valid_size = 10 # max 10000
    test_size = 10 # max 10000

    train_data, train_target = train_data[:train_size], train_target[:train_size]
    valid_data, valid_target = valid_data[:valid_size], valid_target[:valid_size]
    test_data, test_target = test_data[:test_size], test_target[:test_size]

    # optionally fold the validation split into the training set
    # (if these are NumPy arrays, + would add elementwise; concatenate
    # is the intended merge)
    """
    train_data   = np.concatenate([train_data, valid_data])
    train_target = np.concatenate([train_target, valid_target])
    """

    # models: a plain decision tree and two extreme decision trees
    dt = DecisionTree()
    edt1 = ExtremeDecisionTree(elm_hidden=n_hidden)
    edt2 = ExtremeDecisionTree(elm_hidden=n_hidden, elm_coef=[1000., 100., 1000.])

    # fit
    dt.fit(train_data, train_target)
    edt1.fit(train_data, train_target)
    edt2.fit(train_data, train_target)

    # test
    print "test score is ",
    score_dt = dt.score(test_data, test_target)
    score_edt1 = edt1.score(test_data, test_target)
    score_edt2 = edt2.score(test_data, test_target)
    print score_dt, score_edt1, score_edt2

    print "dt"
    dt.info()
    print "edt1"
    edt1.info()
    print "edt2"
    edt2.info()
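load_mnist is not shown; a plausible sketch, assuming the standard mnist.pkl.gz pickle of three (data, target) pairs (and Python 2, matching the print statements above):

import gzip
import cPickle

def load_mnist(path='mnist.pkl.gz'):
    # hypothetical loader: returns (train_set, valid_set, test_set)
    with gzip.open(path, 'rb') as f:
        return cPickle.load(f)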
Example #3
def calc_misclassification_rate(training_dataframe, validation_dataframe,
                                criterion):
    # categorical_columns is assumed to be defined in the surrounding script
    err = 0
    x = training_dataframe[categorical_columns]
    y = training_dataframe['num']
    dt = DecisionTree(criterion)
    dt.fit(x, y)
    dt.prune(
        validation_dataframe.loc[:, validation_dataframe.columns != "num"],
        validation_dataframe.loc[:, "num"])
    for i in validation_dataframe.index:
        if (dt.root.evaluate(validation_dataframe.loc[
                i, validation_dataframe.columns != "num"]) !=
                validation_dataframe.loc[i, "num"]):
            err += 1
    err = err / len(validation_dataframe)
    print((err, dt))
    return (err, dt)


# driver code; `splits` is assumed to be a list of
# (training_dataframe, validation_dataframe) pairs (see the sketch below)
gini_trees = [calc_misclassification_rate(tr, va, criterion="gini")
              for tr, va in splits]
gtree = min(gini_trees, key=lambda x: x[0])[1]  # lowest error rate wins
print("best gini tree = {}".format(gtree))
Gg = Digraph("", filename="tree_gini.pdf")
gtree.plot(Gg)
Gg.view()

entropy_trees = [calc_misclassification_rate(tr, va, criterion="entropy")
                 for tr, va in splits]
etree = min(entropy_trees, key=lambda x: x[0])[1]
print("best entropy tree = {}".format(etree))
Ge = Digraph("", filename="tree_entropy.pdf")
etree.plot(Ge)
Ge.view()

# reference trees from scikit-learn; categorical_features and df are
# assumed to be defined in the surrounding script
fig, ax = plt.subplots(nrows=1, ncols=1)
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(categorical_features, df.num)
tree.plot_tree(clf, ax=ax)
plt.savefig("sklearn_entropy")
plt.show()

fig, ax = plt.subplots(nrows=1, ncols=1)
clf = tree.DecisionTreeClassifier(criterion="gini")
clf = clf.fit(categorical_features, df.num)
tree.plot_tree(clf, ax=ax)
plt.savefig("sklearn_gini")
plt.show()
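A hypothetical way to build the `splits` pairs used above, via k-fold indices over the full dataframe df:

from sklearn.model_selection import KFold

splits = [(df.iloc[train_idx], df.iloc[valid_idx])
          for train_idx, valid_idx in
          KFold(n_splits=5, shuffle=True, random_state=0).split(df)]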
Example #4
class DecisionTreeC45TestCase(unittest.TestCase):
    """

    """
    def setUp(self):
        self.decision_tree = DecisionTree("c4.5")

    def tearDown(self):
        self.decision_tree = None

    def test_fit(self):
        # test data
        X = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        y = ["yes", "yes", "no", "no", "no"]
        # X and y are plain list objects
        feat_names = ['no surfacing', 'flippers']
        decision_tree = {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }
        self.decision_tree.fit(X, y, feat_names)
        self.assertEqual(self.decision_tree.tree, decision_tree)

        # X and y are NumPy arrays
        feat_names = ['no surfacing', 'flippers']
        self.decision_tree.fit(np.asarray(X), np.asarray(y), feat_names)
        self.assertEqual(self.decision_tree.tree, decision_tree)

    def test_predict(self):
        # predict does not depend on the split criterion, so it is
        # already covered by DecisionTreeTestCase.test_predict.
        pass
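To run the case standalone (assuming the test module already imports unittest, numpy as np, and DecisionTree), the usual entry point applies:

if __name__ == '__main__':
    unittest.main()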
Example #5
File: dtree.py Project: csammcgrath/CS450
def compare_algorithm():
    skCount = 0
    samCount = 0

    data, targets, headers = get_voting()

    #split dataset into random parts
    train_data, test_data, train_target, test_target = split_data(data, targets)

    #reset the indexes so the dataframe can be properly parsed.
    train_data.reset_index(inplace=True, drop=True)
    test_data.reset_index(inplace=True, drop=True)
    train_target.reset_index(inplace=True, drop=True)
    test_target.reset_index(inplace=True, drop=True)

    #initialize the trees
    samClassifier = DecisionTree()
    skClassifier = tree.DecisionTreeClassifier()

    #build trees
    samModel = samClassifier.fit(train_data, train_target, headers)
    skModel = skClassifier.fit(train_data, train_target)

    #get the predictions
    samPredicted = samModel.predict(test_data)
    skPredicted = skModel.predict(test_data)

    #pull out the target column so the predictions
    #can be compared against it row by row
    test_target = test_target[headers[-1]]

    #count correct predictions for both classifiers
    for index in range(len(test_data)):
        if skPredicted[index] == test_target[index]:
            skCount += 1

        if samPredicted[index] == test_target[index]:
            samCount += 1

    #get the accuracy rating
    samAccuracy = get_accuracy(samCount, len(test_data))
    skAccuracy = get_accuracy(skCount, len(test_data))

    print("Sam's ID3 Accuracy: {:.2f}%. \nSK's ID3 Accuracy: {:.2f}%.".format(samAccuracy, skAccuracy))
Example #6
File: dtree.py Project: csammcgrath/CS450
def execute_algorithm(dataset):
    #this shell is built around the DecisionTree classifier
    classifier = DecisionTree()

    #determine which dataset to retrieve
    if (dataset == 1):
        data, targets, headers = get_loans()
    elif (dataset == 2):
        data, targets, headers = get_voting()
    else:
        raise ValueError("dataset must be 1 (loans) or 2 (voting)")
    count = 0

    #split dataset into random parts
    train_data, test_data, train_target, test_target = split_data(data, targets)

    #reset the indexes so the dataframe can be properly parsed.
    train_data.reset_index(inplace=True, drop=True)
    test_data.reset_index(inplace=True, drop=True)
    train_target.reset_index(inplace=True, drop=True)
    test_target.reset_index(inplace=True, drop=True)

    #build the tree!
    model = classifier.fit(train_data, train_target, headers)

    #prompt the user if he/she wants to display the tree
    print_id3(model)

    #target_predicted is the array of predictions returned by predict
    target_predicted = model.predict(test_data)

    #this allows us to know which column is the target
    test_target = test_target[headers[-1]]

    #loop through the target_predicted and count up the correct predictions
    for index in range(len(target_predicted)):
        #increment counter for every match from
        #target_predicted and test_target
        if target_predicted[index] == test_target[index]:
            count += 1

    accuracy = get_accuracy(count, len(test_data))

    #report to the user
    print("Accuracy: {:.2f}%".format(accuracy))
Example #7
class DecisionTreeTestCase(unittest.TestCase):
    """Unittest for tree.DecsionTree
    """
    def setUp(self):
        self.decision_tree = DecisionTree()

    def tearDown(self):
        self.decision_tree = None

    def test_fit(self):
        # test data
        X = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        y = ["yes", "yes", "no", "no", "no"]
        # X and y are plain list objects
        feat_names = ['no surfacing', 'flippers']
        decision_tree = {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }
        self.decision_tree.fit(X, y, feat_names)
        self.assertEqual(self.decision_tree.tree, decision_tree)

        # X and y are NumPy arrays
        feat_names = ['no surfacing', 'flippers']
        self.decision_tree.fit(np.asarray(X), np.asarray(y), feat_names)
        self.assertEqual(self.decision_tree.tree, decision_tree)

    def test_predict(self):
        # test 1: training data
        item = [1, 0]
        feat_names = ['no surfacing', 'flippers']
        result = 'no'
        decision_tree = {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }
        self.decision_tree.tree = decision_tree
        self.assertEqual(result, self.decision_tree.predict(item, feat_names))

        # test 2: the same tree queried with feat_names in a different
        # order; predict must match features by name, not by position
        dataset = [[0, 1], [0, 0]]
        feat_names = ['flippers', 'no surfacing']
        result = ["no", "no"]
        decision_tree = {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }
        self.decision_tree.tree = decision_tree
        self.assertEqual(result,
                         self.decision_tree.predict(dataset, feat_names))
Example #8
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from tree import DecisionTree
import pandas as pd
import numpy as np

if __name__ == '__main__':
    train_df = pd.read_csv('/app/data/train.csv')
    le_sex = LabelEncoder()
    le_sex.fit(train_df['Sex'])
    train_df.loc[:, 'SexInt'] = le_sex.transform(train_df['Sex'])

    X = np.array(train_df[['SexInt']])
    y = train_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=71)
    tree = DecisionTree(max_depth=3)
    tree.fit(X_train, y_train)

    print(classification_report(y_train, tree.predict(X_train)))
    print(classification_report(y_test, tree.predict(X_test)))

    # tree.make_graph()

    s_tree = DecisionTreeClassifier(max_depth=3)
    s_tree.fit(X_train, y_train)
    print(classification_report(y_train, s_tree.predict(X_train)))
    print(classification_report(y_test, s_tree.predict(X_test)))
    s_tree.predict_proba(X_test)  # class probabilities from sklearn (unused here)
Example #9
from tree import DecisionTree
from iris_dataset import vectors, labels

# hold out the last 20% as a test set
# (if vectors are ordered by class, shuffle before splitting)
N = int(len(vectors)*0.8)
training_vectors = vectors[:N]
training_labels = labels[:N]
test_vectors = vectors[N:]
test_labels = labels[N:]

tree = DecisionTree(leaf_size=1, n_trials=1)
tree.fit(training_vectors, training_labels)
results = tree.predict(test_vectors)
tree.show()

print("results:{}".format(results))
print("answers:{}".format(test_labels))
Example #10
def main():
    X, y = read_data('crx.data.txt')
    n_samples = X.shape[0]
    n_folds = 3
    n_samples_per_fold = n_samples / n_folds

    cum_accuracy =  0.0
    cum_p = 0.0
    cum_r = 0.0
    fold = 0

    """
    clf = DecisionTree(maxdepth=3)
    clf.fit(X, y)
    clf.print_tree()
    y_pred = clf.predict(X)
    print y.astype(np.int32)
    return
    """

    for train_idx, test_idx in kfold(n_samples, n_folds):
        print "Fold", fold
        fold += 1

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
  
        clf = DecisionTree(maxdepth=3)
        clf.fit(X_train, y_train)
        #clf.print_tree()

        y_pred = clf.predict(X_test)

        # TP, FP, TN and FN
        tp = sum([1 for i in xrange(len(y_pred))
                  if y_pred[i] == 1 and y_test[i] == 1])
        tn = sum([1 for i in xrange(len(y_pred))
                  if y_pred[i] == 0 and y_test[i] == 0])
        fp = sum([1 for i in xrange(len(y_pred))
                  if y_pred[i] == 1 and y_test[i] == 0])
        fn = sum([1 for i in xrange(len(y_pred))
                  if y_pred[i] == 0 and y_test[i] == 1])

        # accuracy for this fold
        acc = float(tp + tn)/(tp + tn + fp + fn)
        cum_accuracy += acc
        print "\tAccuracy:", acc

        # precision, recall, F1 (skipped when the fold has no positive
        # predictions or no positive examples)
        try:
            p = float(tp) / (tp + fp)
            r = float(tp) / (tp + fn)
            cum_p += p
            cum_r += r
            f1 = 2 * p * r / (p + r)
            print "\tPrecision:", p
            print "\tRecall:", r
            print "\tF1:", f1
        except ZeroDivisionError:
            pass

    print
    print "Average accuracy:", cum_accuracy/n_folds
    # note: folds skipped above still count in these denominators
    print "Average precision:", cum_p/n_folds
    print "Average recall:", cum_r/n_folds

    """
Example #11
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold  # old-style API (sklearn < 0.18)
from sklearn.metrics import accuracy_score

# load the data
data = pd.read_table('Font_dataset.txt', header=None, sep=',')

# feature columns and label column
X = data.drop(4, axis=1)
y = data[4]

from tree import DecisionTree
clf = DecisionTree()

print(u"*****在自己的决策树上进行10折交叉验证*****")
test_accuracy = []
L = X.shape[0]
kf = KFold(L, n_folds=10, random_state=2018)
count = 0
for train_index, test_index in kf:
    count += 1
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    # train on the training fold only (fitting on X.values here would
    # leak the test fold into training)
    clf.fit(X_train, y_train)
    # evaluate on the held-out fold
    test_pre = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_pre)
    test_accuracy.append(test_acc)
    print('%d test accuracy_score :%.4f' % (count, test_acc))

print('mean test accuracy_score :%.4f' % np.mean(test_accuracy))
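On newer scikit-learn releases, where sklearn.cross_validation no longer exists, the equivalent loop would be (a sketch, assuming scikit-learn >= 0.18):

from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True, random_state=2018)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]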