Example #1
def part2():
    """randomly choose 5%, 10%, 20%, 50%, 100% samples to train, and choose 10 sets each time"""
    plt.figure()
    for trainFileName, testFileName, key in [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart')
    ]:
        attribute, trainset = data_provider(trainFileName)
        testAttribute, testset = data_provider(testFileName)
        m = 4
        avgPoints = []
        maxPoints = []
        minPoints = []
        for rate in (0.05, 0.1, 0.2, 0.5, 1):
            accuracies = []
            for newTrainset in selectSample(trainset, rate):
                root = TreeNode(newTrainset, attribute)
                curTree = DecisionTree(root)
                curTree.createTree(root, m)
                trueSamples = 0
                falseSamples = 0
                for instance in testset:
                    if curTree.predict(root, instance) == instance[-1]:
                        trueSamples += 1
                    else:
                        falseSamples += 1
                accuracies.append(
                    float(trueSamples) / (trueSamples + falseSamples))
            accuracy = float(sum(accuracies)) / len(accuracies)
            avgPoints.append([int(rate * 100), accuracy])
            maxPoints.append([int(rate * 100), max(accuracies)])
            minPoints.append([int(rate * 100), min(accuracies)])

        mapping = {'diabetes': 1, 'heart': 2}
        ax = plt.subplot(1, 2, mapping[key])
        ax.set_xlim(0, 105)
        ax.set_ylim(0.45, 0.9)
        ax.set_ylabel('accuracy')
        ax.set_title(key)
        ax.plot([x[0] for x in avgPoints], [x[1] for x in avgPoints],
                label='average')
        ax.plot([x[0] for x in maxPoints], [x[1] for x in maxPoints],
                label='maximum')
        ax.plot([x[0] for x in minPoints], [x[1] for x in minPoints],
                label='minimum')
        ax.legend()
    plt.xlabel('dataset sample percentage')
    plt.savefig('../part2.pdf')
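The selectSample helper is not shown in this example. A minimal sketch of what it might look like, based on the docstring above (10 random subsets at each rate; the name and signature are assumptions from the call site):

import random

def selectSample(trainset, rate, n_sets=10):
    """Yield n_sets random subsets of trainset, each with rate * len(trainset) instances."""
    k = max(1, int(len(trainset) * rate))
    for _ in range(n_sets):
        yield random.sample(trainset, k)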
Example #2

    def __init__(self, n_trees, tree_config=None):

        # By default we want some randomness in the trees
        default_tree_config = dict(cut_dim="random_best")
        tree_config = {**default_tree_config, **(tree_config or {})}

        self.trees = [DecisionTree(**tree_config) for _ in range(n_trees)]
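Such a constructor is usually paired with fit/predict methods. A minimal sketch, assuming each DecisionTree exposes fit(X, y) and predict(X) (the majority-vote logic here is illustrative, not from the source):

    def fit(self, X, y):
        # Train every tree on the full data; the per-tree randomness comes
        # from cut_dim="random_best" in the tree configuration above.
        for tree in self.trees:
            tree.fit(X, y)

    def predict(self, X):
        # Majority vote across the per-tree predictions for each sample.
        from collections import Counter
        all_preds = [tree.predict(X) for tree in self.trees]
        return [Counter(votes).most_common(1)[0][0] for votes in zip(*all_preds)]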
Example #3
def part1(t_data, v_data):
    tree = DecisionTree()
    tree_list = [tree]
    ts = time.time()
    create_dt_classifier(t_data, 9, tree, 0)
    print("Completed ", " Time : ", (time.time() - ts))

    train_accuracy_list = []
    val_accuracy_list = []
    iterations = []

    for i in range(0, 10):

        train_accuracy = check_accuracy_with_trees(t_data, tree_list, i)
        train_accuracy_list.append(train_accuracy)
        val_accuracy = check_accuracy_with_trees(v_data, tree_list, i)
        val_accuracy_list.append(val_accuracy)
        iterations.append(i)

    accuracy = [train_accuracy_list, val_accuracy_list]
    iters = [iterations, iterations]

    print("Completed ", " Time : ", (time.time() - ts))

    print(accuracy)
    legends = ["Training", "Validation"]
    labels = ["Accuracy in %", "Depth"]
    plot(iters, accuracy, "Accuracy Vs Depth", legends, labels)
Example #4
def create_tree(_dataset):
    '''
    :param _dataset: dataset that will be assigned to the root of the tree
    :return: a DecisionTree object (not yet trained)
    '''
    return DecisionTree(_dataset)
Example #5
    def __init__(self, x, y, depth=2):
        """
        In the constructor we instantiate nn.Linear modules and assign them as
        member variables.
        """

        H = 100
        D_out = 1
        super(treeNet, self).__init__()
        self.dTree = DecisionTree(x, y, idxs=range(len(y)), depth=depth)
        self.linear1 = torch.nn.Linear(self.dTree.D_in, H).cuda()
        self.theta = torch.nn.ModuleList([
            torch.nn.Linear(self.dTree.D_in, D_out)
            for i in range(self.dTree.nNodes)
        ]).cuda()
        self.sigmoid = torch.nn.Sigmoid().cuda()
Example #6
def create_dt_classifier(data, depth, tree):
    # print(data.shape, " Depth : ", depth)
    if depth == 0:
        prediction = ada_get_leaf_prediction_value(data)
        tree.insert(None, None, prediction, True)
        return
    u_root = ada_gini_function(data[:, 0:2])
    # print(u_root)
    gain = 0
    feature_index = 0
    threshold = 0

    for i in range(2, data.shape[1]):
        feature_index_current, gain_current, threshold_current = get_feature_gain(
            data, i, u_root)
        # print("gain_current : ", gain_current, " threshold_current : ", threshold, " feature_index_current", feature_index_current)
        if gain_current > gain:
            gain = gain_current
            feature_index = feature_index_current
            threshold = threshold_current
        # break
        # print("gain : ", gain, " threshold : ", threshold, " feature_index", feature_index)

    if gain == 0:
        prediction = ada_get_leaf_prediction_value(data)
        tree.insert(None, None, prediction, True)
        return

    depth = depth - 1

    sorted_vals = data[np.argsort(data[:, feature_index])[::1]]

    val = np.split(sorted_vals,
                   np.where(sorted_vals[:, feature_index] >= threshold)[0][:1])
    # true_space = data[data[:, feature_index] >= threshold]
    true_space = val[1]
    false_space = val[0]
    # false_space = data[data[:, feature_index] < threshold]
    prediction = ada_get_leaf_prediction_value(data)
    tree.insert(threshold, feature_index, prediction, False)
    tree.left = DecisionTree()
    tree.right = DecisionTree()
    create_dt_classifier(true_space, depth, tree.left)
    create_dt_classifier(false_space, depth, tree.right)
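The ada_gini_function helper used above is not shown. Since the AdaBoost driver in Example #20 inserts per-sample weights at column 1 next to the labels in column 0, a plausible sketch is a weight-aware Gini impurity (the name and column layout are assumptions inferred from those call sites):

import numpy as np

def ada_gini_function(labels_and_weights):
    """Weighted Gini impurity; column 0 holds labels, column 1 holds sample weights."""
    labels = labels_and_weights[:, 0]
    weights = labels_and_weights[:, 1]
    total = weights.sum()
    if total == 0:
        return 0.0
    impurity = 1.0
    for label in np.unique(labels):
        p = weights[labels == label].sum() / total
        impurity -= p ** 2
    return impurity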
Example #7
def compare_algorithm():
    skCount = 0
    samCount = 0

    data, targets, headers = get_voting()

    #split dataset into random parts
    train_data, test_data, train_target, test_target = split_data(data, targets)

    #reset the indexes so the dataframe can be properly parsed.
    train_data.reset_index(inplace=True, drop=True)
    test_data.reset_index(inplace=True, drop=True)
    train_target.reset_index(inplace=True, drop=True)
    test_target.reset_index(inplace=True, drop=True)

    #get the trees initialized
    samClassifier = DecisionTree()
    skClassifier = tree.DecisionTreeClassifier()

    #build trees
    samModel = samClassifier.fit(train_data, train_target, headers)
    skModel = skClassifier.fit(train_data, train_target)

    #get the predictions
    samPredicted = samModel.predict(test_data)
    skPredicted = skModel.predict(test_data)

    #this is important because it is how we measure the accuracy
    test_target = test_target[headers[-1]]

    #loop through the program and measure the accuracy
    for index in range(len(test_data)):
        if skPredicted[index] == test_target[index]:
            skCount += 1

        if samPredicted[index] == test_target[index]:
            samCount += 1

    #get the accuracy rating
    samAccuracy = get_accuracy(samCount, len(test_data))
    skAccuracy = get_accuracy(skCount, len(test_data))

    print("Sam's ID3 Accuracy: {:.2f}%. \nSK's ID3 Accuracy: {:.2f}%.".format(samAccuracy, skAccuracy))
Example #8
def create_dt_classifier(data, depth, tree, m):
    # print("Depth : ", depth)
    # print("Data Shape : ", data.shape)
    if depth == 0:
        prediction = get_leaf_prediction_value(data)
        tree.insert(None, None, prediction, True)
        return
    u_root = gini_function(data[:, 0])
    gain = 0
    feature_index = 0
    threshold = 0

    feature_sampled_data, random_indexes = get_sampled_features(data, m)
    for i in range(1, feature_sampled_data.shape[1]):
        feature_index_current, gain_current, threshold_current = get_feature_gain(feature_sampled_data, i, u_root)

        if gain_current > gain:
            gain = gain_current
            feature_index = feature_index_current
            threshold = threshold_current

    if gain == 0:
        prediction = get_leaf_prediction_value(data)
        tree.insert(None, None, prediction, True)
        return

    depth = depth - 1

    if random_indexes is not None:
        feature_index = random_indexes[feature_index]

    sorted_vals = data[np.argsort(data[:, feature_index])[::1]]

    val = np.split(sorted_vals, np.where(sorted_vals[:, feature_index] >= threshold)[0][:1])
    # true_space = data[data[:, feature_index] >= threshold]
    true_space = val[1]
    false_space = val[0]
    # false_space = data[data[:, feature_index] < threshold]
    prediction = get_leaf_prediction_value(data)
    tree.insert(threshold, feature_index, prediction, False)
    tree.left = DecisionTree()
    tree.right = DecisionTree()
    create_dt_classifier(true_space, depth, tree.left, m)
    create_dt_classifier(false_space, depth, tree.right, m)
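The get_sampled_features helper is not shown here. Given that column 0 holds the label, that the split search runs over columns 1..m of the sampled array, and that random_indexes maps a sampled column back to its original position, a minimal sketch might be (the name and return convention are assumptions inferred from the call sites above):

import numpy as np

def get_sampled_features(data, m):
    """Return data restricted to m random feature columns (label kept in column 0),
    plus an index array mapping sampled columns back to original columns."""
    n_features = data.shape[1] - 1
    if m is None or m >= n_features:
        return data, None  # no sub-sampling; the caller skips the index remap
    chosen = np.random.choice(np.arange(1, data.shape[1]), size=m, replace=False)
    random_indexes = np.concatenate(([0], chosen))  # sampled position i -> original column
    return data[:, random_indexes], random_indexes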
Example #9
def main():
    #import the data from a csv
    car_data = np.genfromtxt('car_data.csv', delimiter=',')

    #call the tree creator module and pass the name of the json file to it
    dTree = DecisionTree('jsonTrees/' + testName + '.json')

    scores = []
    #track the best score and data
    best = [0, 0]
    test = []

    for data in car_data:

        #change the inputs for each of the cars in the tree
        dTree.changeInputs(convertCarData(data))
        #get the score for that car
        score = dTree.run()

        #check if it's the highest
        if (score > best[0]):
            best[0] = score
            best[1] = data
        #add it to the list of scores
        scores.append(score)
        test.append([score, data.tolist()])

    #print("best",best[0],best[1])

    test = sorted(test, key=lambda x: x[0], reverse=True)

    pprint(test[0:3])

    #create a normalized histogram of the scores
    n, bins, patches = plt.hist(scores,
                                density=True,
                                facecolor='green',
                                alpha=0.75)
    plt.title(testName + wValue)
    #save the image to a file
    plt.savefig("graphs/" + testName + wValue + ".png", bbox_inches='tight')
    #show the image
    plt.show()
Example #10
def execute_algorithm(dataset):
    #we all know that this whole shell is designed just for the Decision Tree
    classifier = DecisionTree()

    #determine which dataset to retrieve
    if dataset == 1:
        data, targets, headers = get_loans()
    elif dataset == 2:
        data, targets, headers = get_voting()
    else:
        raise ValueError("dataset must be 1 (loans) or 2 (voting)")
    count = 0

    #split dataset into random parts
    train_data, test_data, train_target, test_target = split_data(data, targets)

    #reset the indexes so the dataframe can be properly parsed.
    train_data.reset_index(inplace=True, drop=True)
    test_data.reset_index(inplace=True, drop=True)
    train_target.reset_index(inplace=True, drop=True)
    test_target.reset_index(inplace=True, drop=True)

    #build the tree!
    model = classifier.fit(train_data, train_target, headers)

    #prompt the user to choose whether to display the tree
    print_id3(model)

    #target_predicted is an array of predictions that is received by the predict
    target_predicted = model.predict(test_data)

    #this allows us to know which column is the target
    test_target = test_target[headers[-1]]

    #loop through the target_predicted and count up the correct predictions
    for index in range(len(target_predicted)):
        #increment counter for every match from
        #target_predicted and test_target
        if target_predicted[index] == test_target[index]:
            count += 1

    accuracy = get_accuracy(count, len(test_data))

    #report to the user
    print("Accuracy: {:.2f}%".format(accuracy))
Example #11
def etrims_tree(n_hidden=[1000], coef=[1000.], size=6):
    print_time('tree2etrims test size is %d' % size)
    print_time('load_etrims')
    train_data, train_signal, test_data, test_signal = load_etrims(size=size)

    num_function = 100
    print_time('train_DecisionTree num function is %d' % num_function)
    dt = DecisionTree(num_function=num_function)
    dt.fit(train_data, train_signal)

    print_time('test_DecisionTree')
    score = dt.score(test_data, test_signal)
    print_time('score is %f' % score)

    print_time('DecisionTree info')
    dt.info()


    elm_hidden = [(2*size+1)*(2*size+1)*2]
    print_time('train_ExtremeDecisionTree elm_hidden is %d, num function is %d' % (elm_hidden[0], num_function))
    edt = ExtremeDecisionTree(elm_hidden=elm_hidden, elm_coef=None, num_function=num_function)
    edt.fit(train_data, train_signal)

    print_time('test_ExtremeDecisionTree')
    score = edt.score(test_data, test_signal)
    print_time('score is %f' % score)

    print_time('ExtremeDecisionTree info')
    edt.info()

    print_time('tree2etrims test is finished !')
Example #12
class DecisionTreeC45TestCase(unittest.TestCase):
    """Unittest for tree.DecisionTree with the C4.5 criterion.
    """
    def setUp(self):
        self.decision_tree = DecisionTree("c4.5")
    def setUp(self):
        self.decision_tree = DecisionTree("c4.5")

    def tearDown(self):
        self.decision_tree = None

    def test_fit(self):
        # test data
        X = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        y = ["yes", "yes", "no", "no", "no"]
        # X and y is list object
        feat_names = ['no surfacing', 'flippers']
        decision_tree = {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }
        self.decision_tree.fit(X, y, feat_names)
        self.assertEqual(self.decision_tree.tree, decision_tree)

        # X and y is array
        feat_names = ['no surfacing', 'flippers']
        self.decision_tree.fit(np.asarray(X), np.asarray(y), feat_names)
        self.assertEqual(self.decision_tree.tree, decision_tree)

    def test_predict(self):
        # No need to test predict here: prediction does not depend on the
        # split criterion, so the base DecisionTree tests cover it.
        pass
Example #13
def part3():
    plt.figure()
    for trainFileName, testFileName, key in [
        ('../diabetes_train.arff', '../diabetes_test.arff', 'diabetes'),
        ('../heart_train.arff', '../heart_test.arff', 'heart')
    ]:
        attribute, trainset = data_provider(trainFileName)
        testAttribute, testset = data_provider(testFileName)
        root = TreeNode(trainset, attribute)
        curTree = DecisionTree(root)

        points = []
        for m in (2, 5, 10, 20):
            curTree.createTree(root, m)
            trueSamples = 0
            falseSamples = 0
            for instance in testset:
                if curTree.predict(root, instance) == instance[-1]:
                    trueSamples += 1
                else:
                    falseSamples += 1
            points.append(
                [m, float(trueSamples) / (trueSamples + falseSamples)])

        mapping = {'diabetes': 1, 'heart': 2}
        for x, y in points:
            ax = plt.subplot(2, 1, mapping[key])
            ax.set_xlim(0, 22)
            ax.set_ylim(0.6, 0.8)
            ax.set_ylabel('accuracy')
            ax.set_title(key)
            plt.annotate('%.3f' % y, xy=(x - 0.02, y + 0.02))
            plt.annotate('m=%d' % x, xy=(x - 0.02, y - 0.07))
            ax.plot(x, y, 'o-')

    plt.xlabel('tree number m')
    plt.savefig('../part3.pdf')
Example #14
def mnist_mlelm(n_hidden=[1000]):
    print("hidden:", n_hidden)

    # initialize
    train_set, valid_set, test_set = load_mnist()
    train_data, train_target = train_set
    valid_data, valid_target = valid_set
    test_data, test_target = test_set

    # size
    train_size = 500  # max 50000
    valid_size = 10   # max 10000
    test_size = 10    # max 10000

    train_data, train_target = train_data[:train_size], train_target[:train_size]
    valid_data, valid_target = valid_data[:valid_size], valid_target[:valid_size]
    test_data, test_target = test_data[:test_size], test_target[:test_size]

    # add valid_data/target to train_data/target if desired
    """
    train_data   = train_data   + valid_data
    train_target = train_target + valid_target
    """

    # model
    dt = DecisionTree()
    edt1 = ExtremeDecisionTree(elm_hidden=n_hidden)
    edt2 = ExtremeDecisionTree(elm_hidden=n_hidden, elm_coef=[1000., 100., 1000.])

    # fit
    dt.fit(train_data, train_target)
    edt1.fit(train_data, train_target)
    edt2.fit(train_data, train_target)

    # test
    score_dt = dt.score(test_data, test_target)
    score_edt1 = edt1.score(test_data, test_target)
    score_edt2 = edt2.score(test_data, test_target)
    print("test scores:", score_dt, score_edt1, score_edt2)

    print("dt")
    dt.info()
    print("edt1")
    edt1.info()
    print("edt2")
    edt2.info()
Example #15
class treeNet(torch.nn.Module):
    def __init__(self, x, y, depth=2):
        """
        In the constructor we instantiate nn.Linear modules and assign them as
        member variables.
        """

        H = 100
        D_out = 1
        super(treeNet, self).__init__()
        self.dTree = DecisionTree(x, y, idxs=range(len(y)), depth=depth)
        self.linear1 = torch.nn.Linear(self.dTree.D_in, H).cuda()
        self.theta = torch.nn.ModuleList([
            torch.nn.Linear(self.dTree.D_in, D_out)
            for i in range(self.dTree.nNodes)
        ]).cuda()
        self.sigmoid = torch.nn.Sigmoid().cuda()

    def forward(self, x, idxs=None):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        # if phase is 'train':
        #     self.dTree.mu = self.dTree.mu_train
        #     self.dTree.pi = self.dTree.iter_pi(self.dTree.P,self.dTree.pi,self.dTree.mu)
        # elif phase is 'val':
        #     self.dTree.mu = self.dTree.mu_val
        if idxs is None:
            idxs = range(len(x))

        # h = self.linear1(x.float()).clamp(min=0).cuda()
        # print(f'h {h}')
        y_pred = self.dTree.plant(x, self.theta, idxs=idxs)
        # _, y_pred = y_pred_onehot.max(1)#convert from one-hot encoding to vector

        return y_pred
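A hypothetical usage sketch for treeNet (assumes a CUDA device and a DecisionTree implementation exposing the D_in and nNodes attributes and the plant method used above; the shapes are illustrative):

import torch

x = torch.randn(128, 10).cuda()          # 128 samples, 10 features
y = torch.randint(0, 2, (128,)).cuda()   # binary targets

net = treeNet(x, y, depth=2)
y_pred = net(x)  # routes x through the tree; output shape depends on DecisionTree.plant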
Example #16
    def calc_misclassification_rate(training_dataframe, validation_dataframe,
                                    criterion):
        err = 0
        x = training_dataframe[categorical_columns]
        y = training_dataframe['num']
        dt = DecisionTree(criterion)
        dt.fit(x, y)
        dt.prune(
            validation_dataframe.loc[:, validation_dataframe.columns != "num"],
            validation_dataframe.loc[:, "num"])
        for i in validation_dataframe.index:
            if (dt.root.evaluate(validation_dataframe.loc[
                    i, validation_dataframe.columns != "num"]) !=
                    validation_dataframe.loc[i, "num"]):
                err += 1
        err = err / len(validation_dataframe)
        print((err, dt))
        return (err, dt)

    # Driver code, dedented to the enclosing scope: as originally indented it
    # sat after the return statement above and was unreachable. Note that
    # calc_misclassification_rate is called here without its dataframe
    # arguments, as in the source.
    gini_trees = calc_misclassification_rate(criterion="gini")
    gtree = max(gini_trees, key=lambda x: x[0])[1]
    print("best gini tree = {}".format(gtree))
    Gg = Digraph("", filename="tree_gini.pdf")
    gtree.plot(Gg)
    Gg.view()
    entropy_trees = calc_misclassification_rate(criterion="entropy")
    etree = max(entropy_trees, key=lambda x: x[0])[1]
    print("best entropy tree = {}".format(etree))
    Ge = Digraph("", filename="tree_entropy.pdf")
    etree.plot(Ge)
    Ge.view()

    fig, ax = plt.subplots(nrows=1, ncols=1)
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(categorical_features, df.num)
    tree.plot_tree(clf, ax=ax)
    plt.savefig("sklearn_entropy")
    plt.show()

    fig, ax = plt.subplots(nrows=1, ncols=1)
    clf = tree.DecisionTreeClassifier(criterion="gini")
    clf = clf.fit(categorical_features, df.num)
    tree.plot_tree(clf, ax=ax)
    plt.savefig("sklearn_gini")
    plt.show()
Example #17
class DecisionTreeTestCase(unittest.TestCase):
    """Unittest for tree.DecsionTree
    """
    def setUp(self):
        self.decision_tree = DecisionTree()

    def tearDown(self):
        self.decision_tree = None

    def test_fit(self):
        # test data
        X = [[1, 1], [1, 1], [1, 0], [0, 1], [0, 1]]
        y = ["yes", "yes", "no", "no", "no"]
        # X and y is list object
        feat_names = ['no surfacing', 'flippers']
        decision_tree = {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }
        self.decision_tree.fit(X, y, feat_names)
        self.assertEqual(self.decision_tree.tree, decision_tree)

        # X and y is array
        feat_names = ['no surfacing', 'flippers']
        self.decision_tree.fit(np.asarray(X), np.asarray(y), feat_names)
        self.assertEqual(self.decision_tree.tree, decision_tree)

    def test_predict(self):
        # test 1: training data
        item = [1, 0]
        feat_names = ['no surfacing', 'flippers']
        result = 'no'
        decision_tree = {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }
        self.decision_tree.tree = decision_tree
        self.assertEqual(result, self.decision_tree.predict(item, feat_names))

        # test 2: training data with different feat_names
        dataset = [[0, 1], [0, 0]]
        feat_names = ['flippers', 'no surfacing']
        result = ["no", "no"]
        decision_tree = {
            'no surfacing': {
                0: 'no',
                1: {
                    'flippers': {
                        0: 'no',
                        1: 'yes'
                    }
                }
            }
        }
        self.decision_tree.tree = decision_tree
        self.assertEqual(result,
                         self.decision_tree.predict(dataset, feat_names))
Example #18
import numpy as np
from sklearn.utils import shuffle

from tree import DecisionTree


#Train dataset
X = np.loadtxt('train_data')
y = np.loadtxt('train_labels')
X, y = shuffle(X, y)


#Data normalization
X -= X.min()
X /= X.max()


#Instantiation
tree = DecisionTree()


#Training
tree.train(X, y)


#Test dataset
X = np.loadtxt('test_data')
y = np.loadtxt('test_labels')
X, y = shuffle(X, y)


#Data normalization
X -= X.min()
X /= X.max()
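The snippet stops before evaluating on the test set it just loaded and normalized. A minimal sketch of the missing evaluation step, assuming the same predict API used in Example #23:

from sklearn import metrics

predicted = tree.predict(X)  # X, y now hold the normalized test set
print(metrics.classification_report(y, predicted))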
Example #19
from tree import DecisionTree

training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

decision_tree = DecisionTree()

tree = decision_tree.build_tree(training_data)

decision_tree.print_tree(tree)


def pretty_print_leaf_predictions(counts):
    total = sum(counts.values()) * 1.0
    probabilities = {}
    for label in counts.keys():
        probabilities[label] = str(int(counts[label] / total * 100)) + "%"
    return probabilities


print(pretty_print_leaf_predictions(decision_tree.classify(training_data[0], tree)))

testing_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 4, 'Apple'],
    ['Red', 2, 'Grape'],
Example #20
    ts = time.time()

    D = np.full((len(t_data)), 1 / len(t_data))
    print(D)
    ada_test_data = np.insert(t_data, 1, D, axis=1)
    ada_val_data = np.insert(v_data, 1, np.full(len(v_data), 1), axis=1)

    # print(ada_test_data)
    for j in range(0, len(l_list)):
        print("L : ", l_list[j])
        tree_list = []
        alpha_list = []
        ada_test_data = np.insert(t_data, 1, D, axis=1)
        # np.zeros((2, 1))
        for l in range(0, l_list[j]):
            tree = DecisionTree()
            create_dt_classifier(ada_test_data, d, tree)
            tree_list.append(tree)
            alpha = get_params(ada_test_data, tree, d)
            alpha_list.append(alpha)
            print(ada_test_data[:, 1])
            # print(ada_test_data)

        train_accuracy = check_accuracy_with_trees(ada_test_data, tree_list, d,
                                                   alpha_list)
        train_accuracy_list.append(train_accuracy)
        val_accuracy = check_accuracy_with_trees(ada_val_data, tree_list, d,
                                                 alpha_list)
        val_accuracy_list.append(val_accuracy)
        iterations.append(l_list[j])
        # print(train_accuracy)
Example #21
def main():

    fmemFile = File("fmemFile.csv")

    #import the data from a csv
    car_data = np.genfromtxt('car_data.csv', delimiter=',')

    #call the tree creator module and pass the name of the json file to it
    aTree = DecisionTree('jsonTrees/' + testA + '.json')
    nTree = DecisionTree('jsonTrees/' + testN + '.json')
    eTree = DecisionTree('jsonTrees/' + testE + '.json')

    #iterator = car_data[np.random.randint(car_data.shape[0], size=100), :]
    #iterator = car_data
    iterator = inputs2

    for inputs in iterator:

        #change the inputs for each of the cars in the tree

        #inputs = convertCarData(inputs)
        aTree.changeInputs(inputs)
        nTree.changeInputs(inputs)
        eTree.changeInputs(inputs)

        #get the score for that car

        aScore = aTree.run()
        nScore = nTree.run()
        eScore = eTree.run()

        print("Inputs:", inputs)
        print("ASCORE #######:", aScore)
        print("NSCORE #######:", nScore)
        print("ESCORE #######:", eScore)

        eScore = np.array(eScore)

        f1 = MemFunc('trap', aScore)
        X = np.arange(0, 1, .05)

        l1, = plt.plot(X, [f1.memFunc(i) for i in X],
                       c='r',
                       linewidth=2.0,
                       label="AlphaCuts")
        l2, = plt.plot(eScore[:, 0],
                       eScore[:, 1],
                       c='b',
                       linewidth=2.0,
                       label="Extension Principle")
        l3 = plt.axvline(nScore, c='g', linewidth=2.0, label="Crisp")

        plt.legend(handles=[l1, l2, l3])
        plt.title("Regular Title")
        plt.xlabel("Output Score")
        plt.ylabel("Membership Value")

        #Batch save: remember to remove input
        #plt.savefig("test.png")

        plt.show()
        break
Example #22
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from pprint import pprint

# load the data
data = pd.read_table('Font_dataset.txt', header=None, sep=',')

# feature matrix and labels
X = data.drop(4, axis=1)
y = data[4]

from tree import DecisionTree
clf = DecisionTree()

print("***** 10-fold cross-validation with our own decision tree *****")
test_accuracy = []
kf = KFold(n_splits=10, shuffle=True, random_state=2018)
count = 0
for train_index, test_index in kf.split(X):
    count += 1
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    # train on the training folds only (the original fit on the full
    # dataset, which leaks the test fold)
    clf.fit(X_train, y_train)
    # test
    test_pre = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_pre)
    test_accuracy.append(test_acc)
Example #23
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
from tree import DecisionTree

# load data

X = np.loadtxt('../feature/5grams_count_mc_features')
y = np.loadtxt('../data/tag_mc')
X -= X.min()
X /= X.max()
X_train, X_test, y_train, y_test = train_test_split(X, y)

tree = DecisionTree()
tree.train(X_train, y_train)
expected = y_test
predicted = tree.predict(X_test)

# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
Example #24
def printInputs():
    dTree = DecisionTree('jsonTrees/' + testName + '.json')
    dTree.printInputs()
Example #25
from tree import DecisionTree
from iris_dataset import vectors, labels

N = int(len(vectors)*0.8)
training_vectors = vectors[:N]
training_labels = labels[:N]
test_vectors = vectors[N:]
test_labels = labels[N:]

tree = DecisionTree(leaf_size=1, n_trials=1)
tree.fit(training_vectors, training_labels)
results = tree.predict(test_vectors)
tree.show()

print("results:{}".format(results))
print("answers:{}".format(test_labels))
Example #26
def main():
    # Steps to build and prune a decision tree:

    # 1. Prepare dataset.
    headings, dataset = utils.load_dataset()
    random.shuffle(dataset)
    # Split the dataset into training data, test data and pruning data if needed.
    train_data = dataset[:32000]
    test_data = dataset[32000:40000]
    # prune_data = dataset[:]


    # 2. Grow a decision tree from training data based on entropy or gini.
    dt = DecisionTree.build_tree(train_data, DecisionTree.entropy)
    # dt = DecisionTree.build_tree(train_data, DecisionTree.gini)


    # 3. Visualize the tree.
    DecisionTree.plot_tree(dt, headings, conf.org_tree_filepath)
    leaves = DecisionTree.count_leaves(dt)
    print('Leaves count before pruning: %d' % leaves)


    # 4. Run the test data through the tree.
    err = DecisionTree.evaluate(dt, test_data)
    print('Accuracy before pruning: %d/%d = %f' % \
        (len(test_data) - err, len(test_data), (len(test_data) - err) / len(test_data)))


    # 5. Prune the tree.
    #   5.1 REP: REP requires another dataset for pruning, so we need to split the dataset in a different way.

    #   5.2 PP: top-down
    DecisionTree.top_down_pessimistic_pruning(dt)

    #   5.3 PP: bottom-up.
    # DecisionTree.bottom_up_pessimistic_pruning(dt)
    
    #   5.4 MEP
    # DecisionTree.minimum_error_pruning(dt)


    # 6. Visualize the pruned tree.
    DecisionTree.plot_tree(dt, headings, conf.prn_tree_filepath)
    leaves = DecisionTree.count_leaves(dt)
    print('Leaves count after pruning: %d' % leaves)


    # 7. Check if the classification ability is improved after pruning.
    err = DecisionTree.evaluate(dt, test_data)
    print('Accuracy after pruning: %d/%d = %f' % \
        (len(test_data) - err, len(test_data), (len(test_data) - err) / len(test_data)))
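Step 5.1 notes that REP needs a separate pruning dataset. A sketch of how the split above might be rearranged for it (the split sizes and the reduced_error_pruning entry point are assumptions, not from the source):

train_data = dataset[:28000]
prune_data = dataset[28000:32000]
test_data = dataset[32000:40000]
# hypothetical REP call, mirroring the pruning API style used above:
# DecisionTree.reduced_error_pruning(dt, prune_data)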
Example #27
    def setUp(self):
        self.decision_tree = DecisionTree("c4.5")
Example #28
        sys.exit()
    trainFileName = sys.argv[1]
    testFileName = sys.argv[2]
    try:
        m = int(sys.argv[3])
    except (IndexError, ValueError):
        print("[ERROR] [m] should be an integer!", file=sys.stderr)
        sys.exit()

    attribute, trainset = data_provider(trainFileName)
    testAttribute, testset = data_provider(testFileName)
    try:
        assert (testAttribute == attribute)
    except AssertionError:
        print("[ERROR] please check the attributes of the test data.", file=sys.stderr)
        sys.exit()

    # train
    root = TreeNode(trainset, attribute)
    curTree = DecisionTree(root)
    curTree.createTree(root, m)
    curTree.printTree(root, 0)

    # test
    print('<Predictions for the Test Set Instances>')
    index = 1
    for instance in testset:
        print('{}: Actual: {} Predicted: {}'.format(
            index, instance[-1], curTree.predict(root, instance)))
        index += 1
Example #29

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from tree import DecisionTree
import pandas as pd
import numpy as np

if __name__ == '__main__':
    train_df = pd.read_csv('/app/data/train.csv')
    le_sex = LabelEncoder()
    le_sex.fit(train_df['Sex'])
    train_df.loc[:, 'SexInt'] = le_sex.transform(train_df['Sex'])

    X = np.array(train_df[['SexInt']])
    y = train_df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=71)
    tree = DecisionTree(max_depth=3)
    tree.fit(X_train, y_train)

    print(classification_report(y_train, tree.predict(X_train)))
    print(classification_report(y_test, tree.predict(X_test)))

    # tree.make_graph()

    s_tree = DecisionTreeClassifier(max_depth=3)
    s_tree.fit(X_train, y_train)
    print(classification_report(y_train, s_tree.predict(X_train)))
    print(classification_report(y_test, s_tree.predict(X_test)))
    s_tree.predict_proba(X_test)
Example #30
def main():
    X, y = read_data('crx.data.txt')
    n_samples = X.shape[0]
    n_folds = 3
    n_samples_per_fold = n_samples // n_folds

    cum_accuracy = 0.0
    cum_p = 0.0
    cum_r = 0.0
    fold = 0

    """
    clf = DecisionTree(maxdepth=3)
    clf.fit(X, y)
    clf.print_tree()
    y_pred = clf.predict(X)
    print(y.astype(np.int32))
    return
    """

    for train_idx, test_idx in kfold(n_samples, n_folds):
        print("Fold", fold)
        fold += 1

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        clf = DecisionTree(maxdepth=3)
        clf.fit(X_train, y_train)
        #clf.print_tree()

        y_pred = clf.predict(X_test)

        # TP, FP, TN and FN
        tp = sum([1 for i in range(len(y_pred))
                  if y_pred[i] == 1 and y_test[i] == 1])
        tn = sum([1 for i in range(len(y_pred))
                  if y_pred[i] == 0 and y_test[i] == 0])
        fp = sum([1 for i in range(len(y_pred))
                  if y_pred[i] == 1 and y_test[i] == 0])
        fn = sum([1 for i in range(len(y_pred))
                  if y_pred[i] == 0 and y_test[i] == 1])

        # accuracy for this fold
        acc = float(tp + tn) / (tp + tn + fp + fn)
        cum_accuracy += acc
        print("\tAccuracy:", acc)

        # precision, recall
        try:
            p = float(tp) / (tp + fp)
            r = float(tp) / (tp + fn)
            cum_p += p
            cum_r += r
            f1 = 2 * p * r / (p + r)
            print("\tPrecision:", p)
            print("\tRecall:", r)
            print("\tF1:", f1)
        except ZeroDivisionError:
            pass

    print()
    print("Average accuracy:", cum_accuracy / n_folds)
    print("Average precision:", cum_p / n_folds)
    print("Average recall:", cum_r / n_folds)