def fit(self, X, Y):
        """
        Fit an AdaBoost-style ensemble of decision trees to ``(X, Y)``.

        On every round a tree is fitted, its weighted misclassification error
        is measured against the current sample weights, the learner weight
        ``alpha`` is derived from that error, and the sample weights are
        updated and re-normalised.

        Parameters
        ----------
        X : array of shape (N, M)
            Training examples.
        Y : array-like with N entries
            Training labels. The weight update multiplies labels with
            predictions, so this presumably expects labels in {-1, +1} --
            TODO confirm against the caller.

        Attributes set
        --------------
        self.learners : object array, shape (n_iter, 1), the fitted trees.
        self.alpha : float array, shape (n_iter, 1), the learner weights.
        self.weights : float array, shape (N, 1), the normalised sample
            weights (a plain ndarray; ``np.mat`` no longer exists in
            NumPy 2.0).
        """
        N, _ = X.shape
        labels = np.asarray(Y).reshape(-1, 1)

        self.learners = np.empty((self.n_iter, 1), dtype=object)
        self.alpha = np.ones((self.n_iter, 1))
        self.weights = np.ones((N, 1)) / N  # uniform sample weights

        for i in range(self.n_iter):  # one tree is fitted per iteration
            # NOTE(review): classifier=False together with criterion="entropy"
            # looks inconsistent, and the tree is fitted without the sample
            # weights -- confirm what DecisionTree supports.
            t = DecisionTree(classifier=False,
                             max_depth=self.max_depth,
                             criterion="entropy")
            t.fit(X, Y)
            self.learners[i] = t

            # Work with column vectors throughout so nothing broadcasts to
            # an (N, N) array by accident.
            Y_pred = np.asarray(t.predict(X)).reshape(-1, 1)
            misclassified = (Y_pred != labels).astype(float)
            weighted_error = float(np.sum(self.weights * misclassified))

            # Floor the denominator so a perfect learner does not divide by
            # zero (the original passed np.inf as the `axis` of np.max).
            self.alpha[i] = 0.5 * np.log(
                (1.0 - weighted_error) / max(weighted_error, 1e-16))

            expon = -self.alpha[i, 0] * labels * Y_pred
            self.weights = self.weights * np.exp(expon)
            self.weights = self.weights / self.weights.sum()
示例#2
0
文件: test_dt.py 项目: Prev/ITE4005
def test_test():
	"""A sample with age '<=30', low income, non-student, fair credit must be classified 'no'."""
	# Close the training file deterministically instead of leaking the handle.
	# The query stays inside the `with` so the file is still open in case the
	# tree reads from it lazily.
	with open('data/dt_train.txt') as train_file:
		dt = DecisionTree(train_file)
		assert dt.test({
			'age': '<=30',
			'income': 'low',
			'student': 'no',
			'credit_rating': 'fair',
		}) == 'no'
示例#3
0
文件: run.py 项目: brapse/ml701-dt
def train():
    """
    Train a decision tree on the file named by ``options.input`` and pickle
    the grown tree to ``options.filename``.
    """
    # print() with a single argument behaves the same on Python 2 and 3.
    print("Training: %s" % options.filename)

    # header/skiprows are handed straight to pandas; the file layout they
    # target is not visible from here.
    training = pandas.read_csv(options.input, sep=' ', header=1, skiprows=[0, 2])
    tree = DecisionTree(training, 'poisonouse')
    tree.grow(0.01)

    # Context manager so the model file is flushed and closed even when
    # pickling fails.
    with open(options.filename, 'wb') as model_file:
        pickle.dump(tree, model_file)

    print("DONE")
示例#4
0
 def fit(self, x):
     """
     Train ``self.num_trees`` decision trees, each on its own subsample of ``x``.

     The list of fitted trees is stored on ``self.trees`` and also returned.
     """
     forest = []
     for _round in range(self.num_trees):
         learner = DecisionTree(
             max_depth=self.max_depth,
             min_size=self.min_size,
             features_ratio=self.features_ratio,
         )
         # Every tree sees a fresh subsample drawn at the configured ratio.
         learner.fit(self.subsample(x, self.sampling_ratio))
         forest.append(learner)
     self.trees = forest
     return forest
示例#5
0
文件: run.py 项目: brapse/ml701-dt
def train():
    """
    Train a decision tree on the file named by ``options.input`` and pickle
    the grown tree to ``options.filename``.
    """
    # print() with a single argument behaves the same on Python 2 and 3.
    print("Training: %s" % options.filename)

    # header/skiprows are handed straight to pandas; the file layout they
    # target is not visible from here.
    training = pandas.read_csv(options.input,
                               sep=' ',
                               header=1,
                               skiprows=[0, 2])
    tree = DecisionTree(training, 'poisonouse')
    tree.grow(0.01)

    # Context manager so the model file is flushed and closed even when
    # pickling fails.
    with open(options.filename, 'wb') as model_file:
        pickle.dump(tree, model_file)

    print("DONE")
示例#6
0
    def fit(self, X, Y):
        """
        Fit the gradient-boosted ensemble to ``(X, Y)``.

        A base estimator supplied by the loss object is fitted first. Every
        later round fits a regression tree to the negative gradient of the
        loss with respect to the running prediction (the pseudo-residuals)
        and adds that tree's weighted output to the prediction.

        Parameters
        ----------
        X : array of shape (N, M)
            Training examples.
        Y : array
            Targets. Passed through ``to_one_hot`` when ``self.classifier``
            is truthy; otherwise a 1-D ``Y`` is reshaped into a column.

        Raises
        ------
        ValueError
            If ``self.loss`` is neither ``"mse"`` nor ``"crossentropy"``.
        """
        if self.loss == "mse":
            loss = MSELoss()
        elif self.loss == "crossentropy":
            loss = CrossEntropyLoss()
        else:
            # Fail immediately with a clear message instead of hitting an
            # unbound `loss` several statements later.
            raise ValueError(
                "Unsupported loss {!r}; expected 'mse' or 'crossentropy'".format(
                    self.loss))

        # convert Y to one_hot if not already
        if self.classifier:
            Y = to_one_hot(Y.flatten())
        else:
            Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y

        N, _ = X.shape
        self.out_dims = Y.shape[1]
        self.learners = np.empty((self.n_iter, self.out_dims), dtype=object)
        # The first learner keeps weight 1; every later learner starts at the
        # learning rate.
        self.weights = np.ones((self.n_iter, self.out_dims))
        self.weights[1:, :] *= self.learning_rate

        # fit the base estimator, one per output dimension
        Y_pred = np.zeros((N, self.out_dims))
        for k in range(self.out_dims):
            t = loss.base_estimator()
            t.fit(X, Y[:, k])
            Y_pred[:, k] += t.predict(X)
            self.learners[0, k] = t

        # incrementally fit each learner on the negative gradient of the loss
        # wrt the previous fit (pseudo-residuals)
        for i in range(1, self.n_iter):
            for k in range(self.out_dims):
                y, y_pred = Y[:, k], Y_pred[:, k]
                neg_grad = -1 * loss.grad(y, y_pred)

                # use MSE as the surrogate loss when fitting to negative gradients
                t = DecisionTree(classifier=False,
                                 max_depth=self.max_depth,
                                 criterion="mse")

                # fit current learner to negative gradients
                t.fit(X, neg_grad)
                self.learners[i, k] = t

                # compute step size and weight for the current learner
                step = 1.0
                h_pred = t.predict(X)
                if self.step_size == "adaptive":
                    step = loss.line_search(y, y_pred, h_pred)

                # update weights and our overall prediction for Y
                self.weights[i, k] *= step
                Y_pred[:, k] += self.weights[i, k] * h_pred
示例#7
0
文件: test_dt.py 项目: Prev/ITE4005
def test_testfile():
	"""Run the tree over the test file and check the last tab-separated field of every output line."""
	from io import StringIO

	output = StringIO()
	# Both input files are closed deterministically; the original leaked
	# both handles. They stay open for as long as the tree may read them.
	with open('data/dt_train.txt') as train_file:
		dt = DecisionTree(train_file)
		with open('data/dt_test.txt') as test_file:
			dt.testfile(test_file, output)

	contents = output.getvalue()

	for line in contents.split("\n"):
		if not line:
			continue

		# The last column is either the header name or a predicted class.
		last_elm = line.split("\t")[-1]
		assert last_elm in ('Class:buys_computer', 'yes', 'no')
示例#8
0
文件: rf.py 项目: yaohusama/ai
 def fit(self, X, Y):
     """
     Grow the forest: draw one bootstrapped sample of the training data per
     tree (``n_trees`` in total) and fit an independent decision tree on it.
     """
     forest = self.trees = []
     for _round in range(self.n_trees):
         X_boot, Y_boot = bootstrap_sample(X, Y)
         learner = DecisionTree(n_feats=self.n_feats,
                                max_depth=self.max_depth,
                                criterion=self.criterion,
                                classifier=self.classifier)
         learner.fit(X_boot, Y_boot)
         forest.append(learner)
示例#9
0
def accuracy_test(rows):
    """Estimate accuracy with leave-one-out cross-validation.

    The first element of ``rows`` is treated as the header and dropped. Each
    remaining row is held out in turn, a tree is built from all other rows,
    and the held-out row is classified with it.

    Scoring per held-out row:
      * a single predicted label scores 1 if it matches the actual label,
        otherwise 0;
      * otherwise the score is the share of the actual label among the
        'yes' and 'no' entries of the result.

    :param rows: list of rows; the last element of each row is its label.
    :return: mean score over all held-out rows, or 0.0 when there are no
        data rows.
    """
    # Drop the header row; slicing also keeps the caller's list untouched.
    data = rows[1:]

    total_elements = len(data)
    if total_elements == 0:
        return 0.0

    total_accuracy = 0.0
    for i, singled_out in enumerate(data):
        # Build the training set without mutating the list being iterated.
        # The original removed and re-appended rows in place, which skipped
        # rows and permanently lost the ones scored via `continue`.
        remaining_rows = data[:i] + data[i + 1:]
        actual_label = singled_out[-1]

        dt = DecisionTree(remaining_rows)
        node = dt.build_tree()
        result = dt.classify(singled_out, node)

        # either 'yes' or 'no' prediction
        if len(result) == 1:
            if actual_label in result:
                total_accuracy += 1
            continue

        # both 'yes' and 'no' prediction
        prob_correct = result[actual_label]
        incorrect_label = 'no' if actual_label == 'yes' else 'yes'
        prob_incorrect = result[incorrect_label]
        total_accuracy += prob_correct / (prob_correct + prob_incorrect)

    return total_accuracy / total_elements
示例#10
0
def test_regressor():
    """Print the test-set MSE of sklearn's tree regressor and of the local DecisionTree on the Boston data."""
    boston = load_boston()
    X, Xtest, Y, Ytest = train_test_split(
        boston.data, boston.target, test_size=0.2, random_state=0)

    # Reference implementation: scikit-learn.
    reference = DecisionTreeRegressor(random_state=0)
    reference.fit(X, Y)
    reference_pred = reference.predict(Xtest)
    print(mean_squared_error(Ytest, reference_pred))  # 32.41637254901961

    # Local implementation, imported lazily exactly as before.
    from dt import DecisionTree
    candidate = DecisionTree(criterion="mse", classifier=False)
    candidate.fit(X, Y)
    candidate_pred = candidate.predict(Xtest)
    print(mean_squared_error(Ytest, candidate_pred))  # 32.74450980392157
示例#11
0
文件: test_dt.py 项目: Prev/ITE4005
def test_tree():
	"""Check the structure of the tree built from the training file: root attribute, sub-splits and leaves."""
	# Close the training file deterministically instead of leaking the handle.
	# The assertions stay inside the `with` in case the tree reads lazily.
	with open('data/dt_train.txt') as train_file:
		dt = DecisionTree(train_file)
		assert dt.tree[0] == 'age'

		assert dt.tree[1]['<=30'][0] == 'student'
		assert dt.tree[1]['<=30'][1]['yes'] == 'yes'
		assert dt.tree[1]['<=30'][1]['no'] == 'no'

		assert dt.tree[1]['31...40'] == 'yes'

		assert dt.tree[1]['>40'][0] == 'credit_rating'
		assert dt.tree[1]['>40'][1]['excellent'] == 'no'
		assert dt.tree[1]['>40'][1]['fair'] == 'yes'
示例#12
0
def models():
    """Fit the scratch decision tree and sklearn's on two iris features and print both accuracies."""
    iris = load_iris()
    feature = iris.data[:, :2]  # only the first two feature columns
    label = iris.target
    X_train, X_test, y_train, y_test = train_test_split(feature,
                                                        label,
                                                        random_state=42)

    # --- Created Decision Tree Model ---
    scratch_tree = DecisionTree(max_depth=2, min_splits=10)
    scratch_tree.fit(X_train, y_train)
    scratch_pred = scratch_tree.pred(X_test)

    # --- Sklearn Decision Tree Model ---
    sklearn_tree = DecisionTreeClassifier(max_depth=2, min_samples_split=10)
    sklearn_tree.fit(X_train, y_train)
    sklearn_pred = sklearn_tree.predict(X_test)

    # --- Results ---
    scratch_accuracy = acc_score(scratch_pred, y_test)
    print("Scratch Model Accuracy : {0}".format(scratch_accuracy))
    sklearn_accuracy = acc_score(sklearn_pred, y_test)
    print("SK-Learn Model Accuracy : {0}".format(sklearn_accuracy))
    # Side-by-side rows: (scratch prediction, sklearn prediction, true label).
    print(list(zip(scratch_pred, sklearn_pred, y_test)))
示例#13
0
    args = parser.parse_args()
    # load the train and test data
    xTrain = pd.read_csv(args.xTrain)
    yTrain = pd.read_csv(args.yTrain)
    xTest = pd.read_csv(args.xTest)
    yTest = pd.read_csv(args.yTest)
    # Sweep two hyper-parameters of the gini decision tree independently.

    # Candidate values 1..9; the result arrays hold one row per candidate
    # with column 0 = train score and column 1 = test score.
    maxDepth = list(np.arange(1, 10))
    minLeaf = list(np.arange(1, 10))
    yDepth = np.ones((9, 2))
    yLeaf = np.ones((9, 2))

    # Vary the second constructor argument, keep the third fixed at 5.
    # presumably the arguments are (criterion, max depth, min leaf samples)
    # -- verify against the DecisionTree constructor.
    for m in maxDepth:
        dt = DecisionTree('gini', m, 5)
        trainauc, testauc = dt_train_test(dt, xTrain, yTrain, xTest, yTest)
        yDepth[m - 1, 0] = trainauc
        yDepth[m - 1, 1] = testauc

    # Vary the third constructor argument, keep the second fixed at 5.
    for l in minLeaf:
        dt = DecisionTree('gini', 5, l)
        trainauc, testauc = dt_train_test(dt, xTrain, yTrain, xTest, yTest)
        yLeaf[l - 1, 0] = trainauc
        yLeaf[l - 1, 1] = testauc

    # Left panel: train vs. test score against the depth candidates.
    # NOTE(review): the values are named *auc but the axis says 'Accuracy'
    # -- confirm which metric dt_train_test returns.
    plt.subplot(1, 2, 1)
    plt.plot(maxDepth, yDepth[:, 0], color='g', label='Train')
    plt.plot(maxDepth, yDepth[:, 1], color='r', label='Test')
    plt.xlabel('Tree Depth')
    plt.ylabel('Accuracy')
示例#14
0
def test_DecisionTree():
    """
    Compare the local DecisionTree against scikit-learn on random problems.

    Each trial draws a random problem size, depth and task type (classifier
    or regressor), fits both implementations on the same split and prints
    whether their training and test losses agree. Mismatches are printed,
    not raised.

    The loop has no exit condition: it runs until it is interrupted or an
    uncaught exception occurs.
    """
    i = 1
    # Fixed seed so the sequence of random trials is reproducible; the order
    # of the np.random calls below therefore matters.
    np.random.seed(12345)
    while True:
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)

        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(
                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i
            )
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize model
            # classification loss is the error rate (1 - accuracy)
            def loss(yp, y):
                return 1 - accuracy_score(yp, y)

            criterion = np.random.choice(["entropy", "gini"])
            mine = DecisionTree(
                classifier=classifier, max_depth=max_depth, criterion=criterion
            )
            gold = DecisionTreeClassifier(
                criterion=criterion,
                max_depth=max_depth,
                splitter="best",
                random_state=i,
            )
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = DecisionTree(
                criterion=criterion, max_depth=max_depth, classifier=classifier
            )
            gold = DecisionTreeRegressor(
                criterion=criterion, max_depth=max_depth, splitter="best"
            )

        print("Trial {}".format(i))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # fit both models on the same training split
        mine.fit(X, Y)
        gold.fit(X, Y)

        # get preds on training set
        y_pred_mine = mine.predict(X)
        y_pred_gold = gold.predict(X)

        loss_mine = loss(y_pred_mine, Y)
        loss_gold = loss(y_pred_gold, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_gold_test = gold.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_gold_test = loss(y_pred_gold_test, Y_test)

        # A mismatch is reported but does not stop the trial loop.
        try:
            np.testing.assert_almost_equal(loss_mine, loss_gold)
            print("\tLoss on training: {}".format(loss_mine))
        except AssertionError as e:
            print("\tTraining losses not equal:\n{}".format(e))

        try:
            np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
            print("\tLoss on test: {}".format(loss_mine_test))
        except AssertionError as e:
            print("\tTest losses not equal:\n{}".format(e))
        # i doubles as the trial counter and the random_state of the next trial
        i += 1
示例#15
0
    # Unpack the pickled dict into the four dataset splits.
    # NOTE(review): pickle.load can execute arbitrary code -- confirm that
    # the file behind `f` comes from a trusted source.
    save = pickle.load(f)
    X_train = save['X_train']
    y_train = save['y_train']
    X_test = save['X_test']
    y_test = save['y_test']
    del save  # drop the container; the splits are bound to their own names

    print('X_train ', X_train.shape)
    print('y_train', y_train.shape)
    print('X_test ', X_test.shape)

# Append the labels as the last column of the feature matrix.
y_train = np.reshape(y_train, (y_train.shape[0], 1))
dataset = np.concatenate((X_train, y_train), axis=1)

# Instance of Decision Tree Object
a = DecisionTree(headers, 5)
X_tr = dataset[0:6400]
y_tr = y_train[0:6400]

X_val = X_train[6400:8000]
y_val = y_train[6400:8000]
y_val = np.reshape(y_val, (y_val.shape[0], ))

# NOTE(review): training uses the full `dataset`, including the rows that
# are sliced off as validation data above -- confirm this is intended.
t = a.train(dataset)

# Saving Trained Model. The serialized stream is bytes, so the file must be
# opened in binary mode; the context managers also close both handles.
with open("vamshi.model", "wb") as model_file:
    dill.dump(a, model_file)

with open("vamshi.model", "rb") as model_file:
    v = dill.load(model_file)

# NOTE(review): predictions come from the in-memory model `a`, not from the
# reloaded model `v` -- confirm which one is meant to be evaluated.
y_pred = a.predict(None, X_val)
示例#16
0
    def create_decision_tree(
        local_attributes: np.ndarray,
        local_data: np.ndarray,
        local_output: np.ndarray,
    ) -> DecisionTree:
        """
        Recursively build the subtree for the given attributes, data and outputs.

        Each call first collects, for every remaining attribute, the set of
        values that occur in ``local_data``. ``get_best_attribute_index`` then
        picks the attribute to split on and returns one value per partition
        (described by the original author as the partition entropies, the
        chosen attribute being the one with the lowest average conditional
        entropy).

        A node is created for the chosen attribute, and for each of its
        partitions:

        * if the partition value is 0, all rows of the partition share the
          same output, so a leaf holding that output is attached;
        * otherwise the chosen attribute is removed and the remaining data
          and outputs are filtered down to the rows of this partition;

          * if exactly one attribute is left afterwards, a leaf is attached
            whose value is the string form of the output counts of the
            partition;
          * otherwise the child is built recursively and linked to the
            current node.

        Node ids are taken from the enclosing scope's ``node_index`` counter,
        which is advanced for every node created.

        :param local_attributes: array of column names
        :param local_data: array of data for each attribute (one row per attribute)
        :param local_output: array of output for each row
        :return: a Node in tree along with its children
        """
        nonlocal output_set
        nonlocal node_index
        # Distinct values seen for each attribute in this call.
        possible_values_for_attribute = np.array(
            [np.array(list(set(x))) for x in local_data], dtype=object)

        current_best_attribute_index, node_values = get_best_attribute_index(
            local_attributes, local_data, local_output, output_set,
            possible_values_for_attribute)
        current_node = DecisionTree(
            value=local_attributes[current_best_attribute_index],
            node_id=node_index)
        node_index += 1

        for i, v in enumerate(node_values):
            # The attribute value that defines this partition / outgoing edge.
            specific_condition = possible_values_for_attribute[
                current_best_attribute_index][i]
            if node_values[i] == 0:
                # Pure partition: every matching row has the same output, so
                # the output of the last matching row is used for the leaf.
                for value in range(
                        len(local_data[current_best_attribute_index])):
                    if local_data[current_best_attribute_index][
                            value] == specific_condition:
                        node_output = local_output[value]

                child = DecisionTree(node_id=node_index,
                                     value=node_output,
                                     is_leaf=True,
                                     parent_edge=specific_condition,
                                     parent=current_node.node_id,
                                     parent_value=current_node.value)
                node_index += 1
            else:
                # Boolean mask selecting the rows that belong to this partition.
                filter_array = []
                for value in local_data[current_best_attribute_index]:
                    if value == specific_condition:
                        filter_array.append(True)
                    else:
                        filter_array.append(False)
                # Drop the attribute that was just used for the split.
                future_attributes = np.copy(local_attributes)
                future_attributes = np.delete(future_attributes,
                                              obj=current_best_attribute_index,
                                              axis=0)
                future_data = np.copy(local_data)
                future_data = np.delete(future_data,
                                        obj=current_best_attribute_index,
                                        axis=0)
                # Restrict every remaining attribute row to this partition.
                aux = []
                for future_row in range(len(future_data)):
                    aux.append(future_data[future_row][filter_array])
                future_data = np.array(aux)
                future_output = np.copy(local_output)[filter_array]
                if len(future_attributes) == 1:
                    # No further split is attempted: store the output counts
                    # of the partition as the leaf value.
                    unique, counts = np.unique(future_output,
                                               return_counts=True)
                    value = str(dict(zip(unique, counts)))
                    child = DecisionTree(node_id=node_index,
                                         is_leaf=True,
                                         value=value,
                                         parent_edge=specific_condition,
                                         parent=current_node.node_id,
                                         parent_value=current_node.value)
                    node_index += 1
                else:
                    child = create_decision_tree(future_attributes,
                                                 future_data, future_output)
                    # Link the recursively built subtree to this node.
                    child.parent_edge = specific_condition
                    child.parent = current_node.node_id
                    child.parent_value = current_node.value
            current_node.add_children(child)
        return current_node
示例#17
0
import pandas as pd
from importlib import import_module
from sklearn.model_selection import train_test_split
from dt import DecisionTree


# Experiment grid: 5 trials x datasets x prune methods x loss functions.
# One accuracy value is recorded per combination and written to a CSV file.
dataset_names = ['iris', 'german', 'page_blocks', 'seeds', 'wine']
results = []
columns = ['trial', 'name', 'prune_method', 'loss', 'acc']


# The trial number doubles as the random_state of the train/test split, so
# every trial uses a different but reproducible split.
for i in range(1, 6):
    for name in dataset_names:
        # Each dataset lives in its own milksets submodule.
        dataset = import_module('milksets.' + name)
        X_train, X_test, y_train, y_test = train_test_split(*dataset.load(), test_size=0.2, random_state=i)

        for prune_method in ['Reduce', 'Pessim', 'Comp']:
            for loss in ['entropy', 'gini', '0-1']:
                tr = DecisionTree(X_train, y_train, prune_method, loss, name=name)
                tr.postprune()  # Use alpha = 0 by default, since it's the best choice by experiment.
                acc = tr.score(X_test, y_test)
                results.append([i, name, prune_method, loss, acc])


# Persist all results; the column order matches the rows appended above.
df = pd.DataFrame(results, columns=columns)
df.to_csv('prune_loss_test.csv', index=False)
def accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Both arguments may be NumPy arrays or plain sequences. They are converted
    to arrays first so the comparison is always element-wise (comparing two
    Python lists with ``==`` would yield a single bool).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    return np.sum(y_true == y_pred) / len(y_true)

# Load the training table.
df = pd.read_csv('Training.csv')



# Features are the first 132 columns; the label is the last column and is
# encoded to integer class ids.
X = df.iloc[:, 0:132].values
y = df.iloc[:, -1].values
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

clf = DecisionTree(max_depth=10)
clf.fit(X_train, y_train)
    
# Evaluate on the held-out rows and report the accuracy as a percentage.
y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)

print ("Accuracy:", acc*100, "%")


# from matplotlib import pyplot as plt
def saveModel():
    """Pickle the module-level classifier ``clf`` into the file ``DTModel``."""
    with open("DTModel", "wb") as model_file:
        pickle.dump(clf, model_file)
saveModel()

# header = ['itching','skin_rash','nodal_skin_eruptions','continuous_sneezing','shivering','chills','joint_pain',
示例#19
0
def main():
    """
    Dispatch on ``args.modelIdx``.

    'A1'..'A4' run one of the analysis sweeps and return; each sweep varies
    a single setting (training-set proportion, number of words, max depth,
    number of estimators) and keeps the others at one value. '1'..'5' build
    a single model, which is then trained and tested from the CSV files
    named on the command line. Any other value does nothing.
    """
    selected = args.modelIdx
    all_models = ['DT', 'BDT', 'BODT', 'RF', 'SVM']

    # --- analysis sweeps -------------------------------------------------
    if selected == 'A1':
        analysis_1(all_models, [0.025, 0.05, 0.125, 0.25], [1000], [10], [50],
                   debug=False)
        return
    if selected == 'A2':
        analysis_2(all_models, [0.25], [200, 500, 1000, 1500], [10], [50],
                   debug=False)
        return
    if selected == 'A3':
        analysis_3(all_models, [0.25], [1000], [5, 10, 15, 20], [50],
                   debug=False)
        return
    if selected == 'A4':
        analysis_4(all_models, [0.25], [1000], [10], [10, 25, 50, 100],
                   debug=False)
        return

    # --- single-model runs -----------------------------------------------
    if selected == '1':
        model = DecisionTree()
    elif selected == '2':
        model = BaggedDecisionTrees(n_estimators=50)
    elif selected == '3':
        model = RandomForest(n_estimators=50)
    elif selected == '4':
        model = BoostedDecisionTrees(n_estimators=50)
    elif selected == '5':
        model = SupportVectorMachine()
    else:
        # Unknown index: nothing to do.
        return

    model.train_from_csv(args.trainingDataFilename)
    model.test_from_csv(args.testDataFilename)
示例#20
0
文件: test_dt.py 项目: Prev/ITE4005
def test_majority_voting():
	"""Check the leaf labels of the tree built from the single-attribute file."""
	# Close the input file deterministically instead of leaking the handle.
	# The assertions stay inside the `with` in case the tree reads lazily.
	with open('tests/one_attr.txt') as train_file:
		dt = DecisionTree(train_file)

		assert dt.tree[1]['<=30'] == 'no'
		assert dt.tree[1]['31...40'] == 'yes'
		assert dt.tree[1]['>40'] == 'yes'
示例#21
0
文件: test_dt.py 项目: Prev/ITE4005
def test_info():
	"""``DecisionTree.info(9, 5)`` must be within 0.001 of 0.94."""
	# abs() makes the tolerance two-sided; without it any result below 0.941
	# (including wildly wrong or negative ones) would pass.
	assert abs(DecisionTree.info(9, 5) - 0.94) <= 0.001
示例#22
0
def dt_plot():
    """
    Draw a 2x2 grid of plots, each showing a DecisionTree fitted to a freshly
    generated random problem, and display the figure.

    With the current settings only the classification branch can run,
    because ``classifier`` is drawn from the single-element list ``[True]``.
    """
    fig, axes = plt.subplots(2, 2)
    # fig.set_size_inches(10,10)
    for ax in axes.flatten():
        n_ex = 100
        # NOTE: n_trees and max_depth_r are not used in this function; the
        # random draws are kept because they advance the RNG state.
        n_trees = 50  # belong to rf
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)  # belong to rf

        classifier = np.random.choice([True])  #, False
        # generate samples based on the problem type (classification --> labels,
        # regression --> continuous targets)
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            # for classification the "loss" is the accuracy score
            def loss(yp, y):
                return accuracy_score(yp, y)

            # initialize model
            criterion = np.random.choice(["entropy", "gini"])
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)

        # fit the model
        mine_d.fit(X, Y)
        # get preds on test set
        y_pred_mine_test_d = mine_d.predict(X_test)

        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)

        if classifier:
            # Scatter the test points grouped by predicted class.
            entries = [("DT", loss_mine_test_d, y_pred_mine_test_d)]
            # randint(1) always yields 0, so the single entry is selected.
            (lbl, test_loss, preds) = entries[np.random.randint(1)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                    # s=0.5,
                )
        else:
            # Predict on an evenly spaced grid slightly wider than the test
            # range to draw the fitted curve.
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1,
                np.max(X_test.flatten()) + 1, 100).reshape(-1, 1)
            y_pred_mine_test_d = mine_d.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)
            #  s=0.5)
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                #  linewidth=0.5,
                # the format string has no placeholder, so the label is "DT"
                label="DT".format(max_depth_d),
                color="yellowgreen",
            )
            ax.set_title("DT: {:.1f} ".format(loss_mine_test_d))
            ax.legend()
        # Hide the tick labels on both axes.
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    # plt.savefig("plot.png", dpi=300)
    plt.show()
示例#23
0
def ensemble_diff_plot():
    """
    Draw a 3x3 grid comparing a random forest, a single decision tree and a
    gradient-boosted tree ensemble on freshly generated random problems.

    Each panel is randomly a classification or a regression problem. For
    classification, the predictions of one randomly chosen model are shown
    as a scatter plot; for regression, the fitted curves of all three models
    are drawn over the test points. The figure is shown and then closed.
    """
    fig, axes = plt.subplots(3, 3)
    fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        n_ex = 100
        n_trees = 50
        n_feats = np.random.randint(2, 100)
        # separate depth draws: max_depth_d for the single tree and the
        # boosted ensemble, max_depth_r for the random forest
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)

        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            # for classification the "loss" is the accuracy score
            def loss(yp, y):
                return accuracy_score(yp, y)

            # initialize model
            criterion = np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth_r,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="crossentropy",
                step_size="constant",
                split_criterion=criterion,
            )

        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth_r,
                classifier=classifier,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                # n_trees=n_trees,
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="mse",
                step_size="adaptive",
                split_criterion=criterion,
            )

        # fit all three models on the same training split
        mine.fit(X, Y)
        mine_d.fit(X, Y)
        mine_g.fit(X, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_mine_test_d = mine_d.predict(X_test)
        y_pred_mine_test_g = mine_g.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)
        loss_mine_test_g = loss(y_pred_mine_test_g, Y_test)

        if classifier:
            # Show only one of the three models per panel, picked at random.
            entries = [("RF", loss_mine_test, y_pred_mine_test),
                       ("DT", loss_mine_test_d, y_pred_mine_test_d),
                       ("GB", loss_mine_test_g, y_pred_mine_test_g)]
            (lbl, test_loss, preds) = entries[np.random.randint(3)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            # Scatter the test points grouped by predicted class.
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                    #  s=0.5,
                )
        else:
            # Predict on an evenly spaced grid slightly wider than the test
            # range to draw the fitted curves.
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1,
                np.max(X_test.flatten()) + 1, 100).reshape(-1, 1)
            y_pred_mine_test = mine.predict(X_ax)
            y_pred_mine_test_d = mine_d.predict(X_ax)
            y_pred_mine_test_g = mine_g.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)
            #  s=0.5)
            # The label strings below contain no placeholders, so the
            # .format() arguments do not affect the resulting labels.
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_g.flatten(),
                #  linewidth=0.5,
                label="GB".format(n_trees, n_feats, max_depth_d),
                color="red",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test.flatten(),
                #  linewidth=0.5,
                label="RF".format(n_trees, n_feats, max_depth_r),
                color="cornflowerblue",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                #  linewidth=0.5,
                label="DT".format(max_depth_d),
                color="yellowgreen",
            )
            ax.set_title("GB: {:.1f} / RF: {:.1f} / DT: {:.1f} ".format(
                loss_mine_test_g, loss_mine_test, loss_mine_test_d))
            ax.legend()
        # Hide the tick labels on both axes.
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    # plt.savefig("plot.png", dpi=300)
    plt.show()
    plt.close("all")
示例#24
0
    # load the train and test data
    xTrain = pd.read_csv(args.xTrain)
    yTrain = pd.read_csv(args.yTrain)
    xTest = pd.read_csv(args.xTest)
    yTest = pd.read_csv(args.yTest)
    # Evaluate the gini decision tree on a 9x9 grid of two hyper-parameters.

    
    maxDepth = np.arange(1,10)
    minLeaf = np.arange(1,10)
    # Result grids, indexed [m - 1, l - 1] by the loops below.
    trainauc = np.ones((9,9))
    testauc = np.ones((9,9))
    X, Y = np.meshgrid(maxDepth, minLeaf)
        
    for m in maxDepth:
        for l in minLeaf:
            dt = DecisionTree('gini', m, l)
            trainauc[m-1,l-1], testauc[m-1, l-1] = dt_train_test(dt, xTrain, yTrain, xTest, yTest)
    # NOTE(review): with meshgrid's default indexing X varies along columns
    # while the result grids use m as the row index. The axis labels below
    # only line up with the data because both value ranges are the same
    # 1..9 -- re-check if the ranges ever differ.
    fig = plt.figure(figsize=(9,5))
    ax = plt.axes(projection="3d")
    ax.plot_wireframe(X, Y, trainauc, color='g',label='Train')
    ax.plot_wireframe(X, Y, testauc, color='r',label='Test')
    ax.set_title('3D plot of accuracy using Gini')
    ax.set_xlabel('Min Leaf Samples')
    ax.set_ylabel('Tree Depth')
    ax.set_zlabel('Accuracy')
    ax.legend()
    # Save before show() so the written figure is not empty.
    plt.savefig('q1c.eps', format='eps', dpi=1000)
    plt.show()
    
    
示例#25
0
def cross_validate(csv_file_name,
                   losses_file_name,
                   models,
                   tssp,
                   num_words,
                   max_depth,
                   n_estimators,
                   debug=False):
    """Perform 10-fold incremental cross validation.

    Every combination of (tssp, num_words, max_depth, n_estimators) is one
    "setup".  For each setup and each of the 10 folds, the remaining nine
    folds are pooled, randomly subsampled, turned into word features and
    used to train every requested model, which is then scored on the
    held-out fold.

    Args:
        csv_file_name: Forwarded to generate_train_and_test_files_cv, which
            is expected to produce the fold files 'cv0.dat' .. 'cv9.dat'.
        losses_file_name: Target handed to save() for the loss array; the
            debug baseline losses go to 'debug_' + losses_file_name.
        models: Container of model tags.  Recognised tags are 'DT', 'BDT',
            'BODT', 'RF' and 'SVM'; they fill rows 0..4 of the loss array
            respectively.  Rows of models not requested stay zero.
        tssp: Iterable of training-set-size proportions.  Each one is
            multiplied by a nominal size of 2000 to get the number of
            randomly selected training samples.
        num_words: Iterable of word-feature counts.
        max_depth: Iterable of tree depths.
        n_estimators: Iterable of ensemble sizes.
        debug: When true, print DT/BDT timings and additionally fit the
            skDecisionTree / skRandomForest baselines.

    Returns:
        None.  Results are written through save(): `losses` has shape
        (5, #setups, 10) and the debug `sklosses` has shape (2, #setups, 10).
    """
    n_folds = 10
    total_num = 2000  # nominal sample count that the tssp proportions scale
    setups = [(p, w, d, t) for p in tssp for w in num_words for d in max_depth
              for t in n_estimators]
    losses = zeros((5, len(setups), n_folds))  # #models, #cases, #folds
    sklosses = zeros((2, len(setups), n_folds))

    # Generate the temporary CV fold files, then load each of them back.
    generate_train_and_test_files_cv(csv_file_name, n_folds)
    lists_of_dict = [
        csv_to_dict('cv%d.dat' % fold) for fold in range(n_folds)
    ]

    for i, (prop, nwords, maxdep, ntrees) in enumerate(setups):
        for j in range(n_folds):
            # Construct the train set from every fold except fold j.
            training_lists_of_dict = lists_of_dict[:j] + lists_of_dict[j + 1:]
            training_list_of_dict = [
                item for sublist in training_lists_of_dict for item in sublist
            ]
            testing_list_of_dict = lists_of_dict[j]

            # Randomly select the requested share of training samples.
            random_indices = permutation(len(training_list_of_dict))
            random_indices = random_indices[:int(total_num * prop)]
            training_list_of_dict = [
                training_list_of_dict[k] for k in random_indices
            ]

            # The vocabulary is derived from the training subset only and
            # then applied to both the training and the held-out fold.
            feature_words = construct_word_feature(training_list_of_dict,
                                                   nwords)
            training_X, training_y = extract_word_feature_and_label(
                training_list_of_dict, feature_words)
            testing_X, testing_y = extract_word_feature_and_label(
                testing_list_of_dict, feature_words)

            # DT
            if 'DT' in models:
                dt = DecisionTree(max_depth=maxdep)
                t1 = time.time()
                dt.train(training_X, training_y)
                t2 = time.time()
                losses[0, i, j] = dt.test(testing_X, testing_y)
                if debug:
                    # Single parenthesised argument: prints the same text
                    # under both the Python 2 statement and Python 3 function.
                    print("DT training: %fs, testing: %f" %
                          (t2 - t1, time.time() - t2))
            # BDT
            if 'BDT' in models:
                bdt = BaggedDecisionTrees(max_depth=maxdep,
                                          n_estimators=ntrees)
                t1 = time.time()
                bdt.train(training_X, training_y)
                t2 = time.time()
                losses[1, i, j] = bdt.test(testing_X, testing_y)
                if debug:
                    print("BDT training: %fs, testing: %f" %
                          (t2 - t1, time.time() - t2))
            # BODT
            if 'BODT' in models:
                bodt = BoostedDecisionTrees(max_depth=maxdep,
                                            n_estimators=ntrees)
                bodt.train(training_X, training_y)
                losses[2, i, j] = bodt.test(testing_X, testing_y)
            # RF
            if 'RF' in models:
                rf = RandomForest(max_depth=maxdep, n_estimators=ntrees)
                rf.train(training_X, training_y)
                losses[3, i, j] = rf.test(testing_X, testing_y)
            # SVM
            if 'SVM' in models:
                svm = SupportVectorMachine()
                svm.train(training_X, training_y)
                losses[4, i, j] = svm.test(testing_X, testing_y)

            # Library baselines (debug only).
            if debug:
                # Relabel class 0 as -1 in place.  This runs after all the
                # models above have been trained and scored, so they are
                # unaffected by the relabelling.
                training_y[training_y == 0] = -1
                testing_y[testing_y == 0] = -1
                # The feature matrices are transposed before being handed to
                # the library estimators.
                skdt = skDecisionTree(max_depth=maxdep, min_samples_split=10)
                skdt.fit(training_X.T, training_y)
                sklosses[0, i, j] = 1 - skdt.score(testing_X.T, testing_y)
                print("ZERO-ONE-LOSS-SKDT %.4f" % sklosses[0, i, j])
                skrf = skRandomForest(max_depth=maxdep,
                                      n_estimators=ntrees,
                                      min_samples_split=10)
                skrf.fit(training_X.T, training_y)
                sklosses[1, i, j] = 1 - skrf.score(testing_X.T, testing_y)
                print("ZERO-ONE-LOSS-SKRF %.4f" % sklosses[1, i, j])

    save(losses_file_name, losses)
    save('debug_' + losses_file_name, sklosses)
示例#26
0
    final_accuracy = (total_accuracy / total_elements)

    return final_accuracy


if __name__ == '__main__':
    data, filename = process_file()
    print("This is a decision tree classifier program")
    print("\nThe dataset you've chosen is {}".format(filename))
    print("\nIf you wanted to experiment with a different dataset,")
    print("please quit this program and enter:")
    print("`python driver.py data_file.txt`")
    print("You can choose from any files under the 'dataset/' directory.")

    t = DecisionTree(data)
    t.build_tree()

    print("\n\n...Successfully built a classifer based on the datset")
    print("\nIf you want to print the tree, hit 1. Otherwise, hit `enter`:\n")
    see_tree = input()
    if see_tree == '1':
        print("Classification Tree from {} data".format(filename))
        print("=============================================================")
        t.print_tree()
        print("=============================================================")

    print(
        "You can randomly choose a data point, and this program can classify")
    print(
        "that data point for you. Hit 1 if you want to test a random example")