Example #2
def tune_parameters(M, N, F, dataset):
    print("M =", M, "N =", N, "F =", F)
    adjusted_dataset = select_random_attributes(F, dataset.data)
    for i in range(5):
        X, y = split_x_y_train(adjusted_dataset)
        x_train, x_test, y_train, y_test = myevaluation.train_test_split(
            X, y, shuffle=True)

        # re-join each training row with its label, since the forest's
        # fit() expects combined instances with the class in the last column
        remainder = []
        for j in range(len(x_train)):
            row = x_train[j]
            row.append(y_train[j])
            remainder.append(row)
        myRF = MyRandomForestClassifier()
        myRF.fit(remainder, M, N)
        y_predict_rf = myRF.predict(x_test)
        count = 0
        for k in range(len(y_predict_rf)):
            binned_predict = get_useful_bin(y_predict_rf[k])
            binned_test = get_useful_bin(y_test[k])
            if binned_predict == binned_test:
                count += 1

        accuracy = count / len(y_predict_rf)
        error = (len(y_predict_rf) - count) / len(y_predict_rf)
        print(i, "-- accuracy =", accuracy, "error =", error)
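
# split_x_y_train and get_useful_bin are not shown on this page. A minimal
# sketch of the assumed behavior (class label in the last column; the binning
# rule below is purely illustrative, the real cutoffs live in myutils):
def split_x_y_train(table):
    X = [row[:-1] for row in table]  # all columns except the label
    y = [row[-1] for row in table]   # the label column
    return X, y

def get_useful_bin(value):
    # collapse a numeric prediction into a coarse bin so near-misses
    # still compare equal (hypothetical cutoff)
    return round(float(value))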
Example #3
    def fit(self, X_train, y_train):
        """Fits a decision tree classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            Since TDIDT is an eager learning algorithm, this method eagerly builds
                all N decision trees from the training data and keeps the best M.
            Each tree uses the nested list representation described in class.
            The surviving trees are stored in the trees attribute.
            Attribute indexes are used to construct default attribute names (e.g. "att0", "att1", ...).
        """
        header = ['att' + str(i) for i in range(len(X_train[0]))]
        attribute_domains = {}
        for i, val in enumerate(header):
            attribute_domains[val] = myutils.unique_index(X_train, i)

        self.X_train = X_train
        self.y_train = y_train
        sample_X_train, sample_x_test, sample_y_train, sample_y_test = myevaluation.train_test_split(
            X_train, y_train, test_size=0.33, shuffle=True)
        train = [
            sample_X_train[i] + [sample_y_train[i]]
            for i in range(len(sample_X_train))
        ]

        for _ in range(self.N):
            available_attributes = header.copy()
            self.trees.append(
                myutils.tdidt_forest(
                    myutils.compute_bootstrapped_sample(train),
                    available_attributes, attribute_domains, header, self.F))

        # score each tree on the held-out sample
        header = ['att' + str(i) for i in range(len(sample_x_test[0]))]
        accuracies = []
        for tree in self.trees:
            prediction = []
            for row in sample_x_test:
                prediction.append(myutils.tdidt_predict(header, tree, row))
            accuracy = 0
            for i in range(len(prediction)):
                if prediction[i] == sample_y_test[i]:
                    accuracy += 1
            accuracy /= len(sample_y_test)
            accuracies.append([accuracy])
        # find m most accurate
        m_trees = []
        for i in range(len(accuracies)):
            accuracies[i].append(i)
        accuracies = sorted(accuracies)
        for i in range(self.M):
            m_trees.append(self.trees[accuracies[-(i + 1)][1]])
        self.trees = m_trees
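
# myutils.compute_bootstrapped_sample (used above) is assumed to draw
# len(table) rows with replacement. Each row then lands in the sample with
# probability 1 - (1 - 1/n)^n ~= 63.2%, which is where the ~63%/~37%
# train/validation split mentioned in these examples comes from. A sketch:
import random

def compute_bootstrapped_sample(table):
    n = len(table)
    # sample n row indices uniformly, with replacement
    return [table[random.randrange(0, n)] for _ in range(n)]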
Example #4
def bagging(X, Y, N, M, F):
    # 1. split your dataset into a test set and a "remainder set"
    x_remainder, x_test, y_r, y_test = myevaluation.train_test_split(X, Y)
    # 2. using the remainder set, draw N bootstrap samples and use each one to build a classifier
    #    for each of the N samples:
    #        ~63% of the remainder set is expected to land in the training set, since each
    #        instance is drawn with probability 1 - (1 - 1/n)^n ~= 0.632
    #        the ~37% left out becomes this tree's validation set
    forest = []
    accuracies = {}  # maps tree index (as a string) -> validation accuracy
    for i in range(N):
        x_train, y_train = compute_bootstrapped_sample(
            x_remainder, y_r)  #get the bootstrap sample
        tree = my_class.MyDecisionTreeClassifier()
        tree.fit(x_train, y_train, True, F)  #build classifier
        # out-of-bag rows (those not drawn into x_train) become this tree's validation set
        x_v = []
        y_v = []
        for j in range(len(x_remainder)):
            if x_remainder[j] not in x_train:
                x_v.append(x_remainder[j])
                y_v.append(y_r[j])
        pred = tree.predict(x_v)
        accuracy = get_accuracy(y_v, pred)
        accuracies[str(i)] = accuracy
        forest.append(tree)

# 3. measure the performance of the tree on the validation set and select the best M of N
#   trees based on the performance metrics
    best_trees_dict = best_M(M, accuracies)
    best_trees = []
    for key in best_trees_dict:
        best_trees.append(forest[int(key)])
# 4. using majority voting, make predictions from the M learners for each instance in the test set
    all_predictions = []  # one row of predictions per tree
    for tree in best_trees:
        pred = tree.predict(x_test)
        all_predictions.append(pred)
    # wrap the rows in a MyPyTable so each column lines up with one test instance
    pred_header = build_header(all_predictions)
    pred_mypy = MyPyTable(pred_header, all_predictions)
    voted_predictions = []
    # for each test instance, gather that column of predictions and take the majority
    for i in range(len(all_predictions[0])):
        pred_col = pred_mypy.get_column(i)
        vals, counts = get_freq_str(pred_col)
        j = counts.index(max(counts))
        y_predict = vals[j]
        voted_predictions.append(y_predict)

    forest_accuracy = get_accuracy(y_test, voted_predictions)
    return best_trees, voted_predictions, forest_accuracy
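
# best_M is not shown on this page. A minimal sketch, assuming it keeps the
# M entries of the {index: accuracy} dict with the highest accuracies:
def best_M(M, accuracies):
    ranked = sorted(accuracies.items(), key=lambda kv: kv[1], reverse=True)
    return dict(ranked[:M])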
def test_random_forest_fit():
    # interview dataset
    table = [["Senior", "Java", "no", "no", "False"],
             ["Senior", "Java", "no", "yes", "False"],
             ["Mid", "Python", "no", "no", "True"],
             ["Junior", "Python", "no", "no", "True"],
             ["Junior", "R", "yes", "no", "True"],
             ["Junior", "R", "yes", "yes", "False"],
             ["Mid", "R", "yes", "yes", "True"],
             ["Senior", "Python", "no", "no", "False"],
             ["Senior", "R", "yes", "no", "True"],
             ["Junior", "Python", "yes", "no", "True"],
             ["Senior", "Python", "yes", "yes", "True"],
             ["Mid", "Python", "no", "yes", "True"],
             ["Mid", "Java", "yes", "no", "True"],
             ["Junior", "Python", "no", "yes", "False"]]

    X, y = myutils.split_x_y_train(table)
    x_train, x_test, y_train, y_test = myevaluation.train_test_split(
        X, y, math.floor(len(table) * 0.33), shuffle=True)
    # re-join each training row with its label for the forest's fit()
    remainder = []
    for i in range(len(x_train)):
        row = x_train[i]
        row.append(y_train[i])
        remainder.append(row)

    myRF = MyRandomForestClassifier()
    myRF.fit(remainder, 10, 100)  # M=10 best trees kept out of N=100

    y_predicted = myRF.predict(x_test)

    assert len(y_predicted) == len(y_test)

    count = 0
    for i in range(len(y_predicted)):
        if y_predicted[i] == y_test[i]:
            count += 1

    assert count != 0  # sanity check only: at least one prediction is correct
Example #6
import pickle

from mysklearn.myclassifiers import MyDecisionTreeClassifier
from mysklearn.mypytable import MyPyTable
import mysklearn.myutils as myutils
import mysklearn.myevaluation as myevaluation

stars_table = myutils.load_data("Stars.csv")
temperature = myutils.temp_bins(stars_table.get_column('Temperature'))
L = myutils.luminosity_bins(stars_table.get_column('L'))
R = myutils.get_radius(stars_table.get_column('R'))
a_m = myutils.get_magnitude(stars_table.get_column('A_M'))
color = myutils.categorize_colors(stars_table.get_column('Color'))
spectral_class = myutils.get_spectral_class(stars_table.get_column('Spectral_Class'))
star_type = stars_table.get_column('Type')

x_vals = [[temperature[i], str(L[i]), str(R[i]), str(a_m[i]), color[i], spectral_class[i]] for i in range(len(stars_table.data))]
y_vals = star_type

xtr, xts, ytr, yts = myevaluation.train_test_split(x_vals, y_vals)

my_tree = MyDecisionTreeClassifier()
my_tree.fit(xtr, ytr)

predicted = my_tree.predict(xts)
accuracy = myutils.compute_accuracy(predicted, yts)
print('My Decision Tree: Accuracy =', round(accuracy * 100, 3), 'Error Rate =', round((1 - accuracy) * 100, 3))

# pickle the fitted tree for later reuse
with open("decision_tree.p", "wb") as fout:
    pkl_obj = my_tree.tree
    pickle.dump(pkl_obj, fout)
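
# reloading the pickled tree later (sketch): only the nested-list tree was
# dumped, so it is reattached to a fresh classifier by hand
with open("decision_tree.p", "rb") as fin:
    reloaded_tree = pickle.load(fin)
reloaded = MyDecisionTreeClassifier()
reloaded.tree = reloaded_tree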
Example #7
    def fit(self, X_train, y_train, N, M, F):
        ''' Fits a random forest to the training data

        Args:
            X_train: the instances to train the random forest on
            y_train: the target values (parallel to X_train)
            N: number of trees to generate
            M: number of "best" trees to keep
            F: number of attributes to randomly choose candidates from at each split
        '''

        self.N = N 
        self.M = M 
        self.F = F 

        xRemainder, xTest, yRemainder, yTest = myeval.train_test_split(X_train, y_train) # split into remainder and test sets

        remainderSet = []
        testSet = []

        # piece together test and remainder sets
        for i in range(len(xTest)):
            testSet.append(xTest[i] + [yTest[i]])
        for i in range(len(xRemainder)):
            remainderSet.append(xRemainder[i] + [yRemainder[i]] + [i]) # add index at end for uniqueness

        allTrees = []
        allAccuracies = []
    
        for j in range(N):
            
            copySet = copy.deepcopy(remainderSet)
            validationSet = []
            bootstrapSample = myutils.computeBootstrappedSample(copySet) # bootstrap training sample

            # determine the validation set
            for i in range(len(remainderSet)):
                if copySet[i] not in bootstrapSample:
                    validationSet.append(copySet[i])
            
            # peel the index marker and the label off each validation row
            yValidation = []
            for i in range(len(validationSet)):
                validationSet[i].pop()                      # drop the uniqueness index
                yValidation.append(validationSet[i].pop())  # drop and keep the label

            # rows are [features..., label, index]: keep the label, strip the last two
            yTrain = []
            for i in range(len(bootstrapSample)):
                yTrain.append(bootstrapSample[i][-2])
                bootstrapSample[i] = bootstrapSample[i][:-2]

            decisionTree = MyDecisionTreeClassifier()
            decisionTree.fit(bootstrapSample, yTrain, F)
            predictions = decisionTree.predict(validationSet)

            currAccuracy = myutils.determineAccuracy(predictions, yValidation)
            allAccuracies.append(currAccuracy)
            allTrees.append(decisionTree)

        bestMTrees = []
        for k in range(M):
            # pop at the same index from both lists to keep them in sync
            index = allAccuracies.index(max(allAccuracies))
            allAccuracies.pop(index)
            bestMTrees.append(allTrees.pop(index))

        self.bestM = bestMTrees

        return testSet
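
    # predict() is not shown on this page. A minimal majority-vote sketch over
    # the stored best-M trees, assuming each tree exposes predict():
    def predict(self, X_test):
        from collections import Counter
        voted = []
        for row in X_test:
            votes = [tree.predict([row])[0] for tree in self.bestM]
            voted.append(Counter(votes).most_common(1)[0][0])
        return voted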
Example #8
    def fit(self, X_train, y_train, M=7, N=20, F=2):
        """Fits a random forest classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples
        """
        self.X_train = copy.deepcopy(X_train)
        self.y_train = copy.deepcopy(y_train)

        # create random stratified test set with 2:1 ratio
        X_remainder, X_test, y_remainder, y_test = myevaluation.train_test_split(
            copy.deepcopy(X_train), copy.deepcopy(y_train))

        for i, x in enumerate(y_remainder):
            X_remainder[i].append(x)
        for i, x in enumerate(y_test):
            X_test[i].append(x)
        # generate N random decision trees using bagging
        trees = []
        for i in range(N):
            # draw a bootstrap sample; rows left out become the validation set
            sample = myutils.compute_bootstrapped_sample(X_remainder)
            validation_set = []
            for x in X_remainder:
                if x not in sample:
                    validation_set.append(x)
            # print("length of sample and validation sets:", len(sample), len(validation_set))
            # print("getting the tree...")
            # get the tree from the sample
            available_attributes = myutils.get_available_attributes(sample)
            tree = myutils.tdidt_random_forest(
                sample, [x for x in range(0,
                                          len(sample[0]) - 1)],
                available_attributes, F)

            # print("testing the tree")
            # test against the validation set
            validation_set_x = [x[:-1] for x in validation_set]
            validation_set_y = [x[-1] for x in validation_set]
            predictions = []
            header = []
            for j in range(len(validation_set_x[0])):
                header.append("att" + str(j))
            for x, y in zip(validation_set_x, validation_set_y):
                prediction = myutils.tdidt_predict(header, tree, x)
                predictions.append(int(prediction == y))

            # print("accuracy:", sum(predictions)/len(predictions))
            trees.append({
                "accuracy": sum(predictions) / len(predictions),
                "tree": tree
            })

        # print("getting the best M trees")
        # get the best M of N trees
        trees = sorted(trees, key=lambda k: k["accuracy"], reverse=True)
        self.trees = trees[:M]
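
    # predict() is not shown here either. A minimal majority-vote sketch over
    # the stored {"accuracy", "tree"} entries, reusing myutils.tdidt_predict:
    def predict(self, X_test):
        from collections import Counter
        header = ["att" + str(i) for i in range(len(X_test[0]))]
        y_predicted = []
        for row in X_test:
            votes = [myutils.tdidt_predict(header, entry["tree"], row)
                     for entry in self.trees]
            y_predicted.append(Counter(votes).most_common(1)[0][0])
        return y_predicted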
Example #9
def test_decision_tree_classifier_fit():
    # assumes module-level fixtures: interviewData / interviewClasses (the
    # interview dataset) and interviewTest, a forest classifier whose
    # constructor was given the best-M size M
    X_tr, X_t, y_tr, y_t = myevaluation.train_test_split(
        interviewData, interviewClasses)
    interviewTest.fit(X_tr, y_tr)
    assert len(interviewTest.best_M_trees) == M