def test(self):
        """Train one tree per (method, depth) pair and record its error rates.

        Every entry of ``self.methods`` is paired with every entry of
        ``self.depths``. For each pair a tree is built from ``self.dfTrain``
        and then scored with ``self._applyAndError`` on ``self.dfTrain`` and
        on ``self.dfTest``. Progress messages are printed along the way.

        Returns
        -------
        :self.train_err: training errors, indexed [method index, depth index]
        :self.test_err: test errors, indexed [method index, depth index]
        """
        creating_msg = 'Creating DT with depth limit: {} and method: {}...'
        for row, split_method in enumerate(self.methods):
            for col, max_depth in enumerate(self.depths):
                # Rebind both names to None first so the objects from the
                # previous iteration are no longer referenced from this frame
                # while the next tree is being built.
                treeInit = dt = None

                # Build the tree for this (method, depth) combination.
                treeInit = decisionTree(
                    self.dfTrain,
                    depth=max_depth,
                    method=split_method,
                    numerical=self.numerical,
                    randTieBreak=self.tie,
                )
                print(creating_msg.format(max_depth, split_method))
                dt = run_ID3(treeInit)
                print('Tree complete')

                # Score the finished tree: training split first, then test.
                print('Applying the tree to train and test...')
                train_error = self._applyAndError(
                    dt, self.dfTrain, treeInit, numerical=self.numerical)
                self.train_err[row, col] = train_error
                test_error = self._applyAndError(
                    dt, self.dfTest, treeInit, numerical=self.numerical)
                self.test_err[row, col] = test_error
                print('Applying complete\n')

        print('Done\n')
        return self.train_err, self.test_err
Exemplo n.º 2
0
 def _bagging_loop(self):
     """Grow ``self.T`` trees, each fitted to a fresh bootstrap sample.

     For every round ``t`` the method:

     1. reports progress through ``self._bag_progress`` about every 10% of
        the rounds,
     2. draws a sample with ``self.draw_with_replacement()``,
     3. builds a ``decisionTree`` on it (``small_sub`` and ``globaldf`` are
        forwarded only when ``self.small_sub`` is truthy),
     4. appends the tree object to ``self.treesInit`` and runs ``run_ID3``
        on it (the return value of ``run_ID3`` is not used here),
     5. calls ``self._calc_vote`` with the tree and the round index.

     Prints ``'100% done.'`` at the end when ``self.verbose`` is truthy.
     Returns nothing.
     """
     # Progress interval in rounds. It is clamped to at least 1 because
     # np.round(self.T / 10) is 0.0 for T <= 5; `t % 0.0` then evaluates to
     # nan (with a RuntimeWarning) and progress would never be reported.
     step = max(1, int(np.round(self.T / 10)))

     for t in range(self.T):
         if t % step == 0:
             self._bag_progress(t)

         bootstrap = self.draw_with_replacement()

         # Arguments shared by both tree configurations.
         tree_kwargs = dict(numerical=self.numerical,
                            randForest=self.randForest,
                            Gsize=self.Gsize)
         if self.small_sub:
             # Only forwarded in small-subset mode; otherwise decisionTree
             # keeps its own defaults for these two parameters.
             tree_kwargs.update(small_sub=self.small_sub,
                                globaldf=self.globaldf)
         tree_init = decisionTree(bootstrap, **tree_kwargs)

         self.treesInit.append(tree_init)
         run_ID3(tree_init)
         self._calc_vote(tree_init, t, numerical=self.numerical)

     if self.verbose:
         print('100% done.\n')
 def _AdaLoop(self, D):
     """Run ``self.T`` boosting rounds starting from the weights ``D``.

     Each round builds a ``decisionTree`` on ``self.data`` with depth
     ``self.depth`` and the current weights, runs ``run_ID3`` on it, appends
     the tree object to ``self.learners_init``, obtains ``h_t`` from
     ``self._calc_vote`` and replaces the weights with the result of
     ``self._update_weights(D, t, h_t)``.

     Parameters
     ----------
     D : initial example weights, passed to ``decisionTree`` as ``weights``
         (presumably one weight per row of ``self.data`` -- confirm against
         the caller).

     Prints start/end messages and returns nothing.
     """
     print('Starting training...')

     # Progress interval in rounds. It is clamped to at least 1 because
     # np.round(self.T / 10) is 0.0 for T <= 5; `t % 0.0` then evaluates to
     # nan (with a RuntimeWarning) and progress would never be reported.
     step = max(1, int(np.round(self.T / 10)))

     for t in range(self.T):
         if t % step == 0:
             self._progress(t)

         stump_init = decisionTree(self.data,
                                   numerical=True,
                                   depth=self.depth,
                                   weights=D)
         run_ID3(stump_init)
         self.learners_init.append(stump_init)

         h_t = self._calc_vote(stump_init, t, D, numerical=True)
         # The weights for the next round replace the current ones.
         D = self._update_weights(D, t, h_t)

     print('Done training\n')
Exemplo n.º 4
0
import matplotlib.pyplot as plt
import time

# %% importing the data and splitting it up
# Column names come from car/data-desc.txt: after skipping its first 14 lines,
# pandas treats the next line as the header, and list(DataFrame) yields the
# column labels.
cols = list(pd.read_csv('car/data-desc.txt', skiprows=14))
train0 = pd.read_csv('car/train.csv', names=cols)
test0 = pd.read_csv('car/test.csv', names=cols)

# Every column except the last holds attributes; the last column is the label.
attrTrain0 = np.array(train0.iloc[:, :-1])
attrTest0 = np.array(test0.iloc[:, :-1])
attrNames0 = cols[:-1]
labelsTrain0 = np.array(train0.iloc[:, -1])
labelsTest0 = np.array(test0.iloc[:, -1])

# %% training the ID3 algo for testing
# Single tree on the training frame using the 'entropy' method.
carTreeInit = decisionTree(train0, method='entropy')
carTree = run_ID3(carTreeInit)

# %% applying the ID3 algo for testing
# The trained tree is applied to the test frame; apply_ID3 returns two values.
car_errinit = applyTree(carTree, test0, carTreeInit)
errs0, total_err0 = apply_ID3(car_errinit)

# %% making trees
# NOTE(review): this tic has no matching toc in this section, and tic is
# reassigned later in the script.
tic = time.perf_counter()
methods = ['entropy', 'ME', 'gini']
# NOTE(review): datTrain0 / datTest0 are not used in the visible part of the
# script.
datTrain0 = [attrTrain0, labelsTrain0, train0]
datTest0 = [attrTest0, labelsTest0, test0]
dfs = [train0, test0]
# Number of attribute columns, passed to tester as `depths`
# (presumably the maximum tree depth to try -- confirm against tester).
depths0 = len(attrNames0)

errinit = tester(methods, dfs, depths=depths0)

# %% importing the data
# Column names for the bank CSV files; this rebinds `cols` from the car
# section above. The last entry, 'y', is the final column.
cols = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
    'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'y'
]
train = pd.read_csv('bank/train.csv', names=cols)
test = pd.read_csv('bank/test.csv', names=cols)
# replace_unk receives copies, so `train` / `test` themselves are not passed
# to it. NOTE(review): presumably replaces 'unknown' values -- confirm; the
# *_no_unk frames are not used in the visible part of the script.
train_no_unk = replace_unk(train.copy())
test_no_unk = replace_unk(test.copy())

# %% training the ID3 algo for testing
# The timer covers tree construction plus the apply step below.
tic = time.perf_counter()
bankTreeInit = decisionTree(train, numerical=True)
bankTree = run_ID3(bankTreeInit)

# % applying the ID3 algo for testing
# The tree is applied to `train` here, i.e. to the same frame it was built on.
errinit = applyTree(bankTree, train, bankTreeInit, numerical=True)
errs, total_err = apply_ID3(errinit)
toc = time.perf_counter()
print('Time for bank code is {:0.4f} seconds.'.format(toc - tic))

# %% making trees
# NOTE(review): this tic has no matching toc in the visible part of the script.
tic = time.perf_counter()
methods = ['entropy', 'ME', 'gini']
# Number of columns minus the label column, passed to tester as `depths`.
depths = len(train.columns) - 1
dfs = [train, test]

# Rebinds `errinit`, which held the applyTree result above.
errinit = tester(methods, dfs, depths=depths, numerical=True)
# %%
def entropy(ps):
    """Return the Shannon entropy, in bits, of the probabilities in ``ps``.

    Parameters
    ----------
    ps : iterable of scalar probabilities.

    Returns
    -------
    ``-sum(p * log2(p))`` over the entries of ``ps``. Entries equal to zero
    contribute 0 (the limit of ``p * log2(p)`` as ``p -> 0``). An empty
    ``ps`` gives 0.
    """
    # Zero entries are skipped explicitly: 0 * np.log2(0) is 0 * -inf, which
    # evaluates to nan and would poison the whole sum.
    return -sum([p * np.log2(p) for p in ps if p != 0])


# %% Training Data
# making the training data with columns as x1, x2, x3, x4
# The literal lists one row per attribute (4 x 7); the transpose turns it into
# 7 examples x 4 attributes.
attributes = np.array([[0, 0, 0, 1, 0, 1, 0], [0, 1, 0, 0, 1, 1, 1],
                       [1, 0, 1, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 1]]).T
# Boolean labels
# One label per example; .T leaves a 1-D array unchanged.
y = np.array([0, 0, 1, 1, 0, 0, 0]).T
# Attribute names are just the column indices.
attrNames = [0, 1, 2, 3]

# %% run ID3 on 1a
# NOTE(review): decisionTree is called here with positional
# (attributes, names, labels), unlike the DataFrame-based calls elsewhere in
# this file -- confirm which signature the imported decisionTree provides.
init1 = decisionTree(attributes, attrNames, y, method='gini')
tree1 = run_ID3(init1)

# %% tennis data
# The literal lists one row per attribute (4 x 14), in the order given by
# attrNames2 below; the transpose turns it into 14 examples x 4 attributes.
# Values are single-letter category codes.
attributes2 = np.array(
    [['S', 'S', 'O', 'R', 'R', 'R', 'O', 'S', 'S', 'R', 'S', 'O', 'O', 'R'],
     ['H', 'H', 'H', 'M', 'C', 'C', 'C', 'M', 'C', 'M', 'M', 'M', 'H', 'M'],
     ['H', 'H', 'H', 'H', 'N', 'N', 'N', 'H', 'N', 'N', 'N', 'H', 'N', 'H'],
     ['W', 'S', 'W', 'W', 'W', 'S', 'S', 'W', 'W', 'W', 'S', 'S', 'W', 'S']]).T
# One 0/1 label per example; .T leaves a 1-D array unchanged.
y2 = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0]).T
attrNames2 = ['Outlook', 'Temp', 'Humidity', 'Wind']

# Build and run a tree on the tennis data using the 'ME' method.
init2 = decisionTree(attributes2, attrNames2, y2, method='ME')
tree2 = run_ID3(init2)

# %% ID3 on 3a