def train_helper(node, depth):
    """Recursively grow an ID3 decision tree rooted at ``node``.

    Splits ``node`` on the attribute with the highest information gain,
    partitions its examples among one child per attribute value, removes
    the used attribute from each child's rows, and recurses.

    Args:
        node: ID3Node holding ``examples`` (rows), ``targets`` (one label
            per row), and a ``children`` dict to populate; its
            ``attribute`` field receives the chosen column index.
        depth: maximum remaining tree depth.
    """
    # Base cases: depth exhausted, node is pure (one distinct target),
    # or no attributes left to split on.
    if depth == 0 or len(set(node.targets)) == 1 or len(node.examples[0]) == 0:
        return

    # Pick the attribute with the highest information gain. Compute each
    # gain exactly once (the original recomputed ID3.gain up to three
    # times per candidate attribute). Ties go to the lowest index, same
    # as the original strict-greater comparison.
    gains = [
        ID3.gain(node.targets, DataHandler.column(node.examples, i))
        for i in range(len(node.examples[0]))
    ]
    max_idx = max(range(len(gains)), key=gains.__getitem__)
    node.attribute = max_idx

    # One child per distinct value of the chosen attribute.
    for attr_val in set(DataHandler.column(node.examples, max_idx)):
        node.children[attr_val] = ID3Node(parent=node)

    # Partition rows (and their targets) among the children; rows are
    # copied so children never alias the parent's data.
    for row, target in zip(node.examples, node.targets):
        child = node.children[row[max_idx]]
        child.examples.append(row.copy())
        child.targets.append(target)

    # Drop the consumed attribute from each child's rows and recurse.
    for child in node.children.values():
        DataHandler.rm_column(child.examples, node.attribute)
        ID3.train_helper(child, depth - 1)
def k_fold_cross_val(self, k=5):
    """Estimate model accuracy via k-fold cross-validation.

    The dataset is shuffled, split into ``k`` folds, and the model is
    trained ``k`` times, each time holding one fold out as the test set.
    The model's original examples/targets are restored (and it is
    retrained on them) before returning, so cross-validation does not
    leave the model corrupted by the last fold — the original code
    deep-copied the model but then mutated ``self.model`` anyway.

    Args:
        k: number of folds (default 5).

    Returns:
        The mean classification accuracy over the k folds.
    """
    # Snapshot the model's data; all fold construction works on copies.
    # (The original deep-copied the whole model and then deep-copied its
    # examples/targets again — the model copy was never used.)
    original_examples = copy.deepcopy(self.model.examples)
    original_targets = copy.deepcopy(self.model.targets)

    # Append each row's target as its last column, then shuffle rows so
    # example/target pairs stay aligned through the shuffle.
    dataset = [row + [target]
               for row, target in zip(original_examples, original_targets)]
    DataHandler.shuffle_dataset(dataset)

    # Split into k contiguous folds (the last may be shorter).
    fold_len = math.ceil(len(dataset) / k)
    folds = [dataset[i * fold_len:(i + 1) * fold_len] for i in range(k)]

    accuracies = []
    for i in range(k):
        # Fold i is the test set; the other folds form the training set.
        # Deep-copy both because rm_column mutates rows in place.
        training = copy.deepcopy(
            [row for j in range(k) if j != i for row in folds[j]])
        test = copy.deepcopy(folds[i])

        # Separate the target column back out of each split.
        training_targets = DataHandler.column(training, -1)
        test_targets = DataHandler.column(test, -1)
        DataHandler.rm_column(training, -1)
        DataHandler.rm_column(test, -1)

        # Train on the training split, then classify the held-out fold.
        self.model.targets = training_targets
        self.model.examples = training
        self.model.train()
        classifications = self.classify_set(test)

        accuracies.append(
            ModelEvaluator.compute_accuracy(classifications, test_targets))

    # Restore the model's original data and retrain so the caller gets
    # the model back in a consistent, full-dataset state instead of one
    # trained on the final fold's training split.
    self.model.examples = original_examples
    self.model.targets = original_targets
    self.model.train()

    return sum(accuracies) / len(accuracies)