Пример #1
0
    def train_helper(node, depth):
        """Recursively grow the decision subtree rooted at ``node``.

        The node is split on the attribute (column index into
        ``node.examples``) with the highest ``ID3.gain``; one ``ID3Node``
        child is created per distinct value of that attribute, and each child
        receives the matching rows (copied) and targets. The chosen column is
        then removed from every child's examples before recursing, so
        ``node.attribute`` is an index relative to *this* node's examples,
        not to the original dataset.

        Recursion stops (leaving ``node`` as a leaf) when any of these holds:

        * ``depth`` has reached 0,
        * ``node`` has no examples,
        * all of ``node.targets`` are the same value,
        * the examples have no attributes left to split on.

        Args:
            node: The node to expand. Reads ``node.examples`` and
                ``node.targets``; writes ``node.attribute`` and
                ``node.children``.
            depth: Remaining number of levels that may be added below
                ``node``. A negative value never reaches 0 and therefore
                imposes no depth limit.
        """
        # `not node.examples` guards the `node.examples[0]` access below,
        # which would otherwise raise IndexError on an empty node.
        if (depth == 0 or not node.examples
                or len(set(node.targets)) == 1
                or len(node.examples[0]) == 0):
            return

        # Identify the best attribute for the current node. Each column's
        # gain is computed exactly once; ties keep the lowest index because
        # only a strictly greater gain replaces the current best.
        best_idx, best_gain = 0, None
        for i in range(len(node.examples[0])):
            gain = ID3.gain(node.targets,
                            DataHandler.column(node.examples, i))
            if best_gain is None or gain > best_gain:
                best_idx, best_gain = i, gain
        node.attribute = best_idx

        # Create one child per distinct value of the chosen attribute.
        for attr_val in set(DataHandler.column(node.examples, best_idx)):
            node.children[attr_val] = ID3Node(parent=node)

        # Route every example (as a copy, so the parent's rows stay intact)
        # and its target to the child matching its attribute value.
        for idx, row in enumerate(node.examples):
            child = node.children[row[best_idx]]
            child.examples.append(row.copy())
            child.targets.append(node.targets[idx])

        # Drop the used attribute from each child's rows, then recurse.
        # NOTE(review): relies on DataHandler.rm_column mutating its
        # argument, since the return value is not used -- confirm.
        for child in node.children.values():
            DataHandler.rm_column(child.examples, node.attribute)
            ID3.train_helper(child, depth - 1)
Пример #2
0
    def k_fold_cross_val(self, k=5):
        """Estimate the model's accuracy with k-fold cross-validation.

        The model's examples and targets are copied, joined into rows with
        the target as the last element, shuffled with
        ``DataHandler.shuffle_dataset`` and split into ``k`` folds whose
        sizes differ by at most one. Each fold serves once as the test set
        while the model is trained on the remaining folds.

        ``self.model.examples`` and ``self.model.targets`` are replaced
        temporarily for each fold and restored to their original objects
        before returning, even if training or classification raises. The
        model is still left in whatever trained state the last
        ``self.model.train()`` call produced; call ``train()`` again to
        refit on the full dataset.

        Args:
            k: Number of folds; must be at least 1. If ``k`` exceeds the
                number of examples, some folds will be empty.

        Returns:
            The mean of the per-fold values returned by
            ``ModelEvaluator.compute_accuracy``.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be a positive integer, got %r" % (k,))

        # Keep references to the model's own data so it can be restored.
        original_examples = self.model.examples
        original_targets = self.model.targets

        # Build the dataset from copies so the model's rows are never
        # mutated: each row is an example with its target appended last.
        examples = copy.deepcopy(original_examples)
        targets = copy.deepcopy(original_targets)
        dataset = []
        for i, row in enumerate(examples):
            row.append(targets[i])
            dataset.append(row)
        DataHandler.shuffle_dataset(dataset)

        # Divide the dataset into k balanced folds: the first `extra` folds
        # get one additional row. (Sizing every fold as ceil(n / k) can
        # leave trailing folds empty, e.g. n=11, k=5 -> 3, 3, 3, 2, 0.)
        base_len, extra = divmod(len(dataset), k)
        folds = []
        start = 0
        for i in range(k):
            stop = start + base_len + (1 if i < extra else 0)
            folds.append(dataset[start:stop])
            start = stop

        accuracy = []
        try:
            # Allow each fold to be the cross-validation set once.
            for i, test_fold in enumerate(folds):
                training_rows = []
                for j, fold in enumerate(folds):
                    if j != i:
                        training_rows += fold

                # Read the targets off the last column, then strip that
                # column from copies so `folds` stays intact for later
                # iterations.
                training_targets = DataHandler.column(training_rows, -1)
                test_targets = DataHandler.column(test_fold, -1)
                training = copy.deepcopy(training_rows)
                test = copy.deepcopy(test_fold)
                DataHandler.rm_column(training, -1)
                DataHandler.rm_column(test, -1)

                # Train the model on training set i.
                self.model.targets = training_targets
                self.model.examples = training
                self.model.train()

                # Test the model on test set i and record its accuracy.
                classifications = self.classify_set(test)
                accuracy.append(
                    ModelEvaluator.compute_accuracy(classifications,
                                                    test_targets))
        finally:
            # Do not leave the model holding only the last fold's split.
            self.model.examples = original_examples
            self.model.targets = original_targets

        return sum(accuracy) / len(accuracy)