示例#1
0
    def main(self):
        """Dispatch to the evaluation strategy named by ``self.eval_method``.

        Supported methods:
            * "training" - train and report accuracy on the full dataset.
            * "random"   - random train/test split; ``eval_parameter`` is the
                           fraction of instances used for training.
            * "static"   - train on the full dataset, test on a separate ARFF
                           file whose path is ``eval_parameter``.
            * "cross"    - k-fold cross-validation; ``eval_parameter`` is the
                           number of folds (confusion matrix not supported).

        Raises:
            Exception: if ``self.eval_method`` is not one of the above.
        """
        if self.eval_method == "training":
            self.train(self.arff.get_features(), self.arff.get_labels())
            self._print_confusion_matrix(self.arff.get_features(),
                                         self.arff.get_labels())
        elif self.eval_method == "random":
            train_features, train_labels, test_features, test_labels = self.training_test_split(
                train_percent=self.eval_parameter)
            self.train(train_features, train_labels)
            self.test(test_features, test_labels)
            self._print_confusion_matrix(test_features, test_labels)

        elif self.eval_method == "static":
            self.train(self.arff.get_features(), self.arff.get_labels())
            arff_file = self.eval_parameter
            test_data = Arff(arff_file)
            if self.normalize:
                test_data.normalize()
            self.test(features=test_data.get_features(),
                      labels=test_data.get_labels())
            self._print_confusion_matrix(features=test_data.get_features(),
                                         labels=test_data.get_labels())

        elif self.eval_method == "cross":
            # Coerce to int: the parameter may arrive as a string (e.g. CLI).
            self.eval_parameter = int(self.eval_parameter)
            # Confusion matrix is not supported for cross-validation.
            self.cross_validate(self.eval_parameter)
        else:
            raise Exception("Unrecognized evaluation method '{}'".format(
                self.eval_method))
示例#2
0
class ToolkitSession:
    """Toolkit session pairing a learner with an associated ARFF dataset.

    Notes:
        * A learner class can be passed without instantiation. It will be
          created when the session is started.
            * Learner keyword arguments can be passed to the session.
            * A learner class can also already be instantiated when passed.
    """

    def __init__(self,
                 arff,
                 learner,
                 eval_method=None,
                 eval_parameter=None,
                 print_confusion_matrix=False,
                 normalize=False,
                 random_seed=None,
                 label_count=1,
                 **kwargs):
        """
        Args:
            arff: Can be arff path, numpy array, or arff object
            learner: Learner type or instantiated learner type
            eval_method: training, static (separate test set), random (random test split), cross (cross-validate)
            eval_parameter: "random" - % used for training; static - test set; cross - # of folds;
            print_confusion_matrix (bool): True will print the confusion matrix (only makes sense for classification)
            normalize: Normalize training/test data
            random_seed: Set random seed for deterministic shuffling
            label_count: Number of label columns in the ARFF data
            **kwargs: Forwarded to the learner constructor when ``learner`` is a class
        """
        # `is not None` (not truthiness) so that a seed of 0 still seeds.
        if random_seed is not None:
            random.seed(random_seed)
            np.random.seed(random_seed)

        # Instantiate the learner if a class (rather than an instance) was given.
        if inspect.isclass(learner):
            self.learner = learner(**kwargs)
            self.learner_name = learner.__name__
        else:
            self.learner = learner
            self.learner_name = type(learner).__name__

        self.print_confusion_matrix_flag = print_confusion_matrix
        self.eval_method = eval_method
        self.eval_parameter = eval_parameter
        self.normalize = normalize

        # Accuracy histories: one entry per train()/test() call (or CV fold).
        self.training_accuracy = []
        self.test_accuracy = []

        # Load the ARFF file; reduce `arff` to its name for the report below.
        self.arff = Arff(arff, label_count=label_count)
        if isinstance(arff, Arff):
            arff = arff.dataset_name

        if self.normalize:
            print("Using normalized data")
            self.arff.normalize()

        # Print some dataset/session statistics.
        print("\nDataset name: {}\n"
              "Number of instances: {}\n"
              "Number of attributes: {}\n"
              "Learning algorithm: {}\n"
              "Evaluation method: {}\n".format(arff, self.arff.shape[0],
                                               self.arff.shape[1],
                                               self.learner_name,
                                               self.eval_method))

        # Run immediately when an evaluation method was requested.
        if eval_method is not None:
            self.main()

    def main(self):
        """Dispatch to the evaluation strategy named by ``self.eval_method``.

        Supported methods: "training", "random", "static", "cross"
        (see ``__init__`` docs for the meaning of ``eval_parameter``).

        Raises:
            Exception: if ``self.eval_method`` is not one of the above.
        """
        if self.eval_method == "training":
            self.train(self.arff.get_features(), self.arff.get_labels())
            self._print_confusion_matrix(self.arff.get_features(),
                                         self.arff.get_labels())
        elif self.eval_method == "random":
            train_features, train_labels, test_features, test_labels = self.training_test_split(
                train_percent=self.eval_parameter)
            self.train(train_features, train_labels)
            self.test(test_features, test_labels)
            self._print_confusion_matrix(test_features, test_labels)

        elif self.eval_method == "static":
            self.train(self.arff.get_features(), self.arff.get_labels())
            arff_file = self.eval_parameter
            test_data = Arff(arff_file)
            if self.normalize:
                test_data.normalize()
            self.test(features=test_data.get_features(),
                      labels=test_data.get_labels())
            self._print_confusion_matrix(features=test_data.get_features(),
                                         labels=test_data.get_labels())

        elif self.eval_method == "cross":
            # Coerce to int: the parameter may arrive as a string (e.g. CLI).
            self.eval_parameter = int(self.eval_parameter)
            # Confusion matrix is not supported for cross-validation.
            self.cross_validate(self.eval_parameter)
        else:
            raise Exception("Unrecognized evaluation method '{}'".format(
                self.eval_method))

    def training_test_split(self, train_percent=.9):
        """Shuffle the dataset and split it into train/test portions.

        Args:
            train_percent: Fraction (0..1) of instances used for training.

        Returns:
            Tuple: train_features, train_labels, test_features, test_labels

        Raises:
            Exception: if ``train_percent`` is outside [0, 1].
        """
        self.arff.shuffle()

        print("Calculating accuracy on a random hold-out set...")
        train_percent = float(train_percent)
        if train_percent < 0 or train_percent > 1:
            raise Exception(
                "Percentage for random evaluation must be between 0 and 1")
        print("Percentage used for training: {}".format(rnd4(train_percent)))
        print("Percentage used for testing: {}".format(rnd4(1 -
                                                            train_percent)))

        train_size = int(train_percent * self.arff.shape[0])

        train_features = self.arff.get_features(slice(0, train_size))
        train_labels = self.arff.get_labels(slice(0, train_size))

        test_features = self.arff.get_features(slice(train_size, None))
        test_labels = self.arff.get_labels(slice(train_size, None))

        return train_features, train_labels, test_features, test_labels

    def train(self, features=None, labels=None):
        """Train the learner and report training-set accuracy.

        By default this trains on the entire arff file. Features and labels
        options are given to e.g. train on only a part of the data.

        Args:
            features (array-like): defaults to the full dataset's features
            labels (array-like): defaults to the full dataset's labels
        """
        print("Calculating accuracy on training set...")

        if features is None:
            features = self.arff.get_features()
        if labels is None:
            labels = self.arff.get_labels()

        start_time = time.time()
        self.learner.train(features, labels)
        elapsed_time = time.time() - start_time
        print("Time to train (in seconds): {}".format(rnd4(elapsed_time)))
        accuracy = self.learner.measure_accuracy(features, labels)
        self.training_accuracy.append(accuracy)
        print("Training set accuracy: {}".format(rnd4(accuracy)))

    def test(self, features, labels):
        """Measure and report accuracy of the (already trained) learner on
        the given test set, appending the result to ``self.test_accuracy``.
        """
        test_accuracy = self.learner.measure_accuracy(features, labels)
        self.test_accuracy.append(test_accuracy)
        print("Test set accuracy: {}".format(rnd4(test_accuracy)))

    def _print_confusion_matrix(self, features, labels):
        # No-op unless the session was created with print_confusion_matrix=True.
        if self.print_confusion_matrix_flag:
            cm = self.learner.get_confusion_matrix(features, labels)
            print(cm)

    def generate_fold(self, folds):
        """Yield (train_features, train_labels, test_features, test_labels)
        for each of ``folds`` contiguous test slices of the dataset.
        """
        for i in range(folds):
            start_test = int(i * self.arff.shape[0] / folds)
            end_test = int((i + 1) * self.arff.shape[0] / folds)

            # Training rows are everything outside the [start_test, end_test) slice.
            train_features = self.arff.get_features(
                row_idx=np.r_[0:start_test, end_test:self.arff.shape[0]])
            train_labels = self.arff.get_labels(
                row_idx=np.r_[0:start_test, end_test:self.arff.shape[0]])

            test_features = self.arff.get_features(slice(start_test, end_test))
            test_labels = self.arff.get_labels(slice(start_test, end_test))
            yield train_features, train_labels, test_features, test_labels

    def cross_validate(self, folds, reps=1):
        """Run ``reps`` repetitions of ``folds``-fold cross-validation,
        reshuffling before each repetition, and report per-fold and mean
        accuracy. Fold accuracies are appended to the session's accuracy
        histories.

        Args:
            folds: number of folds per repetition (must be > 0)
            reps: number of shuffled repetitions (previously ignored; now honored)

        Raises:
            Exception: if ``folds`` is not greater than 0.
        """
        print("Calculating accuracy using cross-validation...")

        if folds <= 0:
            raise Exception("Number of folds must be greater than 0")
        print("Number of folds: {}".format(folds))
        sum_accuracy = 0.0
        elapsed_time = 0.0

        for rep_counter in range(reps):
            self.arff.shuffle()

            for fold_counter, [
                    train_features, train_labels, test_features, test_labels
            ] in enumerate(self.generate_fold(folds)):
                start_time = time.time()

                # Train model
                self.learner.train(train_features, train_labels)
                elapsed_time += time.time() - start_time
                training_accuracy = self.learner.measure_accuracy(
                    train_features, train_labels)

                # Get test accuracy
                test_accuracy = self.learner.measure_accuracy(
                    test_features, test_labels)
                sum_accuracy += test_accuracy
                print("Rep={}, Fold={}, Accuracy={}".format(
                    rep_counter, fold_counter, rnd4(test_accuracy)))

                self.training_accuracy.append(training_accuracy)
                self.test_accuracy.append(test_accuracy)

        # Report averages over all reps * folds trainings (after all reps,
        # so the division happens exactly once).
        elapsed_time /= (reps * folds)
        print("Average time to train (in seconds): {}".format(
            rnd4(elapsed_time)))
        print("Mean accuracy={}".format(rnd4(sum_accuracy /
                                             (reps * folds))))