def fold(self, num_folds):
    r"""Partition the sample supplied at construction time into
    ``num_folds`` folds, stratifying on the property lists passed to the
    constructor so that each fold approximately preserves the proportions
    of every combination of property values found in the whole sample.

    :param num_folds: number of folds in the partition.

    :type num_folds: integer

    :returns: the sample partition.

    :rtype: list of lists

    AUTHORS:

    - Dario Malchiodi (2011-01-21)

    """

    # Possibly shuffle the sample according to the folder's settings.
    SampleFolder._check_and_shuffle(self)

    # For each stratification list, collect the distinct values it takes.
    value_sets = tuple(tuple(set(column)) \
        for column in self.stratification_data)

    # Every possible combination of one value per stratification list
    # identifies a stratum; combinations absent from the sample simply
    # yield empty strata below.
    combinations = cartesian_product(*value_sets)

    # Group the sample items whose stratification signature matches each
    # combination.
    strata = []
    for combination in combinations:
        members = []
        for pos in range(len(self.sample)):
            signature = tuple(column[pos] \
                for column in self.stratification_data)
            if signature == combination:
                members.append(self.sample[pos])
        strata.append(members)

    # Partition every stratum into num_folds chunks, then rebuild each
    # fold by flattening the corresponding chunk of every stratum.
    chunked_strata = [SampleFolder.partition(stratum, num_folds) \
        for stratum in strata]
    return [flatten([chunks[index] for chunks in chunked_strata]) \
        for index in range(num_folds)]
def cross_validation(learning_algorithm, sample, parameters_description, \
    **kwargs):
    r"""Select, through cross validation on a given sample, the best
    combination of named parameter values for a learning algorithm, and
    return a model trained on the whole sample with that combination.

    :param learning_algorithm: learning algorithm to be used.

    :type learning_algorithm: :class:`yaplf.algorithms.LearningAlgorithm`

    :param sample: sample to be cross validated.

    :type sample: list or tuple of :class:`yaplf.data.Example`

    :param parameters_description: candidate values for parameters of the
      learning algorithm.

    :type parameters_description: dictionary whose entries have as key a
      string describing a parameter's name and as value a list or tuple
      enclosing candidate values.

    :param num_folds: number of folds of the provided sample.

    :type num_folds: integer, default: 5

    :param verbose: flag triggering verbose output.

    :type verbose: boolean, default: ``False``

    :param fixed_parameters: parameters of the learning algorithm whose
      value does not change in the various cross validation steps.

    :type fixed_parameters: dictionary with parameters name as keys,
      default: {}

    :param error_measure: function to be used in order to average test
      errors on the various sample chunks (forwarded to
      :meth:`cross_validation_step`).

    :type error_measure: function taking a list/tuple as argument and
      returning a float, default: :func:`numpy.mean`

    :param run_parameters: parameters to be passed to the :meth:`run`
      method of the learning algorithm (forwarded to
      :meth:`train_and_test`).

    :type run_parameters: dictionary with parameters name as keys,
      default: {}

    :param error_model: error model to be used in order to evaluate the
      test error of a single chunk (forwarded to :meth:`train_and_test`).

    :type error_model: :class:`yaplf.utility.error.ErrorModel`, default:
      :class:`yaplf.utility.error.MSE`

    :returns: model trained on all data using the parameters optimizing
      the cross validation performance.

    :rtype: :class:`yaplf.models.Model`

    It is important to point out that cross validation works only on
    *named* parameters, thus any implementation of
    :class:`yaplf.algorithms.LearningAlgorithm` subclasses should be
    designed with this requirement in mind.

    AUTHORS

    - Dario Malchiodi (2010-02-22)

    """

    # num_folds is consumed here: it must not reach cross_validation_step
    # through kwargs, hence pop rather than get.
    num_folds = kwargs.pop('num_folds', 5)

    # The following options are read here *and* deliberately left in
    # kwargs so that they are also forwarded to cross_validation_step.
    verbose = kwargs.get('verbose', False)
    fixed_parameters = kwargs.get('fixed_parameters', {})
    run_parameters = kwargs.get('run_parameters', {})

    # Split the sample in num_folds chunks of (approximately) equal size.
    split_sample = split(sample, \
        num_folds * (1.0 / num_folds,), random=False)

    # Evaluate the cross validation error of every candidate combination
    # of parameter values. keys() and values() enumerate the dictionary in
    # the same order as long as it is not mutated in between.
    parameters_candidate = cartesian_product(*parameters_description.values())
    errors = [cross_validation_step(learning_algorithm, \
        dict(zip(parameters_description.keys(), params)), split_sample, \
        **kwargs) for params in parameters_candidate]

    if verbose:
        # Single-argument parenthesized print behaves identically under
        # python 2 and 3.
        print('Errors: ' + str(errors))

    # Pick the combination attaining the minimum cross validation error.
    min_index = array(errors).argmin()
    best_parameters = parameters_candidate[min_index]

    if verbose:
        print('Minimum error in position ' + str(min_index))
        print('Selected parameters ' + str(best_parameters))

    # Train the final model on the whole sample, merging the selected
    # candidate values with the fixed parameters.
    final_parameters = dict(zip(parameters_description.keys(), \
        best_parameters))
    final_parameters.update(fixed_parameters)

    return __final_learning(sample, learning_algorithm, final_parameters, \
        run_parameters)