예제 #1
0
    def test_unroll_dict_of_lists(self):
        """Check that a dict of value lists unrolls into one dict per combination.

        The expected list pins both the content and the order of the result:
        'norm_type' is expected to vary fastest, then 'n_estimators'.
        """
        model_param_search_range = {
            'norm_type': ['normalize', 'clip_0to1'],
            'n_estimators': [10, 50],
            'random_state': [0]
        }

        dicts = unroll_dict_of_lists(model_param_search_range)

        expected_dicts = [
            {
                'norm_type': 'normalize',
                'n_estimators': 10,
                'random_state': 0
            },
            {
                'norm_type': 'clip_0to1',
                'n_estimators': 10,
                'random_state': 0
            },
            {
                'norm_type': 'normalize',
                'n_estimators': 50,
                'random_state': 0
            },
            {
                'norm_type': 'clip_0to1',
                'n_estimators': 50,
                'random_state': 0
            },
        ]

        # assertEquals is a deprecated alias (removed in Python 3.12);
        # assertEqual is available on every supported unittest version.
        self.assertEqual(dicts, expected_dicts)
예제 #2
0
    def test_unroll_dict_of_lists(self):
        """Check that a dict of value lists unrolls into one dict per combination.

        The expected list pins both the content and the order of the result:
        'norm_type' is expected to vary fastest, then 'n_estimators'.
        """
        model_param_search_range = {'norm_type': ['normalize', 'clip_0to1'],
                                    'n_estimators': [10, 50],
                                    'random_state': [0]}

        dicts = unroll_dict_of_lists(model_param_search_range)

        expected_dicts = [
            {'norm_type': 'normalize', 'n_estimators': 10, 'random_state': 0},
            {'norm_type': 'clip_0to1', 'n_estimators': 10, 'random_state': 0},
            {'norm_type': 'normalize', 'n_estimators': 50, 'random_state': 0},
            {'norm_type': 'clip_0to1', 'n_estimators': 50, 'random_state': 0},
        ]

        # assertEquals is a deprecated alias (removed in Python 3.12);
        # assertEqual is available on every supported unittest version.
        self.assertEqual(dicts, expected_dicts)
예제 #3
0
    def run_nested_kfold_cross_validation(
        cls,
        train_test_model_class,
        model_param_search_range,
        results_or_df,
        kfold,
        search_strategy='grid',
        random_search_times=100,
        logger=None,
        optional_dict2=None,
    ):
        """
        Nested k-fold cross validation, given hyper-parameter search range. The
        search range is specified in the format of, e.g.:
        {'norm_type':['normalize', 'clip_0to1', 'clip_minus1to1'],
         'n_estimators':[10, 50],
         'random_state': [0]}

        For every outer fold, each candidate parameter set is scored with an
        inner k-fold cross validation over the remaining folds; the candidate
        with the highest 'SRCC' objective score (first one wins on ties) is
        then trained on the remaining folds and tested on the held-out fold.

        :param train_test_model_class: model class providing
        get_objective_score and aggregate_stats_list
        :param model_param_search_range: dict of hyper-parameter search ranges
        :param results_or_df: list of BasicResult, or pandas.DataFrame
        :param kfold: if it is an integer, it is the number of folds; if it is
        lists of indices, then each list contains row indices of the dataframe
        selected as one fold. With an integer, folds are contiguous and of
        equal size, so trailing rows that do not fill a whole fold are left
        out.
        :param search_strategy: either 'grid' or 'random'
        :param random_search_times: number of samples requested from
        cls._sample_model_param_list when search_strategy is 'random'
        :param logger: optional logger; progress is reported through
        logger.info
        :param optional_dict2: forwarded unchanged to run_kfold_cross_validation
        and run_cross_validation
        :return: dict with keys 'aggr_stats', 'top_model_param', 'top_ratio',
        'statss', 'model_params' and 'contentids'
        :raises AssertionError: if kfold is neither an integer nor a
        list/tuple, if there are fewer than 3 folds, or if search_strategy is
        unknown
        """
        # Local import: the file's import block is not visible from here.
        # numbers.Integral covers int on Python 3 as well as int/long on
        # Python 2, without referencing the Python-2-only name `long`.
        import numbers

        if isinstance(kfold, numbers.Integral):
            # integer input (e.g. 4): reconstruct kfold as lists of indices
            num_fold = kfold
            dataframe_size = len(results_or_df)
            fold_size = dataframe_size // num_fold
            kfold = [
                list(range(index * fold_size,
                           min((index + 1) * fold_size, dataframe_size)))
                for index in range(num_fold)
            ]
        elif not isinstance(kfold, (list, tuple)):
            # Raised explicitly (not via `assert`) so the check survives -O.
            raise AssertionError(
                'kfold must be either a list of lists or an integer.')

        # The inner cross validation needs at least two folds on its own, so
        # the outer split needs at least three.
        if len(kfold) < 3:
            raise AssertionError(
                'kfold list must have length >= 3 for nested '
                'k-fold cross validation.')

        if search_strategy == 'grid':
            cls._assert_grid_search(model_param_search_range)
            list_model_param = unroll_dict_of_lists(model_param_search_range)
        elif search_strategy == 'random':
            cls._assert_random_search(model_param_search_range)
            list_model_param = cls._sample_model_param_list(
                model_param_search_range, random_search_times)
        else:
            raise AssertionError(
                "Unknown search_strategy: {}".format(search_strategy))

        statss = []
        model_params = []
        contentids = []

        for fold, test_index_range in enumerate(kfold):

            if logger:
                logger.info("Fold {}...".format(fold))

            # Folds other than the held-out one, kept both as a list of lists
            # (for the inner k-fold) and flattened (for the final training).
            train_index_range_in_list_of_indices = [
                indices for train_fold, indices in enumerate(kfold)
                if train_fold != fold
            ]
            train_index_range = [
                index
                for indices in train_index_range_in_list_of_indices
                for index in indices
            ]

            # iterate through all possible combinations of model_params
            best_model_param = None
            best_stats = None
            for model_param in list_model_param:

                if logger:
                    logger.info("\tModel parameter: {}".format(model_param))

                output = cls.run_kfold_cross_validation(
                    train_test_model_class,
                    model_param,
                    results_or_df,
                    train_index_range_in_list_of_indices,
                    optional_dict2)
                stats = output['aggr_stats']

                is_better = (
                    best_stats is None or
                    train_test_model_class.get_objective_score(
                        stats, type='SRCC') >
                    train_test_model_class.get_objective_score(
                        best_stats, type='SRCC')
                )
                if is_better:
                    best_stats = stats
                    best_model_param = model_param

            # run cross validation based on best model parameters
            output_ = cls.run_cross_validation(train_test_model_class,
                                               best_model_param,
                                               results_or_df,
                                               train_index_range,
                                               test_index_range,
                                               optional_dict2)

            statss.append(output_['stats'])
            model_params.append(best_model_param)
            contentids += list(output_['contentids'])

        aggr_stats = train_test_model_class.aggregate_stats_list(statss)
        top_model_param, count = cls._find_most_frequent_dict(model_params)

        return {
            'aggr_stats': aggr_stats,
            'top_model_param': top_model_param,
            # share of outer folds that selected the most frequent parameters
            'top_ratio': float(count) / len(model_params),
            'statss': statss,
            'model_params': model_params,
            'contentids': contentids,
        }
예제 #4
0
    def run_nested_kfold_cross_validation(cls,
                                          train_test_model_class,
                                          model_param_search_range,
                                          results_or_df,
                                          kfold,
                                          search_strategy='grid',
                                          random_search_times=100,
                                          logger=None,
                                          optional_dict2=None,
                                          ):
        """
        Nested k-fold cross validation, given hyper-parameter search range. The
        search range is specified in the format of, e.g.:
        {'norm_type':['normalize', 'clip_0to1', 'clip_minus1to1'],
         'n_estimators':[10, 50],
         'random_state': [0]}

        For every outer fold, each candidate parameter set is scored with an
        inner k-fold cross validation over the remaining folds; the candidate
        with the highest 'SRCC' objective score (first one wins on ties) is
        then trained on the remaining folds and tested on the held-out fold.

        :param train_test_model_class: model class providing
        get_objective_score and aggregate_stats_list
        :param model_param_search_range: dict of hyper-parameter search ranges
        :param results_or_df: list of BasicResult, or pandas.DataFrame
        :param kfold: if it is an integer, it is the number of folds; if it is
        lists of indices, then each list contains row indices of the dataframe
        selected as one fold. With an integer, folds are contiguous and of
        equal size, so trailing rows that do not fill a whole fold are left
        out.
        :param search_strategy: either 'grid' or 'random'
        :param random_search_times: number of samples requested from
        cls._sample_model_param_list when search_strategy is 'random'
        :param logger: optional logger; progress is reported through
        logger.info
        :param optional_dict2: forwarded unchanged to run_kfold_cross_validation
        and run_cross_validation
        :return: dict with keys 'aggr_stats', 'top_model_param', 'top_ratio',
        'statss', 'model_params' and 'contentids'
        :raises AssertionError: if kfold is neither an integer nor a
        list/tuple, if there are fewer than 3 folds, or if search_strategy is
        unknown
        """
        # Local import: the file's import block is not visible from here.
        # numbers.Integral covers int on Python 3 as well as int/long on
        # Python 2, without referencing the Python-2-only name `long`.
        import numbers

        if isinstance(kfold, numbers.Integral):
            # integer input (e.g. 4): reconstruct kfold as lists of indices
            num_fold = kfold
            dataframe_size = len(results_or_df)
            fold_size = dataframe_size // num_fold
            kfold = []
            for fold in range(num_fold):
                index_start = fold * fold_size
                index_end = min((fold + 1) * fold_size, dataframe_size)
                # materialize as a list so folds can be concatenated and
                # indexed the same way on Python 2 and Python 3
                kfold.append(list(range(index_start, index_end)))
        elif not isinstance(kfold, (list, tuple)):
            # Raised explicitly (not via `assert`) so the check survives -O.
            raise AssertionError(
                'kfold must be either a list of lists or an integer.')

        # The inner cross validation needs at least two folds on its own, so
        # the outer split needs at least three.
        if len(kfold) < 3:
            raise AssertionError(
                'kfold list must have length >= 3 for nested '
                'k-fold cross validation.')

        if search_strategy == 'grid':
            cls._assert_grid_search(model_param_search_range)
            list_model_param = unroll_dict_of_lists(
                model_param_search_range)
        elif search_strategy == 'random':
            cls._assert_random_search(model_param_search_range)
            list_model_param = cls._sample_model_param_list(
                model_param_search_range, random_search_times)
        else:
            raise AssertionError(
                "Unknown search_strategy: {}".format(search_strategy))

        statss = []
        model_params = []
        contentids = []

        for fold, test_index_range in enumerate(kfold):

            if logger:
                logger.info("Fold {}...".format(fold))

            # Folds other than the held-out one, kept both flattened (for the
            # final training) and as a list of lists (for the inner k-fold).
            train_index_range = []
            train_index_range_in_list_of_indices = []
            for train_fold, indices in enumerate(kfold):
                if train_fold != fold:
                    train_index_range += indices
                    train_index_range_in_list_of_indices.append(indices)

            # iterate through all possible combinations of model_params
            best_model_param = None
            best_stats = None
            for model_param in list_model_param:

                if logger:
                    logger.info("\tModel parameter: {}".format(model_param))

                output = cls.run_kfold_cross_validation(
                    train_test_model_class,
                    model_param,
                    results_or_df,
                    train_index_range_in_list_of_indices,
                    optional_dict2)
                stats = output['aggr_stats']

                if (best_stats is None) or (
                    train_test_model_class.get_objective_score(
                        stats, type='SRCC')
                    >
                    train_test_model_class.get_objective_score(
                        best_stats, type='SRCC')
                ):
                    best_stats = stats
                    best_model_param = model_param

            # run cross validation based on best model parameters
            output_ = cls.run_cross_validation(train_test_model_class,
                                               best_model_param,
                                               results_or_df,
                                               train_index_range,
                                               test_index_range,
                                               optional_dict2)
            stats_ = output_['stats']

            statss.append(stats_)
            model_params.append(best_model_param)

            contentids += list(output_['contentids'])

        aggr_stats = train_test_model_class.aggregate_stats_list(statss)
        top_model_param, count = cls._find_most_frequent_dict(model_params)

        output__ = {
            'aggr_stats': aggr_stats,
            'top_model_param': top_model_param,
            # share of outer folds that selected the most frequent parameters
            'top_ratio': float(count) / len(model_params),
            'statss': statss,
            'model_params': model_params,
            'contentids': contentids,
        }

        return output__