def test_unroll_dict_of_lists(self):
    """Check that unroll_dict_of_lists expands a dict of lists.

    The search range holds 2 x 2 x 1 candidate values, so four
    single-valued parameter dicts are expected. The comparison is
    order-sensitive: it pins the exact sequence of the returned dicts.
    """
    model_param_search_range = {
        'norm_type': ['normalize', 'clip_0to1'],
        'n_estimators': [10, 50],
        'random_state': [0],
    }

    dicts = unroll_dict_of_lists(model_param_search_range)

    expected_dicts = [
        {'norm_type': 'normalize', 'n_estimators': 10, 'random_state': 0},
        {'norm_type': 'clip_0to1', 'n_estimators': 10, 'random_state': 0},
        {'norm_type': 'normalize', 'n_estimators': 50, 'random_state': 0},
        {'norm_type': 'clip_0to1', 'n_estimators': 50, 'random_state': 0},
    ]

    # assertEqual rather than the deprecated alias assertEquals, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(dicts, expected_dicts)
def test_unroll_dict_of_lists(self):
    """Verify the expansion of a dict of lists into a list of dicts.

    Two 'norm_type' values, two 'n_estimators' values and one
    'random_state' value yield four combinations. The expected list is
    compared with ordering taken into account.
    """
    model_param_search_range = {
        'norm_type': ['normalize', 'clip_0to1'],
        'n_estimators': [10, 50],
        'random_state': [0],
    }

    dicts = unroll_dict_of_lists(model_param_search_range)

    expected_dicts = [
        {'norm_type': 'normalize', 'n_estimators': 10, 'random_state': 0},
        {'norm_type': 'clip_0to1', 'n_estimators': 10, 'random_state': 0},
        {'norm_type': 'normalize', 'n_estimators': 50, 'random_state': 0},
        {'norm_type': 'clip_0to1', 'n_estimators': 50, 'random_state': 0},
    ]

    # The assertEquals alias is deprecated and gone as of Python 3.12;
    # assertEqual is available on every unittest version.
    self.assertEqual(dicts, expected_dicts)
def run_nested_kfold_cross_validation(
        cls,
        train_test_model_class,
        model_param_search_range,
        results_or_df,
        kfold,
        search_strategy='grid',
        random_search_times=100,
        logger=None,
        optional_dict2=None,
):
    """
    Nested k-fold cross validation, given hyper-parameter search range.

    For every outer fold, each candidate parameter set is scored by
    cls.run_kfold_cross_validation over the remaining folds. The candidate
    whose aggregate stats give the highest 'SRCC' objective score (first
    one wins on ties) is then evaluated on the held-out fold through
    cls.run_cross_validation.

    The search range is specified in the format of, e.g.:
    {'norm_type':['normalize', 'clip_0to1', 'clip_minus1to1'],
     'n_estimators':[10, 50], 'random_state': [0]}

    :param train_test_model_class: model class; its get_objective_score
        and aggregate_stats_list are called here
    :param model_param_search_range: hyper-parameter search range
    :param results_or_df: list of BasicResult, or pandas.DataFrame
    :param kfold: if it is an integer, it is the number of folds; if it is
        lists of indices, then each list contains row indices of the
        dataframe selected as one fold. At least 3 folds are required.
    :param search_strategy: either 'grid' or 'random'
    :param random_search_times: forwarded to cls._sample_model_param_list
        when search_strategy is 'random'
    :param logger: optional object with an info() method for progress
    :param optional_dict2: forwarded unchanged to the cross-validation
        helpers
    :return: dict with keys 'aggr_stats', 'top_model_param', 'top_ratio',
        'statss', 'model_params' and 'contentids'
    :raises AssertionError: if kfold is neither an integer nor a
        list/tuple, if there are fewer than 3 folds, or if search_strategy
        is unknown
    """
    # Function-scope import: Integral covers int on Python 3 and both
    # int and long on Python 2, without referencing the `long` name.
    from numbers import Integral

    fold_count_msg = 'kfold list must have length >= 3 for nested ' \
                     'k-fold cross validation.'

    # Validation raises AssertionError explicitly (rather than using the
    # assert statement) so it still runs under `python -O`.
    if isinstance(kfold, Integral):
        num_fold = int(kfold)
        if num_fold < 3:
            raise AssertionError(fold_count_msg)
        # Integer input (e.g. 4): rebuild kfold as a list of index lists.
        # Folds are equally sized, so when the number of rows is not a
        # multiple of num_fold the trailing rows belong to no fold.
        fold_size = len(results_or_df) // num_fold
        kfold = [
            list(range(i * fold_size, (i + 1) * fold_size))
            for i in range(num_fold)
        ]
    elif isinstance(kfold, (list, tuple)):
        if len(kfold) < 3:
            raise AssertionError(fold_count_msg)
    else:
        raise AssertionError(
            'kfold must be either a list of lists or an integer.')

    if search_strategy == 'grid':
        cls._assert_grid_search(model_param_search_range)
        list_model_param = unroll_dict_of_lists(model_param_search_range)
    elif search_strategy == 'random':
        cls._assert_random_search(model_param_search_range)
        list_model_param = cls._sample_model_param_list(
            model_param_search_range, random_search_times)
    else:
        raise AssertionError(
            "Unknown search_strategy: {}".format(search_strategy))

    statss = []
    model_params = []
    contentids = []

    for fold, test_index_range in enumerate(kfold):
        if logger:
            logger.info("Fold {}...".format(fold))

        # Training data is needed in two shapes: fold by fold for the
        # inner k-fold search, and flattened for the final evaluation.
        train_folds = [
            indices for i, indices in enumerate(kfold) if i != fold
        ]
        train_index_range = [
            index for indices in train_folds for index in indices
        ]

        # iterate through all possible combinations of model_params
        best_model_param = None
        best_stats = None
        for model_param in list_model_param:
            if logger:
                logger.info("\tModel parameter: {}".format(model_param))

            output = cls.run_kfold_cross_validation(
                train_test_model_class, model_param, results_or_df,
                train_folds, optional_dict2)
            stats = output['aggr_stats']

            # The first candidate is accepted without being scored; later
            # ones must strictly beat the current best.
            if (best_stats is None) or (
                    train_test_model_class.get_objective_score(
                        stats, type='SRCC') >
                    train_test_model_class.get_objective_score(
                        best_stats, type='SRCC')):
                best_stats = stats
                best_model_param = model_param

        # run cross validation based on best model parameters
        output_ = cls.run_cross_validation(
            train_test_model_class, best_model_param, results_or_df,
            train_index_range, test_index_range, optional_dict2)

        statss.append(output_['stats'])
        model_params.append(best_model_param)
        contentids += list(output_['contentids'])

    aggr_stats = train_test_model_class.aggregate_stats_list(statss)
    top_model_param, count = cls._find_most_frequent_dict(model_params)

    return {
        'aggr_stats': aggr_stats,
        'top_model_param': top_model_param,
        # fraction of outer folds that selected top_model_param
        'top_ratio': float(count) / len(model_params),
        'statss': statss,
        'model_params': model_params,
        'contentids': contentids,
    }
def run_nested_kfold_cross_validation(
        cls,
        train_test_model_class,
        model_param_search_range,
        results_or_df,
        kfold,
        search_strategy='grid',
        random_search_times=100,
        logger=None,
        optional_dict2=None,
):
    """
    Nested k-fold cross validation, given hyper-parameter search range.

    For every outer fold, each candidate parameter set is scored by
    cls.run_kfold_cross_validation over the remaining folds. The candidate
    whose aggregate stats give the highest 'SRCC' objective score (first
    one wins on ties) is then evaluated on the held-out fold through
    cls.run_cross_validation.

    The search range is specified in the format of, e.g.:
    {'norm_type':['normalize', 'clip_0to1', 'clip_minus1to1'],
     'n_estimators':[10, 50], 'random_state': [0]}

    :param train_test_model_class: model class; its get_objective_score
        and aggregate_stats_list are called here
    :param model_param_search_range: hyper-parameter search range
    :param results_or_df: list of BasicResult, or pandas.DataFrame
    :param kfold: if it is an integer, it is the number of folds; if it is
        lists of indices, then each list contains row indices of the
        dataframe selected as one fold. At least 3 folds are required.
    :param search_strategy: either 'grid' or 'random'
    :param random_search_times: forwarded to cls._sample_model_param_list
        when search_strategy is 'random'
    :param logger: optional object with an info() method for progress
    :param optional_dict2: forwarded unchanged to the cross-validation
        helpers
    :return: dict with keys 'aggr_stats', 'top_model_param', 'top_ratio',
        'statss', 'model_params' and 'contentids'
    :raises AssertionError: if kfold is neither an integer nor a
        list/tuple, if there are fewer than 3 folds, or if search_strategy
        is unknown
    """
    # Function-scope import: Integral covers int on Python 3 and both
    # int and long on Python 2, without referencing the `long` name.
    from numbers import Integral

    fold_count_msg = 'kfold list must have length >= 3 for nested ' \
                     'k-fold cross validation.'

    # Validation raises AssertionError explicitly (rather than using the
    # assert statement) so it still runs under `python -O`.
    if isinstance(kfold, Integral):
        num_fold = int(kfold)
        if num_fold < 3:
            raise AssertionError(fold_count_msg)
        # Integer input (e.g. 4): rebuild kfold as a list of index lists.
        # Folds are equally sized, so when the number of rows is not a
        # multiple of num_fold the trailing rows belong to no fold.
        fold_size = len(results_or_df) // num_fold
        kfold = [
            list(range(i * fold_size, (i + 1) * fold_size))
            for i in range(num_fold)
        ]
    elif isinstance(kfold, (list, tuple)):
        if len(kfold) < 3:
            raise AssertionError(fold_count_msg)
    else:
        raise AssertionError(
            'kfold must be either a list of lists or an integer.')

    if search_strategy == 'grid':
        cls._assert_grid_search(model_param_search_range)
        list_model_param = unroll_dict_of_lists(model_param_search_range)
    elif search_strategy == 'random':
        cls._assert_random_search(model_param_search_range)
        list_model_param = cls._sample_model_param_list(
            model_param_search_range, random_search_times)
    else:
        raise AssertionError(
            "Unknown search_strategy: {}".format(search_strategy))

    statss = []
    model_params = []
    contentids = []

    for fold, test_index_range in enumerate(kfold):
        if logger:
            logger.info("Fold {}...".format(fold))

        # Training data is needed in two shapes: fold by fold for the
        # inner k-fold search, and flattened for the final evaluation.
        train_folds = [
            indices for i, indices in enumerate(kfold) if i != fold
        ]
        train_index_range = [
            index for indices in train_folds for index in indices
        ]

        # iterate through all possible combinations of model_params
        best_model_param = None
        best_stats = None
        for model_param in list_model_param:
            if logger:
                logger.info("\tModel parameter: {}".format(model_param))

            output = cls.run_kfold_cross_validation(
                train_test_model_class, model_param, results_or_df,
                train_folds, optional_dict2)
            stats = output['aggr_stats']

            # The first candidate is accepted without being scored; later
            # ones must strictly beat the current best.
            if (best_stats is None) or (
                    train_test_model_class.get_objective_score(
                        stats, type='SRCC') >
                    train_test_model_class.get_objective_score(
                        best_stats, type='SRCC')):
                best_stats = stats
                best_model_param = model_param

        # run cross validation based on best model parameters
        output_ = cls.run_cross_validation(
            train_test_model_class, best_model_param, results_or_df,
            train_index_range, test_index_range, optional_dict2)

        statss.append(output_['stats'])
        model_params.append(best_model_param)
        contentids += list(output_['contentids'])

    aggr_stats = train_test_model_class.aggregate_stats_list(statss)
    top_model_param, count = cls._find_most_frequent_dict(model_params)

    return {
        'aggr_stats': aggr_stats,
        'top_model_param': top_model_param,
        # fraction of outer folds that selected top_model_param
        'top_ratio': float(count) / len(model_params),
        'statss': statss,
        'model_params': model_params,
        'contentids': contentids,
    }