def run_kfold_cross_validation(cls, train_test_model_class, model_param, results_or_df, kfold, logger=None):
    """
    Standard k-fold cross validation, given hyper-parameter set model_param.

    :param train_test_model_class: class of the model to be trained/tested
    :param model_param: hyper-parameter dict passed to the model
    :param results_or_df: list of BasicResult, or pandas.DataFrame
    :param kfold: if it is an integer, it is the number of folds; if it is a
    list of lists, then each inner list contains the row indices of the
    dataframe selected as one fold
    :param logger: optional logger for per-fold progress messages
    :return: output dict with keys 'aggr_stats', 'statss', 'models',
    'contentids'
    """
    # numbers.Integral covers Python 3 int, Python 2 int/long, and numpy
    # integer types alike (the old `(int, long)` check raised NameError on
    # Python 3)
    import numbers

    if isinstance(kfold, numbers.Integral):
        kfold_type = 'int'
    elif isinstance(kfold, (list, tuple)):
        kfold_type = 'list'
    else:
        assert False, 'kfold must be either a list of lists or an integer.'

    # if input is an integer (e.g. 4), reconstruct kfold into the
    # list-of-index-lists format
    if kfold_type == 'int':
        num_fold = kfold
        dataframe_size = len(results_or_df)
        fold_size = dataframe_size // num_fold  # floor division
        kfold = []
        for fold in range(num_fold):
            index_start = fold * fold_size
            # cap at dataframe_size; any trailing remainder rows are dropped
            index_end = min((fold + 1) * fold_size, dataframe_size)
            kfold.append(list(range(index_start, index_end)))

    assert len(kfold) >= 2, 'kfold list must have length >= 2 for k-fold ' \
                            'cross validation.'

    statss = []
    models = []
    contentids = []

    for fold in range(len(kfold)):

        if logger:
            logger.info("Fold {}...".format(fold))

        # held-out fold is the test set; all remaining folds are training
        test_index_range = kfold[fold]
        train_index_range = []
        for train_fold in range(len(kfold)):
            if train_fold != fold:
                train_index_range += kfold[train_fold]

        output = cls.run_cross_validation(train_test_model_class, model_param,
                                          results_or_df, train_index_range,
                                          test_index_range)

        statss.append(output['stats'])
        models.append(output['model'])
        contentids += list(output['contentids'])

    aggr_stats = TrainTestModel.aggregate_stats_list(statss)

    assert contentids is not None
    output = {
        'aggr_stats': aggr_stats,
        'statss': statss,
        'models': models,
        'contentids': contentids,
    }

    return output
def run_nested_kfold_cross_validation(cls, train_test_model_class, model_param_search_range, results_or_df, kfold, search_strategy='grid', random_search_times=100, logger=None):
    """
    Nested k-fold cross validation, given hyper-parameter search range. The
    search range is specified in the format of, e.g.:
    {'norm_type': ['normalize', 'clip_0to1', 'clip_minus1to1'],
     'n_estimators': [10, 50],
     'random_state': [0]}

    :param train_test_model_class: class of the model to be trained/tested
    :param model_param_search_range: dict mapping each hyper-parameter name to
    the candidate values to search over
    :param results_or_df: list of BasicResult, or pandas.DataFrame
    :param kfold: if it is an integer, it is the number of folds; if it is a
    list of lists, then each inner list contains the row indices of the
    dataframe selected as one fold
    :param search_strategy: either 'grid' or 'random'
    :param random_search_times: number of parameter samples to draw when
    search_strategy is 'random'
    :param logger: optional logger for progress messages
    :return: output dict with keys 'aggr_stats', 'top_model_param',
    'top_ratio', 'statss', 'model_params', 'contentids'
    """
    # numbers.Integral covers Python 3 int, Python 2 int/long, and numpy
    # integer types alike (the old `(int, long)` check raised NameError on
    # Python 3)
    import numbers

    if isinstance(kfold, numbers.Integral):
        kfold_type = 'int'
    elif isinstance(kfold, (list, tuple)):
        kfold_type = 'list'
    else:
        assert False, 'kfold must be either a list of lists or an integer.'

    # if input is an integer (e.g. 4), reconstruct kfold into the
    # list-of-index-lists format
    if kfold_type == 'int':
        num_fold = kfold
        dataframe_size = len(results_or_df)
        fold_size = dataframe_size // num_fold  # floor division
        kfold = []
        for fold in range(num_fold):
            index_start = fold * fold_size
            # cap at dataframe_size; any trailing remainder rows are dropped
            index_end = min((fold + 1) * fold_size, dataframe_size)
            kfold.append(list(range(index_start, index_end)))

    # nested CV needs at least 3 folds: one outer test fold plus an inner
    # k-fold CV over the remaining folds, which itself needs >= 2 folds
    # (message previously said ">= 2", contradicting the condition)
    assert len(kfold) >= 3, 'kfold list must have length >= 3 for nested ' \
                            'k-fold cross validation.'

    if search_strategy == 'grid':
        cls._assert_grid_search(model_param_search_range)
        list_model_param = cls._unroll_dict_of_lists(model_param_search_range)
    elif search_strategy == 'random':
        cls._assert_random_search(model_param_search_range)
        list_model_param = cls._sample_model_param_list(
            model_param_search_range, random_search_times)
    else:
        assert False, "Unknown search_strategy: {}".format(search_strategy)

    statss = []
    model_params = []
    contentids = []

    for fold in range(len(kfold)):

        if logger:
            logger.info("Fold {}...".format(fold))

        test_index_range = kfold[fold]
        train_index_range = []
        # the inner CV needs the training folds kept separate, hence a
        # parallel list-of-lists representation
        train_index_range_in_list_of_indices = []
        for train_fold in range(len(kfold)):
            if train_fold != fold:
                train_index_range += kfold[train_fold]
                train_index_range_in_list_of_indices.append(kfold[train_fold])

        # inner loop: score every candidate hyper-parameter set via k-fold CV
        # over the training folds only, keeping the best by SRCC
        best_model_param = None
        best_stats = None
        for model_param in list_model_param:

            if logger:
                logger.info("\tModel parameter: {}".format(model_param))

            output = cls.run_kfold_cross_validation(
                train_test_model_class, model_param, results_or_df,
                train_index_range_in_list_of_indices)
            stats = output['aggr_stats']

            if (best_stats is None) or (
                TrainTestModel.get_objective_score(stats, type='SRCC') >
                TrainTestModel.get_objective_score(best_stats, type='SRCC')
            ):
                best_stats = stats
                best_model_param = model_param

        # outer loop: retrain on all training folds with the winning
        # hyper-parameters and evaluate on the held-out test fold
        output_ = cls.run_cross_validation(train_test_model_class,
                                           best_model_param, results_or_df,
                                           train_index_range,
                                           test_index_range)

        statss.append(output_['stats'])
        model_params.append(best_model_param)
        contentids += list(output_['contentids'])

    aggr_stats = TrainTestModel.aggregate_stats_list(statss)
    top_model_param, count = cls._find_most_frequent_dict(model_params)

    assert contentids is not None
    output__ = {
        'aggr_stats': aggr_stats,
        'top_model_param': top_model_param,
        'top_ratio': float(count) / len(model_params),
        'statss': statss,
        'model_params': model_params,
        'contentids': contentids,
    }

    return output__