def test_evaluate_nfold_fail_due_to_invalid_data():
    X = 'not a numpy array...'
    y = '...nor a pandas data structure'
    model = GaussianNB()
    num_folds = 3
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, model, num_folds)

def test_evaluate_nfold_fail_due_to_invalid_bootstrapping_param(classification_data):
    X, y = classification_data
    num_folds = 3
    bootstrapping = 'non-Boolean'
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, GaussianNB(), num_folds, bootstrapping=bootstrapping)

def test_evaluate_nfold_with_numpy_arrays(classification_data):
    X, y = classification_data
    model = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.5,
                                                        random_state=1)
    model.fit(X_train, y_train)

    num_folds = 1
    scores = evaluate_nfold(X_test, y_test, model, num_folds)
    assert len(scores) == num_folds
    assert all(0 <= score <= 1 for score in scores)

    num_folds = 3
    scores = evaluate_nfold(X_test, y_test, model, num_folds)
    assert len(scores) == num_folds
    assert all(0 <= score <= 1 for score in scores)

def test_evaluate_nfold_bootstrapping(classification_data):
    X, y = classification_data
    model = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.5,
                                                        random_state=1)
    model.fit(X_train, y_train)

    num_folds = 3
    scores = evaluate_nfold(pd.DataFrame(X_test),
                            pd.DataFrame(y_test),
                            model,
                            num_folds,
                            bootstrapping=True)
    assert len(scores) == num_folds
    assert all(0 <= score <= 1 for score in scores)

def main():
    start_time_main = time.time()

    print_info('Reading config files...', ':')
    run_config = config_file_to_dict(config_path + 'run_params.conf')
    data_config = config_file_to_dict(config_path + 'data_params.conf')
    model_config = config_file_to_dict(config_path + 'model_params.conf')

    if run_mode_user in run_config:
        frac_train_sample = run_config[run_mode_user]['frac_train_sample']
        num_test_samples = run_config[run_mode_user]['num_test_samples']
        num_CV_folds = run_config[run_mode_user]['num_CV_folds']
        do_optimize_params = run_config[run_mode_user]['do_optimize_params']
        n_iter = run_config[run_mode_user]['n_iter']
        print_info('Chosen run mode is {}: {}'.format(
            run_mode_user, run_config[run_mode_user]))
    else:
        raise KeyError('{} is not a valid run mode setting '
                       '(use, e.g., "run_params")'.format(run_mode_user))

    # collection of performance measures to be applied to the test set(s)
    scoring_funcs = ['accuracy_score', 'precision_score', 'recall_score',
                     'f1_score']

    final_results_labels = [
        'dataset', 'model', 'model_params', 'num_test_sets', 'num_CV_folds',
        'elapsed_time_train', 'elapsed_time_test'
    ]
    final_results_labels += ['test_{}_1fold'.format(i) for i in scoring_funcs]
    final_results_labels += ['train_{}'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}_bootstrap'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}_diff_max'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}_diff_max_bootstrap'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}_diff_mean'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}_diff_std'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}_diff_mean_bootstrap'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}_diff_std_bootstrap'.format(i) for i in scoring_funcs]
    final_results = pd.DataFrame(columns=final_results_labels)

    # loop over all sections of the data params config file
    for d_cnt, d in enumerate(data_config):
        print_info('Processing dataset: {} ({} of {})'.format(
            d, d_cnt + 1, len(data_config)), '=', 50)
        current_data_results = {}
        current_data_params = data_config[d]
        check_data_config_requirements(current_data_params)

        print_info('Loading data...', ':')
        data = load_data(current_data_params)

        print_info('Preparing target vector...', ':')
        X = data.drop(current_data_params['data_target_col'], axis=1)
        y = data[current_data_params['data_target_col']]
        y = parse_target_labels(
            y, current_data_params['data_target_positive_label'],
            current_data_params['data_target_negative_label'])
        del data

        print_info('Dimensions of feature matrix X: {}'.format(X.shape))
        print_info('Dimensions of target vector y: {}'.format(y.shape))

        print_info('Splitting the data: splitting off the training sample...', ':')
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=1 - frac_train_sample)
        del X, y

        print_info('Preprocessing the data...', ':')
        pm = PreprocessingManager(om.get_session_folder())
        for func in current_data_params['data_preprocessing']:
            X_train = getattr(pm, func)(X_train, False)
            X_test = getattr(pm, func)(X_test, True)

        print_class_counts(y_train, 'training', background=0, signal=1)
        print_class_counts(y_test, 'test', background=0, signal=1)

        # optimize the number of cross-validation folds, if required
        if num_CV_folds is None:
            print_info('Optimizing the number of cross-validation folds...', ':')
            num_CV_folds = get_optimal_CV_n_folds(X_train.values, y_train.values)

        for mod in model_config:
            print_info('Training model: {}'.format(mod), '-', 50)
            try:
                model_params = model_config[mod]
                model = supported_models[mod](**model_params)
            except KeyError:
                raise KeyError('Model {} not supported. Choose a valid input '
                               'from this list: {}'.format(mod, supported_models))

            fitkwargs = {'X': X_train, 'y': y_train}

            # hyperparameter optimization, if required
            if do_optimize_params:
                print_info('Optimizing hyperparameters...', ':')
                model = hyperparameter_search(model, n_iter, num_CV_folds)
                if mod != 'GaussianNB':
                    fitkwargs['callback'] = DeltaXStopper(1e-2)

            start_time_train = time.time()
            print_info('Fitting the model...', ':')
            model.fit(**fitkwargs)
            elapsed_time_train = time.time() - start_time_train
            model_parameters = get_search_results(model)

            # evaluate model on the training sample
            print_info('Evaluating the model on the training sample...', ':')
            for scoring_func in scoring_funcs:
                try:
                    model_scores_train = evaluate_nfold(X_train, y_train, model, 1,
                                                        scoring=scoring_func)
                    current_data_results['train_{}'.format(
                        scoring_func)] = model_scores_train[0]
                except ValueError:
                    warnings.warn('ValueError when evaluating with {}. '
                                  'Ignoring and continuing...'.format(scoring_func))

            # evaluate model on the test sample(s)
            print_info('Evaluating the model on the test sample(s)...', ':')
            test_performance_1fold = -1  # must be initialized with a negative number
            for t in range(1, num_test_samples + 1):
                start_time_test = time.time()
                for scoring_func in scoring_funcs:
                    try:
                        model_scores_test = evaluate_nfold(
                            X_test, y_test, model, t,
                            scoring=scoring_func, bootstrapping=False)
                        model_scores_test_bootstrap = evaluate_nfold(
                            X_test, y_test, model, t,
                            scoring=scoring_func, bootstrapping=True)

                        if test_performance_1fold < 0:
                            test_performance_1fold = model_scores_test[0]
                        current_data_results['test_{}_1fold'.format(
                            scoring_func)] = test_performance_1fold

                        current_data_results['test_{}'.format(
                            scoring_func)] = str(model_scores_test)
                        current_data_results['test_{}_bootstrap'.format(
                            scoring_func)] = str(model_scores_test_bootstrap)
                        current_data_results['test_{}_diff_max'.format(
                            scoring_func)] = max(model_scores_test) - min(model_scores_test)
                        current_data_results['test_{}_diff_max_bootstrap'.format(
                            scoring_func)] = max(model_scores_test_bootstrap) - min(
                                model_scores_test_bootstrap)

                        scores_mean, scores_std = performance_difference(
                            model_scores_test)
                        current_data_results['test_{}_diff_mean'.format(
                            scoring_func)] = scores_mean
                        current_data_results['test_{}_diff_std'.format(
                            scoring_func)] = scores_std

                        scores_mean_bootstrap, scores_std_bootstrap = \
                            performance_difference(model_scores_test_bootstrap)
                        current_data_results['test_{}_diff_mean_bootstrap'.format(
                            scoring_func)] = scores_mean_bootstrap
                        current_data_results['test_{}_diff_std_bootstrap'.format(
                            scoring_func)] = scores_std_bootstrap
                    except ValueError:
                        warnings.warn('ValueError when evaluating with {}. '
                                      'Ignoring and continuing...'.format(scoring_func))
                        current_data_results['test_{}_1fold'.format(scoring_func)] = -1
                        # current_data_results['test_{}'.format(scoring_func)] = "-1"
                        # current_data_results['test_{}_bootstrap'.format(scoring_func)] = "-1"
                        current_data_results['test_{}_diff_mean'.format(scoring_func)] = -1
                        current_data_results['test_{}_diff_std'.format(scoring_func)] = -1
                        current_data_results['test_{}_diff_mean_bootstrap'.format(scoring_func)] = -1
                        current_data_results['test_{}_diff_std_bootstrap'.format(scoring_func)] = -1

                print_info('Model score differences (mean, std) for {} '
                           'test sample folds: {:.5f}, {:.5f}'.format(
                               t, scores_mean, scores_std))

                model_params_string = ','.join(
                    '{}:{}'.format(key, val)
                    for key, val in sorted(model_parameters.items()))

                current_data_results['dataset'] = str(d)
                current_data_results['model'] = str(mod)
                current_data_results['model_params'] = model_params_string
                current_data_results['num_test_sets'] = t
                current_data_results['num_CV_folds'] = num_CV_folds
                current_data_results['elapsed_time_train'] = elapsed_time_train
                current_data_results['elapsed_time_test'] = time.time() - start_time_test

                final_results = final_results.append(current_data_results,
                                                     ignore_index=True)

        print_info('Creating results plots...', ':')
        scoring_func_plot = 'f1_score'
        train_differences = []
        current_data_plot_nsplits = final_results.query(
            '(dataset=="{}") & (model=="{}")'.format(d, mod))['num_test_sets']

        # explicit conversion to floats is necessary for the np.isfinite method,
        # which is implicitly called during plotting
        current_data_plot_xyvals = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]
        current_data_plot_xyvals_bootstrap = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]
        current_data_plot_xyvals_max = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]
        current_data_plot_xyvals_max_bootstrap = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]

        for mod in model_config:
            current_data_plot_xyvals.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_mean'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_std'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_max.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_max'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_max.append(
                np.zeros(current_data_plot_xyvals_max[-1].shape))
            current_data_plot_xyvals_max_bootstrap.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_max_bootstrap'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_max_bootstrap.append(
                np.zeros(current_data_plot_xyvals_max_bootstrap[-1].shape))
            train_differences.append(
                abs(final_results.query('(dataset=="{}") & (model=="{}")'.format(
                        d, mod))['train_{}'.format(scoring_func_plot)].iloc[0] -
                    final_results.query('(dataset=="{}") & (model=="{}")'.format(
                        d, mod))['test_{}_1fold'.format(scoring_func_plot)].iloc[0]))
            current_data_plot_xyvals_bootstrap.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_mean_bootstrap'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_bootstrap.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_std_bootstrap'.format(
                        scoring_func_plot)].values.astype(np.float32))

        # plot once over the full x-range and once per zoomed-in x-range
        xmax_list = [None]
        for i in range(10, 100, 10):
            if num_test_samples > i:
                xmax_list.append(i)

        for lim in xmax_list:
            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals,
                labels=[m for m in model_config],
                train_difference=train_differences,
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits_full'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)
            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)
            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_max,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='maximum performance difference')
            plot_filename = '{}_performance-diff_max_num-splits'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)
            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_max_bootstrap,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='maximum performance difference')
            plot_filename = '{}_performance-diff_max_num-splits_bootstrap'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)
            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_bootstrap,
                labels=[m for m in model_config],
                train_difference=train_differences,
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits_full_bootstrap'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)
            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_bootstrap,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits_bootstrap'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)
            om.save(current_data_plot, plot_filename)

        print_info('Saving the final results...', ':')
        om.save(final_results, '{}_final-results'.format(d))
        final_results_dict = final_results.to_dict('dict')
        final_results_dict['relation'] = str(d)  # needed for ARFF
        final_results_dict['description'] = u''  # needed for ARFF
        om.save(final_results, '{}_final-results'.format(d), to_arff=True)

    print_info('\n')
    print_info('Everything done. (Elapsed overall time: {} seconds)\n'.format(
        time.time() - start_time_main))

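# Based on the keys accessed in main(), the chosen section of run_params.conf
# presumably provides values along these lines (an illustrative sketch only;
# the exact file format and values depend on config_file_to_dict and the
# project's own config files):
#
#     [run_params]
#     frac_train_sample = 0.8
#     num_test_samples = 50
#     num_CV_folds = None
#     do_optimize_params = True
#     n_iter = 10

# Entry-point guard; assumed here as a minimal sketch, since the original
# excerpt does not show how main() is invoked.
if __name__ == '__main__':
    main()
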
def test_evaluate_nfold_fail_due_to_invalid_num_folds(classification_data):
    X, y = classification_data
    num_folds = 'not an integer'
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, GaussianNB(), num_folds)

def test_evaluate_nfold_fail_due_to_zero_folds(classification_data):
    X, y = classification_data
    model = GaussianNB()
    num_folds = 0
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, model, num_folds)

def test_evaluate_nfold_fail_due_to_invalid_scoring_func(classification_data):
    X, y = classification_data
    num_folds = 3
    scoring_func = 'invalid function'
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, GaussianNB(), num_folds, scoring=scoring_func)
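
# The tests above rely on a `classification_data` pytest fixture that yields a
# feature matrix and a target vector. The fixture itself is not part of this
# excerpt; a minimal sketch of what it could look like (kept as a comment so it
# does not shadow the project's actual fixture, e.g. in conftest.py) is:
#
#     import pytest
#     from sklearn.datasets import make_classification
#
#     @pytest.fixture
#     def classification_data():
#         # small, reproducible binary classification problem returned as (X, y)
#         X, y = make_classification(n_samples=200, n_features=5, random_state=1)
#         return X, y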