def visualize_results(results_file_path, outdir, method_names):
    """
    Produces the performance visualizations/comparisons from the
    cross-validation results.

    Parameters
    ----------
    results_file_path : str
        Path to the pickle file containing the cross-validation results.
    outdir : str
        Output directory to save the visualizations to.
    method_names : list
        Names of the feature sets/methods being compared.

    """

    dataset_paths, method_names, train_perc, num_repetitions, num_classes, \
        pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
        best_min_leaf_size, best_num_predictors, \
        feature_importances_rf, feature_names, \
        num_times_misclfd, num_times_tested, \
        confusion_matrix, class_order, accuracy_balanced, auc_weighted, \
        positive_class = rhst.load_results(results_file_path)

    # os.environ['DISPLAY'] raises a KeyError when DISPLAY is unset;
    # os.environ.get() returns None instead, which is what we test for.
    if os.environ.get('DISPLAY') is None:
        warnings.warn('DISPLAY is not set. '
                      'Skipping the generation of any visualizations.')
        return

    try:
        balacc_fig_path = os.path.join(outdir, 'balanced_accuracy')
        visualize.metric_distribution(accuracy_balanced, method_names,
                                      balacc_fig_path, num_classes,
                                      "Balanced Accuracy")

        confmat_fig_path = os.path.join(outdir, 'confusion_matrix')
        visualize.confusion_matrices(confusion_matrix, class_order,
                                     method_names, confmat_fig_path)

        cmp_misclf_fig_path = os.path.join(outdir, 'compare_misclf_rates')
        if num_classes > 2:
            visualize.compare_misclf_pairwise(confusion_matrix, class_order,
                                              method_names,
                                              cmp_misclf_fig_path)
        elif num_classes == 2:
            visualize.compare_misclf_pairwise_parallel_coord_plot(
                confusion_matrix, class_order, method_names,
                cmp_misclf_fig_path)

        featimp_fig_path = os.path.join(outdir, 'feature_importance')
        visualize.feature_importance_map(feature_importances_rf, method_names,
                                         featimp_fig_path, feature_names)

        misclf_out_path = os.path.join(outdir, 'misclassified_subjects')
        visualize.freq_hist_misclassifications(num_times_misclfd,
                                               num_times_tested, method_names,
                                               misclf_out_path)
    except Exception:
        traceback.print_exc()
        warnings.warn('Error generating the visualizations! Skipping ..')
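# A minimal sketch of a headless-friendly alternative to the DISPLAY check in
# visualize_results() above: instead of skipping the plots entirely, one could
# switch matplotlib to its non-interactive Agg backend so figures can still be
# rendered to disk. This helper is illustrative only and is not called by the
# current pipeline; it assumes matplotlib is installed and that the backend is
# switched before any pyplot figures are created.
def use_headless_backend_if_needed():
    """Falls back to the Agg backend when no display server is available."""

    import matplotlib

    if os.environ.get('DISPLAY') is None:
        # Agg renders figures to raster files without needing an X display
        matplotlib.use('Agg')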
def export_results(results_file_path, outdir):
    """
    Exports the results to a simpler CSV format for use in other packages.

    Parameters
    ----------
    results_file_path : str
        Path to the pickle file containing the cross-validation results.
    outdir : str
        Output directory to save the CSV files to.

    Returns
    -------
    None

    """

    dataset_paths, method_names, train_perc, num_repetitions, num_classes, \
        pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
        best_min_leaf_size, best_num_predictors, \
        feature_importances_rf, feature_names, \
        num_times_misclfd, num_times_tested, \
        confusion_matrix, class_order, accuracy_balanced, auc_weighted, \
        positive_class = rhst.load_results(results_file_path)

    num_classes = confusion_matrix.shape[0]
    num_rep_cv = confusion_matrix.shape[2]
    num_datasets = confusion_matrix.shape[3]

    # separating the CSVs from the PDFs
    exp_dir = os.path.join(outdir, cfg.EXPORT_DIR_NAME)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)

    # TODO think about how to export predictive probability per class per CV rep
    #   pred_prob_per_class

    try:
        balacc_path = os.path.join(exp_dir, 'balanced_accuracy.csv')
        np.savetxt(balacc_path, accuracy_balanced,
                   delimiter=cfg.DELIMITER,
                   fmt=cfg.EXPORT_FORMAT,
                   header=','.join(method_names))

        cfmat_reshaped = np.reshape(
            confusion_matrix,
            [num_classes * num_classes, num_rep_cv, num_datasets])
        for mm in range(num_datasets):
            confmat_path = os.path.join(
                exp_dir, 'confusion_matrix_{}.csv'.format(method_names[mm]))
            np.savetxt(confmat_path,
                       cfmat_reshaped[:, :, mm].T,  # NOTICE the transpose
                       delimiter=cfg.DELIMITER, fmt=cfg.EXPORT_FORMAT,
                       # np.savetxt's comments= is only a prefix for header/
                       # footer lines; the shape note belongs in header=
                       header='shape of confusion matrix: '
                              'num_repetitions x num_classes^2')

        avg_cfmat, misclf_rate = visualize.compute_pairwise_misclf(
            confusion_matrix)
        num_datasets = misclf_rate.shape[0]
        for mm in range(num_datasets):
            cmp_misclf_path = os.path.join(
                exp_dir, 'average_misclassification_rates_{}.csv'.format(
                    method_names[mm]))
            np.savetxt(cmp_misclf_path, misclf_rate[mm, :],
                       fmt=cfg.EXPORT_FORMAT, delimiter=cfg.DELIMITER)

        for mm in range(num_datasets):
            featimp_path = os.path.join(
                exp_dir, 'feature_importance_{}.csv'.format(method_names[mm]))
            np.savetxt(featimp_path, feature_importances_rf[mm],
                       fmt=cfg.EXPORT_FORMAT, delimiter=cfg.DELIMITER,
                       header=','.join(feature_names[mm]))

        perc_misclsfd, _, _, _ = visualize.compute_perc_misclf_per_sample(
            num_times_misclfd, num_times_tested)
        for mm in range(num_datasets):
            subwise_misclf_path = os.path.join(
                exp_dir, 'subject_misclf_freq_{}.csv'.format(method_names[mm]))
            # TODO there must be a more elegant way to write a dict to CSV
            #   (see the csv.writer sketch below)
            with open(subwise_misclf_path, 'w') as smf:
                for sid, val in perc_misclsfd[mm].items():
                    smf.write('{}{}{}\n'.format(sid, cfg.DELIMITER, val))

    except Exception:
        traceback.print_exc()
        raise IOError('Unable to export the results to CSV files.')

    return
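# A minimal sketch of the dict-to-CSV export flagged in the TODO above, using
# the standard csv module instead of manual string formatting. This is an
# illustrative alternative, not part of the current pipeline; it assumes the
# same per-dataset structure as perc_misclsfd[mm] (a dict mapping subject ID
# to misclassification frequency) and that cfg.DELIMITER is a single
# character such as ','.
def write_misclf_freq_csv(csv_path, misclf_freq_per_subject):
    """Writes a {subject_id: misclf_frequency} dict to CSV via csv.writer."""

    import csv

    # newline='' lets the csv module control line endings portably
    with open(csv_path, 'w', newline='') as smf:
        writer = csv.writer(smf, delimiter=cfg.DELIMITER)
        writer.writerow(['subject_id', 'misclf_frequency'])  # header row
        for sid, val in misclf_freq_per_subject.items():
            writer.writerow([sid, val])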
def test_chance_classifier_binary():
    rand_ds = make_random_MLdataset(max_num_classes=3, stratified=True,
                                    max_class_size=100, max_dim=100)

    out_path = os.path.join(out_dir, 'two_classes_random_features.pkl')
    rand_two_class = rand_ds.get_class(rand_ds.class_set[0:2])
    rand_two_class.save(out_path)

    out_list = os.path.join(out_dir, 'same_data_two_classes_list_datasets.txt')
    with open(out_list, 'w') as lf:
        lf.write(out_path)  # a single-entry list of dataset paths

    res_path = rhst.run(out_list, ['random'], out_dir,
                        train_perc=0.5, num_repetitions=50)

    dataset_paths, method_names, train_perc, num_repetitions, num_classes, \
        pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
        best_min_leaf_size, best_num_predictors, \
        feature_importances_rf, feature_names, \
        num_times_misclfd, num_times_tested, \
        confusion_matrix, class_set, accuracy_balanced, auc_weighted, \
        positive_class = rhst.load_results(res_path)

    # TODO replace the hard-coded chance accuracy of 0.5 with a programmatic
    #   calculation based on the class sample sizes (see the sketch below)
    # assert np.median(accuracy_balanced) == np.median(rhst.chance_accuracy(class_sizes))
    if abs(np.median(accuracy_balanced) - 0.5) > 0.05:
        raise ValueError('Accuracy discriminating between two inseparable '
                         'classes differs significantly from 0.5')

    if abs(np.median(auc_weighted) - 0.5) > 0.05:
        raise ValueError('AUC discriminating between two inseparable classes '
                         'differs significantly from 0.5')
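# A minimal sketch of the programmatic chance-accuracy calculation the TODO
# above refers to. The actual rhst.chance_accuracy implementation is not shown
# here, so this helper is an assumption: it relies on the fact that random
# guessing yields an expected per-class recall of 1/num_classes, so chance
# balanced accuracy is 1/num_classes regardless of class imbalance, while
# chance plain accuracy is the majority-class proportion.
def approx_chance_accuracy(class_sizes, balanced=True):
    """Approximate chance level given per-class sample sizes."""

    class_sizes = np.asarray(class_sizes, dtype=float)
    if balanced:
        # balanced accuracy averages per-class recall, so random guessing
        # achieves 1/num_classes irrespective of the class imbalance
        return 1.0 / len(class_sizes)

    # plain accuracy: always predicting the majority class achieves
    # max(class_size) / total
    return class_sizes.max() / class_sizes.sum()

# e.g. approx_chance_accuracy([100, 100]) returns 0.5,
# matching the hard-coded threshold in the test above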