Example #1

import os
import traceback
import warnings

# rhst and visualize are assumed to be modules of the surrounding package
def visualize_results(results_file_path, outdir, method_names):
    """
    Produces the performance visualizations/comparisons from the cross-validation results.
    
    Parameters
    ----------
    results_file_path : str
        Path to the file containing the cross-validation results.
    outdir : str
        Output directory to save the visualizations to.
    method_names : list
        Names of the methods (or datasets) being compared.

    """

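    # load and unpack the full set of cross-validation results saved to disk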
    dataset_paths, method_names, train_perc, num_repetitions, num_classes, \
        pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
        best_min_leaf_size, best_num_predictors, \
        feature_importances_rf, feature_names, \
        num_times_misclfd, num_times_tested, \
        confusion_matrix, class_order, accuracy_balanced, auc_weighted, positive_class = \
            rhst.load_results(results_file_path)

    if os.environ.get('DISPLAY') is None:
        warnings.warn('DISPLAY is not set - skipping the visualizations.')
        return

    try:

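        # distribution of balanced accuracy across CV repetitions, per method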
        balacc_fig_path = os.path.join(outdir, 'balanced_accuracy')
        visualize.metric_distribution(accuracy_balanced, method_names,
                                      balacc_fig_path, num_classes,
                                      "Balanced Accuracy")

        confmat_fig_path = os.path.join(outdir, 'confusion_matrix')
        visualize.confusion_matrices(confusion_matrix, class_order,
                                     method_names, confmat_fig_path)

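        # pairwise misclassification comparison; the binary case gets a
        # parallel coordinate plot instead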
        cmp_misclf_fig_path = os.path.join(outdir, 'compare_misclf_rates')
        if num_classes > 2:
            visualize.compare_misclf_pairwise(confusion_matrix, class_order,
                                              method_names,
                                              cmp_misclf_fig_path)
        elif num_classes == 2:
            visualize.compare_misclf_pairwise_parallel_coord_plot(
                confusion_matrix, class_order, method_names,
                cmp_misclf_fig_path)

        featimp_fig_path = os.path.join(outdir, 'feature_importance')
        visualize.feature_importance_map(feature_importances_rf, method_names,
                                         featimp_fig_path, feature_names)

        misclf_out_path = os.path.join(outdir, 'misclassified_subjects')
        visualize.freq_hist_misclassifications(num_times_misclfd,
                                               num_times_tested, method_names,
                                               misclf_out_path)
    except Exception:
        traceback.print_exc()
        warnings.warn('Error generating the visualizations - skipping them.')
Example #2

import os
import traceback

import numpy as np

# rhst, visualize and cfg (package configuration) are assumed to be
# modules of the surrounding package
def export_results(results_file_path, outdir):
    """
    Exports the results to a simpler CSV format for use in other packages.
    
    Parameters
    ----------
    results_file_path : str
        Path to the file containing the cross-validation results.
    outdir : str
        Output directory to save the CSV files to.

    Returns
    -------
    None
    
    """

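    # load and unpack the same result set that visualize_results consumes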
    dataset_paths, method_names, train_perc, num_repetitions, num_classes, \
        pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
        best_min_leaf_size, best_num_predictors, \
        feature_importances_rf, feature_names, \
        num_times_misclfd, num_times_tested, \
        confusion_matrix, class_order, accuracy_balanced, auc_weighted, positive_class = \
            rhst.load_results(results_file_path)

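    # infer the key dimensions from the 4D stack of confusion matrices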
    num_classes = confusion_matrix.shape[0]
    num_rep_cv = confusion_matrix.shape[2]
    num_datasets = confusion_matrix.shape[3]

    # separating CSVs from the PDFs
    exp_dir = os.path.join(outdir, cfg.EXPORT_DIR_NAME)
    if not os.path.exists(exp_dir):
        os.mkdir(exp_dir)

    # TODO think about how to export predictive probability per class per CV rep
    # pred_prob_per_class
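    # a hypothetical sketch, assuming each element of pred_prob_per_class is a
    # 2D array that np.savetxt can serialize directly:
    #   for mm in range(num_datasets):
    #       pp_path = os.path.join(exp_dir,
    #                              'pred_prob_{}.csv'.format(method_names[mm]))
    #       np.savetxt(pp_path, pred_prob_per_class[mm],
    #                  delimiter=cfg.DELIMITER, fmt=cfg.EXPORT_FORMAT)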

    try:
        balacc_path = os.path.join(exp_dir, 'balanced_accuracy.csv')
        np.savetxt(balacc_path,
                   accuracy_balanced,
                   delimiter=cfg.DELIMITER,
                   fmt=cfg.EXPORT_FORMAT,
                   header=','.join(method_names))

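        # flatten each num_classes x num_classes matrix so that, after the
        # transpose below, each CV repetition becomes one row of num_classes^2 values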
        cfmat_reshaped = np.reshape(
            confusion_matrix,
            [num_classes * num_classes, num_rep_cv, num_datasets])
        for mm in range(num_datasets):
            confmat_path = os.path.join(
                exp_dir, 'confusion_matrix_{}.csv'.format(method_names[mm]))
            np.savetxt(
                confmat_path,
                cfmat_reshaped[:, :, mm].T,  # NOTICE the transpose
                delimiter=cfg.DELIMITER,
                fmt=cfg.EXPORT_FORMAT,
                header='shape of confusion matrix: num_repetitions x num_classes^2')

        avg_cfmat, misclf_rate = visualize.compute_pairwise_misclf(
            confusion_matrix)
        num_datasets = misclf_rate.shape[0]
        for mm in range(num_datasets):
            cmp_misclf_path = os.path.join(
                exp_dir, 'average_misclassification_rates_{}.csv'.format(
                    method_names[mm]))
            np.savetxt(cmp_misclf_path,
                       misclf_rate[mm, :],
                       fmt=cfg.EXPORT_FORMAT,
                       delimiter=cfg.DELIMITER)

        for mm in range(num_datasets):
            featimp_path = os.path.join(
                exp_dir, 'feature_importance_{}.csv'.format(method_names[mm]))
            np.savetxt(featimp_path,
                       feature_importances_rf[mm],
                       fmt=cfg.EXPORT_FORMAT,
                       delimiter=cfg.DELIMITER,
                       header=','.join(feature_names[mm]))

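        # per-subject misclassification frequencies, one CSV per method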
        perc_misclsfd, _, _, _ = visualize.compute_perc_misclf_per_sample(
            num_times_misclfd, num_times_tested)
        for mm in range(num_datasets):
            subwise_misclf_path = os.path.join(
                exp_dir, 'subject_misclf_freq_{}.csv'.format(method_names[mm]))
            # TODO there must be a more elegant way to write dict to CSV
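            # a sketch of one alternative, assuming the stdlib csv module were
            # imported and cfg.DELIMITER is a single character:
            #   with open(subwise_misclf_path, 'w', newline='') as smf:
            #       csv.writer(smf, delimiter=cfg.DELIMITER).writerows(
            #           perc_misclsfd[mm].items())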
            with open(subwise_misclf_path, 'w') as smf:
                for sid, val in perc_misclsfd[mm].items():
                    smf.write('{}{}{}\n'.format(sid, cfg.DELIMITER, val))

    except Exception:
        traceback.print_exc()
        raise IOError('Unable to export the results to CSV files.')

    return
Example #3

import os

import numpy as np

# rhst, make_random_MLdataset and the module-level out_dir are assumed to be
# provided by the surrounding test setup
def test_chance_classifier_binary():

    rand_ds = make_random_MLdataset(max_num_classes=3,
                                    stratified=True,
                                    max_class_size=100,
                                    max_dim=100)

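    # keep only two of the classes: with random features, they are inseparable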
    out_path = os.path.join(out_dir, 'two_classes_random_features.pkl')
    rand_two_class = rand_ds.get_class(rand_ds.class_set[0:2])
    rand_two_class.save(out_path)

    out_list = os.path.join(out_dir, 'same_data_two_classes_list_datasets.txt')
    with open(out_list, 'w') as lf:
        lf.write(out_path)

    res_path = rhst.run(out_list, ['random'],
                        out_dir,
                        train_perc=0.5,
                        num_repetitions=50)

    dataset_paths, method_names, train_perc, num_repetitions, num_classes, \
        pred_prob_per_class, pred_labels_per_rep_fs, test_labels_per_rep, \
        best_min_leaf_size, best_num_predictors, \
        feature_importances_rf, feature_names, \
        num_times_misclfd, num_times_tested, \
        confusion_matrix, class_set, accuracy_balanced, auc_weighted, positive_class = rhst.load_results(res_path)

    # TODO replace the hard-coded chance accuracy with one computed programmatically from class sample sizes
    # assert np.median(accuracy_balanced) == np.median(rhst.chance_accuracy(class_sizes))
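    # with purely random features, median performance should stay near chance (0.5)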
    if abs(np.median(accuracy_balanced) - 0.5) > 0.05:
        raise ValueError(
            'Balanced accuracy for discriminating two inseparable classes '
            'differs significantly from the chance level of 0.5')

    if abs(np.median(auc_weighted) - 0.5) > 0.05:
        raise ValueError(
            'Weighted AUC for discriminating two inseparable classes '
            'differs significantly from the chance level of 0.5')