def plot_histograms(within_batch_corrs, between_batch_corrs, ncomps, batch_column, hist_folder, verbosity):
    """Plot within- vs. between-batch correlation densities and compare them.

    Both sets of correlations are binned into 40 equal-width bins spanning
    [-1, 1], drawn as density curves on one figure, and saved to the path
    returned by ``get_batch_effect_histogram_filename(hist_folder, ncomps)``.
    The two sets, with NaNs removed, are then compared with a Mann-Whitney
    U test.

    Args:
        within_batch_corrs: Within-batch correlations. May contain NaN.
            Must support boolean-mask indexing (e.g. a NumPy array).
        between_batch_corrs: Between-batch correlations; same requirements
            as ``within_batch_corrs``.
        ncomps: Number of components removed. Shown in the plot title and
            passed to the filename helper.
        batch_column: Batch label shown in the plot title.
        hist_folder: Passed to the filename helper to build the output path.
        verbosity: When >= 2, the NaN-free correlation arrays are printed.

    Returns:
        A ``(MW_AUC, MW_p)`` tuple: the U statistic returned by
        ``stats.mannwhitneyu`` divided by the product of the two NaN-free
        sample sizes, and the p-value returned by the same call.
    """
    filename = get_batch_effect_histogram_filename(hist_folder, ncomps)

    # 41 edges -> 40 bins of width 0.05 covering the full correlation range.
    hist_bins = np.linspace(-1, 1, 41)
    bin_centers = 0.5 * (hist_bins[1:] + hist_bins[:-1])
    # The explicit range is required by some numpy versions when the data
    # contain NaN: https://github.com/numpy/numpy/issues/7503
    hist_range = (hist_bins.min(), hist_bins.max())
    y_within_batch, _ = np.histogram(within_batch_corrs, bins=hist_bins, range=hist_range, density=True)
    y_between_batch, _ = np.histogram(between_batch_corrs, bins=hist_bins, range=hist_range, density=True)

    mean_within_batch_cor = np.nanmean(within_batch_corrs)
    mean_between_batch_cor = np.nanmean(between_batch_corrs)

    # NOTE(review): `color + '-'` assumes pt.color_cycle() yields matplotlib
    # format-string color codes (e.g. 'b') -- confirm against that helper.
    colors = pt.color_cycle()
    fig = plt.figure(figsize=(7, 7))
    try:
        ax = fig.add_subplot(1, 1, 1)
        # next(colors) instead of colors.next(): works on Python 2 and 3.
        color = next(colors)
        ax.plot(bin_centers, y_within_batch, color + '-', label='Within batch (mean = {:.2f})'.format(mean_within_batch_cor))
        color = next(colors)
        ax.plot(bin_centers, y_between_batch, color + '-', label='Between batch (mean = {:.2f})'.format(mean_between_batch_cor))
        ax.set_xlim([-1, 1])
        ax.set_xlabel('Average correlation within/between batch(es)')
        ax.set_ylabel('Density')
        # The legend sits outside the axes, so it must be passed as an extra
        # artist for the tight bounding box to include it.
        lgd = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, title="Correlation origin")
        ax.set_title('{} batch correlations,\n{} components removed'.format(batch_column, ncomps))
        fig.savefig(filename, bbox_extra_artists=[lgd], bbox_inches='tight')
    finally:
        # Release the figure; otherwise every call leaves one open in pyplot.
        plt.close(fig)

    # Compute a Mann-Whitney U statistic and p-value comparing the
    # distributions of within-batch and between-batch correlations
    # (Mann-Whitney U is equivalent to the AUC for a ROC curve comparing
    # the two classes).
    # NOTE(review): which U statistic and which p-value sidedness are returned
    # depends on the installed scipy version -- confirm before interpreting.
    within_batch_corrs_nanfree = within_batch_corrs[~np.isnan(within_batch_corrs)]
    between_batch_corrs_nanfree = between_batch_corrs[~np.isnan(between_batch_corrs)]
    MW_U, MW_p = stats.mannwhitneyu(within_batch_corrs_nanfree, between_batch_corrs_nanfree)
    if verbosity >= 2:
        print(within_batch_corrs_nanfree)
        print(between_batch_corrs_nanfree)
    # Normalize U by the number of (within, between) pairs.
    MW_AUC = MW_U / (np.size(within_batch_corrs_nanfree) * np.size(between_batch_corrs_nanfree))
    return MW_AUC, MW_p
# Example #2
# 0
def plot_t_stats(comps_removed, t_stats, batch_column, hist_folder):
    """Plot how the batch-correlation t-statistics change as components are removed.

    Draws the within-batch and between-batch t-statistics against the number
    of components (either LDA or SVD) removed, and saves the figure to the
    path returned by ``get_batch_effect_histogram_ttest_filename(hist_folder)``.

    Args:
        comps_removed: X-axis values: the number of components removed at
            each step.
        t_stats: 2-D array indexed as ``t_stats[:, 0]`` (plotted as
            "Within batch") and ``t_stats[:, 1]`` (plotted as
            "Between batch"), one row per entry of ``comps_removed``.
        batch_column: Batch label shown in the plot title.
        hist_folder: Passed to the filename helper to build the output path.
    """
    filename = get_batch_effect_histogram_ttest_filename(hist_folder)

    # NOTE(review): `color + '-'` assumes pt.color_cycle() yields matplotlib
    # format-string color codes (e.g. 'b') -- confirm against that helper.
    colors = pt.color_cycle()
    fig = plt.figure(figsize=(7, 7))
    try:
        ax = fig.add_subplot(1, 1, 1)
        # next(colors) instead of colors.next(): works on Python 2 and 3.
        color = next(colors)
        ax.plot(comps_removed, t_stats[:, 0], color + '-', label='Within batch')
        color = next(colors)
        ax.plot(comps_removed, t_stats[:, 1], color + '-', label='Between batch')
        ax.set_xlabel('Number of components removed')
        ax.set_ylabel(r'$t$-statistic' '\n' r'($H_0: \bar \mu = 0$)')
        # The legend sits outside the axes, so it must be passed as an extra
        # artist for the tight bounding box to include it.
        lgd = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, title='Correlation origin')
        ax.set_title(r'$t$-statistics of {}' '\nbatch correlation distributions'.format(batch_column))
        fig.savefig(filename, bbox_extra_artists=[lgd], bbox_inches='tight')
    finally:
        # Release the figure; otherwise every call leaves one open in pyplot.
        plt.close(fig)