Пример #1
0
def interaction_empirical_p_resample(a, b, surv, num_perm=101, check_first=True):
    '''
    Empirical p-value for the interaction of two events, built from
    index draws made with replacement.

    The observed interaction statistic is computed first.  When
    check_first is True and that statistic is negative, p = 1 is
    returned without drawing anything.  Otherwise num_perm re-drawn
    copies of a and b are scored against the survival data, and the
    p-value is the fraction of draws whose statistic is not less than
    or equal to the observed one.

    Returns a Series with entries 'p' and 'interaction'; the latter is
    'both' when the Fisher odds ratio of a and b exceeds 1, otherwise
    'neither'.
    '''
    a, b = match_series(a, b)
    odds_ratio = fisher_exact_test(a, b)['odds_ratio']
    int_direction = 'both' if odds_ratio > 1 else 'neither'

    observed = get_interaction(a, b, surv)
    if (observed < 0) and (check_first is True):
        return pd.Series({'p': 1, 'interaction': int_direction})

    # One row of labels per draw; np.random.choice samples with replacement.
    draws = np.random.choice(a.index, size=(num_perm, len(a.index)))

    def score(labels):
        # Everything is re-indexed to 0..n-1 so the inputs share one index.
        positions = range(len(labels))
        a_draw = pd.Series(list(a.ix[labels]), positions)
        b_draw = pd.Series(list(b.ix[labels]), positions)
        # Survival rows keep the original a.index order; only a and b
        # are re-drawn.
        surv_draw = pd.DataFrame(surv.unstack().ix[a.index].as_matrix(),
                                 index=positions,
                                 columns=['days', 'event']).stack()
        return get_interaction(a_draw, b_draw, surv_draw, int_direction)

    null_stats = pd.Series({i: score(labels)
                            for i, labels in enumerate(draws)})

    n_draws = len(null_stats)
    empirical_p = 1. * (n_draws - sum(null_stats <= observed)) / n_draws
    return pd.Series({'p': empirical_p, 'interaction': int_direction})
Пример #2
0
def box_plot_pandas(bin_vec, real_vec, ax=None):
    """
    Draw one box per label of bin_vec from the values in real_vec.

    Inputs
        bin_vec: Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ax: axes handed to init_ax

    Boxes follow the order of bin_vec.value_counts().  Fliers, caps and
    whiskers are hidden, and a box built from fewer than three
    observations is made fully transparent.
    """
    _, ax = init_ax(ax)
    bin_vec, real_vec = match_series(bin_vec, real_vec)
    categories = bin_vec.value_counts().index
    data = [real_vec[bin_vec == label] for label in categories]
    bp = ax.boxplot(data,
                    positions=range(len(categories)),
                    widths=.3,
                    patch_artist=True)

    # Axis labels are only set when the Series carry a name.
    if real_vec.name:
        ax.set_ylabel(real_vec.name)
    if bin_vec.name:
        ax.set_xlabel(bin_vec.name)

    for part in ('fliers', 'caps', 'whiskers'):
        for artist in bp[part]:
            artist.set_visible(False)

    for median in bp['medians']:
        median.set_color(colors[0])
        median.set_lw(3)
        median.set_alpha(.8)

    for pos, box in enumerate(bp['boxes']):
        box.set_color('grey')
        box.set_lw(3)
        box.set_alpha(.7)
        too_few = len(data[pos]) < 3
        if too_few:
            # Hide boxes that summarise fewer than three points.
            box.set_alpha(0)
Пример #3
0
def box_plot_pandas(bin_vec, real_vec, ax=None):
    """
    Box plot of real_vec split by the labels in bin_vec.

    Inputs
        bin_vec: Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ax: axes handed to init_ax

    One box is drawn per label, ordered as in bin_vec.value_counts().
    Only boxes and medians stay visible; boxes with fewer than three
    observations are made fully transparent.
    """
    _, ax = init_ax(ax)
    bin_vec, real_vec = match_series(bin_vec, real_vec)
    categories = bin_vec.value_counts().index
    data = [real_vec[bin_vec == level] for level in categories]
    n_boxes = len(categories)
    bp = ax.boxplot(data, positions=range(n_boxes), widths=.3,
                    patch_artist=True)

    # Unnamed Series leave the corresponding axis label untouched.
    if real_vec.name:
        ax.set_ylabel(real_vec.name)
    if bin_vec.name:
        ax.set_xlabel(bin_vec.name)

    for flier in bp['fliers']:
        flier.set_visible(False)
    for cap in bp['caps']:
        cap.set_visible(False)
    for whisker in bp['whiskers']:
        whisker.set_visible(False)

    for line in bp['medians']:
        line.set_color(colors[0])
        line.set_lw(3)
        line.set_alpha(.8)

    for idx, patch in enumerate(bp['boxes']):
        patch.set_color('grey')
        patch.set_lw(3)
        patch.set_alpha(.7)
        if len(data[idx]) < 3:
            # Too few observations for a meaningful box: hide it.
            patch.set_alpha(0)
Пример #4
0
def interaction_empirical_p(a, b, surv, num_perm=101):
    '''
    Empirical p-value for the interaction of two events, built from
    random permutations of the sample labels.

    The observed interaction statistic is compared with the statistics
    of num_perm permuted copies of a and b scored against the survival
    data.  Permutations whose statistic is missing are dropped, and the
    p-value is the fraction of the remaining ones whose statistic is
    not less than or equal to the observed one.

    Returns a Series with entries 'p' and 'interaction'; the latter is
    'both' when the Fisher odds ratio of a and b exceeds 1, otherwise
    'neither'.
    '''
    a, b = match_series(a, b)
    odds_ratio = fisher_exact_test(a, b)['odds_ratio']
    int_direction = 'both' if odds_ratio > 1 else 'neither'

    observed = get_interaction(a, b, surv)

    # All label permutations are generated up front, one per row.
    shuffles = np.array([np.random.permutation(a.index)
                         for _ in range(num_perm)])

    def score(labels):
        # Everything is re-indexed to 0..n-1 so the inputs share one index.
        positions = range(len(labels))
        a_perm = pd.Series(list(a.ix[labels]), positions)
        b_perm = pd.Series(list(b.ix[labels]), positions)
        # Survival rows keep the original a.index order; only a and b
        # are shuffled.
        surv_perm = pd.DataFrame(surv.unstack().ix[a.index].as_matrix(),
                                 index=positions,
                                 columns=['days', 'event']).stack()
        return get_interaction(a_perm, b_perm, surv_perm, int_direction)

    null_stats = pd.Series({i: score(labels)
                            for i, labels in enumerate(shuffles)}).dropna()

    n_valid = len(null_stats)
    empirical_p = 1. * (n_valid - sum(null_stats <= observed)) / n_valid
    return pd.Series({'p': empirical_p, 'interaction': int_direction})
Пример #5
0
def pearson_p(a, b):
    '''
    Return the p-value of Pearson's correlation between two Series.
    ------------------------------------------------
    a, b: Series with continuous measurements

    Missing values are dropped from each Series before the two are
    aligned with match_series.
    '''
    a_obs, b_obs = match_series(a.dropna(), b.dropna())
    _, p_value = pearsonr(a_obs, b_obs)
    return p_value
Пример #6
0
def violin_plot_pandas(bin_vec,
                       real_vec,
                       ann='p',
                       order=None,
                       ax=None,
                       filename=None):
    """
    http://pyinsci.blogspot.com/2009/09/violin-plot-with-matplotlib.html
    Wrapper around matplotlib's boxplot function to add violin profile.

    Inputs
        bin_vec: Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ann: annotation mode.  'p' writes the Kruskal p-value in plain
             text, 'p_fancy' writes it through latex_float, None writes
             nothing, and any other value is written verbatim.
        order: optional sequence of labels fixing the violin order;
               defaults to the order of bin_vec.value_counts()
        ax: axes handed to init_ax
        filename: if given, the figure is saved to this path

    If drawing the violins fails, a plain box plot is drawn instead.
    """
    fig, ax = init_ax(ax)
    ax.set_ylabel(real_vec.name)
    ax.set_xlabel(bin_vec.name)
    bin_vec, real_vec = match_series(bin_vec, real_vec)
    try:
        if order is None:
            categories = bin_vec.value_counts().index
        else:
            categories = order
        _violin_plot(ax, [real_vec[bin_vec == num] for num in categories],
                     pos=categories,
                     bp=True)
        ax.set_xticklabels(
            [str(c) + '\n(n=%i)' % sum(bin_vec == c) for c in categories])
    except Exception:
        # Best-effort fallback; KeyboardInterrupt/SystemExit are no
        # longer swallowed here.
        box_plot_pandas(bin_vec, real_vec, ax=ax)

    p_value = Stats.kruskal_pandas(bin_vec, real_vec)['p']
    # A single if/elif chain: previously 'p_fancy' also fell through to
    # the free-text branch and printed the literal string 'p_fancy'.
    if ann == 'p_fancy':
        ax.annotate('$p = {}$'.format(latex_float(p_value)), (.95, -.02),
                    xycoords='axes fraction',
                    ha='right',
                    va='bottom',
                    size=14)
    elif ann == 'p':
        ax.annotate('p = {0:.1e}'.format(p_value), (.95, .02),
                    xycoords='axes fraction',
                    ha='right',
                    va='bottom',
                    size=12)
    elif ann is not None:
        ax.annotate(ann, (.95, .02),
                    xycoords='axes fraction',
                    ha='right',
                    va='bottom',
                    size=12)
    if filename is not None:
        fig.savefig(filename)
    return
Пример #7
0
def kruskal_p(hit_vec, response_vec, min_size=5):
    '''
    Run kruskal on response_vec grouped by hit_vec and return the
    second element of its result (the p-value, assuming kruskal is
    scipy.stats.kruskal -- confirm against the file's imports).
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: currently unused; kept for interface compatibility

    Returns nan if the test cannot be computed.
    '''
    try:
        hit_vec, response_vec = match_series(hit_vec, response_vec)
        groups = [response_vec[hit_vec == num] for num in hit_vec.unique()]
        return kruskal(*groups)[1]
    except Exception:
        # Best-effort: any failure of the test yields nan, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        return nan
Пример #8
0
def bartlett_pandas(group_vec, response_vec, min_size=5):
    '''
    Run bartlett on response_vec grouped by the labels in group_vec.
    ------------------------------------------------
    group_vec: Series of labels
    response_vec: Series of measurements
    min_size: smallest group size allowed; below it nan is returned

    Returns a Series indexed ['T', 'p'], or nan when the smallest
    group in group_vec has fewer than min_size members.
    '''
    smallest_group = group_vec.value_counts().min()
    if smallest_group < min_size:
        return nan
    group_vec, response_vec = match_series(group_vec, response_vec)
    samples = [response_vec[group_vec == label]
               for label in group_vec.unique()]
    return pd.Series(bartlett(*samples), index=['T', 'p'])
Пример #9
0
def pearson_pandas(a, b, min_size=5):
    '''
    Run stats.pearsonr on two aligned Series.
    ------------------------------------------------
    a, b: Series of measurements, aligned with match_series
    min_size: currently unused; kept for interface compatibility

    Returns a Series indexed ['rho', 'p'].  If the computation fails,
    both entries are NaN.
    '''
    try:
        a, b = match_series(a, b)
        res = stats.pearsonr(a, b)
        return pd.Series(res, index=['rho', 'p'])
    except Exception:
        # Best-effort: a failed test yields an all-NaN result, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        # dtype is pinned so the fallback is float NaN on every pandas
        # version.
        return pd.Series(index=['rho', 'p'], dtype=float)
Пример #10
0
def anova(hit_vec, response_vec, min_size=5):
    '''
    Wrapper to do a one way anova on pandas Series
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: smallest group size allowed; below it nan is returned

    Returns a Series indexed ['F', 'p'], or nan when the smallest
    group in hit_vec has fewer than min_size members.
    '''
    # .min() must be called: comparing the bound method itself to
    # min_size never triggers the guard (and raises on Python 3).
    if hit_vec.value_counts().min() < min_size:
        return nan
    hit_vec, response_vec = match_series(hit_vec, response_vec)
    res = f_oneway(*[response_vec[hit_vec == num] for num in
                     hit_vec.unique()])
    return pd.Series(res, index=['F', 'p'])
Пример #11
0
def kruskal_pandas(hit_vec, response_vec, min_size=5):
    '''
    Run kruskal on response_vec grouped by the labels in hit_vec.
    ------------------------------------------------
    hit_vec: Series of labels
    response_vec: Series of measurements
    min_size: currently unused; kept for interface compatibility

    Returns a Series indexed ['H', 'p'].  If the computation fails,
    both entries are NaN.
    '''
    try:
        hit_vec, response_vec = match_series(hit_vec, response_vec)
        res = kruskal(*[response_vec[hit_vec == num] for num in
                        hit_vec.unique()])
        return pd.Series(res, index=['H', 'p'])
    except Exception:
        # Best-effort: a failed test yields an all-NaN result, but
        # KeyboardInterrupt/SystemExit are allowed to propagate.
        # dtype is pinned so the fallback is float NaN on every pandas
        # version.
        return pd.Series(index=['H', 'p'], dtype=float)
Пример #12
0
def single_gene_section(cancer, hit_matrix, cutoff=.25):
    """
    Build the 'Gene Mutations' report subsection for a cancer object.

    The full gene table (cancer.q_genes joined with per-gene patient
    counts and gene names) is written to gene_table.csv in
    cancer.report_folder.  The table shown in the report keeps only
    genes with at least one value below .2 in cancer.q_genes, limited
    to the first 20 rows.  Age violin plots and survival curves are
    attached to table cells whose value is below cutoff.
    """
    # Format data for report
    path = cancer.report_folder + '/'
    gene_table_file = path + 'gene_table.csv'

    # Make index unique
    hit_matrix = hit_matrix.groupby(level=0).first()

    n_patients = (hit_matrix.ix[:, cancer.patients] > 0).sum(1)
    n_patients.name = 'n_patients'
    gene_names = Series({g: g for g in cancer.q_genes.index}, name='gene')

    # Join, then reverse the column order.
    gene_table = cancer.q_genes.join(n_patients).join(gene_names).ix[:, ::-1]
    if 'survival' in gene_table:
        gene_table = gene_table.sort(columns='survival')
    gene_table.to_csv(gene_table_file)

    any_below = (cancer.q_genes < .2).sum(1) > 0
    genes_to_show = cancer.q_genes[any_below].index
    gene_table = gene_table.ix[genes_to_show]
    if 'survival' in gene_table:
        gene_table = gene_table.sort(columns='survival')
    gene_table = gene_table.head(20)
    gene_table_r = com.convert_to_r_dataframe(gene_table)  # @UndefinedVariable

    if len(gene_table) == 0:
        empty = nz.newParagraph('')
        return nz.addTo(nz.newSubSection('Gene Mutations'), empty)

    # Overview
    caption = "Association of gene mutations with patient clinical features."
    table1 = nz.newTable(gene_table_r, caption, file=gene_table_file,
                         significantDigits=2)

    # Fill in the details: 1-based row/column positions of the table cells.
    gene_pos = {gene: row + 1 for row, gene in enumerate(gene_table.index)}
    col_pos = {name: col + 1 for col, name in enumerate(gene_table.columns)}

    # age violin plots
    if 'age' in gene_table:
        for gene, q_val in gene_table['age'].iteritems():
            matched_hits = match_series(hit_matrix.ix[gene],
                                        cancer.clinical.age)[0]
            n_hit = (matched_hits > 0).sum()
            if q_val < cutoff and n_hit > 2:
                cell = (gene_pos[gene], col_pos['age'])
                table1 = add_violin_plot(hit_matrix.ix[gene], cancer, table1,
                                         cell, path + FIG_EXT)

    # survival curves
    if 'survival' in gene_table:
        for gene, q_val in gene_table['survival'].iteritems():
            if q_val < cutoff:
                cell = (gene_pos[gene], col_pos['survival'])
                table1 = add_survival_curve(hit_matrix.ix[gene], cancer,
                                            table1, cell, path + FIG_EXT)

    return nz.addTo(nz.newSubSection('Gene Mutations'), table1)
Пример #13
0
def series_scatter(s1, s2, ax=None, ann='p', filename=None, **plot_args):
    """
    Scatter plot of two Series after aligning them with match_series.

    s1, s2: Series plotted on the x and y axis; their names label the axes
    ax: axes handed to init_ax
    ann: 'p' annotates the Spearman p-value in plain text, 'fancy_p'
         formats it through latex_float; other values add no annotation
    filename: if given, the figure is saved to this path
    plot_args: forwarded to ax.scatter; 's' defaults to 75 and 'alpha'
               to .5 when not supplied
    """
    fig, ax = init_ax(ax, figsize=(6, 4))
    plot_args.setdefault('s', 75)
    plot_args.setdefault('alpha', .5)

    x_vals, y_vals = match_series(s1, s2)
    ax.scatter(x_vals, y_vals, **plot_args)
    ax.set_xlabel(s1.name)
    ax.set_ylabel(s2.name)

    where = (.95, -.02)
    if ann == 'p':
        p_val = Tests.spearman_pandas(s1, s2)['p']
        ax.annotate('p = {0:.1e}'.format(p_val), where,
                    xycoords='axes fraction', ha='right', va='bottom',
                    size=14)
    if ann == 'fancy_p':
        p_val = Tests.spearman_pandas(s1, s2)['p']
        ax.annotate('$p = {}$'.format(latex_float(p_val)), where,
                    xycoords='axes fraction', ha='right', va='bottom',
                    size=14)

    if filename is not None:
        fig.savefig(filename)
Пример #14
0
def series_scatter(s1, s2, ax=None, ann='p', filename=None, **plot_args):
    """
    Draw s1 against s2 as a scatter plot, optionally annotated with the
    Spearman p-value.

    s1, s2: Series plotted on the x and y axis; their names label the axes
    ax: axes handed to init_ax
    ann: 'p' for a plain-text p-value, 'fancy_p' for one formatted by
         latex_float; other values add no annotation
    filename: if given, the figure is saved to this path
    plot_args: forwarded to ax.scatter; 's' defaults to 75 and 'alpha'
               to .5 when not supplied
    """
    fig, ax = init_ax(ax, figsize=(6, 4))
    for key, default in (('s', 75), ('alpha', .5)):
        if key not in plot_args:
            plot_args[key] = default

    matched = match_series(s1, s2)
    ax.scatter(*matched, **plot_args)
    ax.set_xlabel(s1.name)
    ax.set_ylabel(s2.name)

    # Keyword arguments shared by both annotation styles.
    text_opts = dict(xycoords='axes fraction', ha='right', va='bottom',
                     size=14)
    if ann == 'p':
        label = 'p = {0:.1e}'.format(Tests.spearman_pandas(s1, s2)['p'])
        ax.annotate(label, (.95, -.02), **text_opts)
    if ann == 'fancy_p':
        label = '$p = {}$'.format(
            latex_float(Tests.spearman_pandas(s1, s2)['p']))
        ax.annotate(label, (.95, -.02), **text_opts)

    if filename is not None:
        fig.savefig(filename)
Пример #15
0
def pathway_mutation_section(cancer, gene_sets, cutoff=.25):
    """
    Build the 'Pathway Mutations' report subsection for a cancer object.

    The full pathway table from format_pathway_table is written to
    pathway_table.csv in cancer.report_folder.  The table shown in the
    report keeps only pathways with at least one value below .25 in
    cancer.q_pathways, limited to the first 20 rows.  Age violin plots
    and survival curves are attached to table cells whose value is
    below cutoff.
    """
    # Format data for report
    path = cancer.report_folder + '/'
    pathway_table_file = path + 'pathway_table.csv'
    pathway_table = format_pathway_table(cancer, gene_sets)
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table.to_csv(pathway_table_file)
    keepers = cancer.q_pathways[(cancer.q_pathways < .25).sum(1) > 0].index
    pathway_table = pathway_table.ix[keepers]
    if 'survival' in pathway_table:
        pathway_table = pathway_table.sort(columns='survival')
    pathway_table = pathway_table.head(20)
    # NOTE(review): missing values are replaced by the placeholder 1.23
    # before conversion to R -- confirm this sentinel is intended.
    pathway_table_r = com.convert_to_r_dataframe(pathway_table.replace(nan, 1.23))  # @UndefinedVariable
    if len(pathway_table) == 0:
        return nz.addTo(nz.newSubSection('Pathway Mutations'), nz.newParagraph(''))

    # Overview
    # The two halves used to be joined without a space, which rendered
    # as "patientclinical features."
    tableCaption1 = ('Association of pathway level mutations with patient ' +
                     'clinical features.')
    table1 = nz.newTable(pathway_table_r, tableCaption1, file=pathway_table_file,
                         significantDigits=2)

    # Fill in the details: 1-based row/column positions of the table cells.
    pathway_pos = dict((p, i + 1) for i, p in enumerate(pathway_table.index))
    col_pos = dict((c, i + 1) for i, c in enumerate(pathway_table.columns))

    # age violin plots
    if 'age' in pathway_table:
        for g, val in pathway_table['age'].iteritems():
            num_patients = (match_series(cancer.meta_matrix.ix[g], cancer.clinical.age)[0] > 0).sum()
            if val < cutoff and num_patients > 2:
                table1 = add_violin_plot(cancer.meta_matrix.ix[g], cancer, table1,
                                         (pathway_pos[g], col_pos['age']),
                                         path + FIG_EXT)

    # survival curves
    if 'survival' in pathway_table:
        for g, val in pathway_table['survival'].iteritems():
            if val < cutoff:
                table1 = add_survival_curve_pathway(cancer.meta_matrix.ix[g], cancer, table1,
                                                    (pathway_pos[g], col_pos['survival']),
                                                    path + FIG_EXT)

    section = nz.addTo(nz.newSubSection('Pathway Mutations'), table1)
    return section
Пример #16
0
def violin_plot_pandas(bin_vec, real_vec, ann='p', order=None, ax=None,
                       filename=None):
    """
    http://pyinsci.blogspot.com/2009/09/violin-plot-with-matplotlib.html
    Wrapper around matplotlib's boxplot function to add violin profile.

    Inputs
        bin_vec: Series of labels
        real_vec: Series of measurements to be grouped according to bin_vec
        ann: annotation mode.  'p' writes the Kruskal p-value in plain
             text, 'p_fancy' writes it through latex_float, None writes
             nothing, and any other value is written verbatim.
        order: optional sequence of labels fixing the violin order;
               defaults to the order of bin_vec.value_counts()
        ax: axes handed to init_ax
        filename: if given, the figure is saved to this path

    If drawing the violins fails, a plain box plot is drawn instead.
    """
    fig, ax = init_ax(ax)
    ax.set_ylabel(real_vec.name)
    ax.set_xlabel(bin_vec.name)
    bin_vec, real_vec = match_series(bin_vec, real_vec)
    try:
        if order is None:
            categories = bin_vec.value_counts().index
        else:
            categories = order
        _violin_plot(ax, [real_vec[bin_vec == num] for num in categories],
                     pos=categories, bp=True)
        ax.set_xticklabels([str(c) + '\n(n=%i)' % sum(bin_vec == c)
                            for c in categories])
    except Exception:
        # Best-effort fallback; KeyboardInterrupt/SystemExit are no
        # longer swallowed here.
        box_plot_pandas(bin_vec, real_vec, ax=ax)

    p_value = Stats.kruskal_pandas(bin_vec, real_vec)['p']
    # A single if/elif chain: previously 'p_fancy' also fell through to
    # the free-text branch and printed the literal string 'p_fancy'.
    if ann == 'p_fancy':
        ax.annotate('$p = {}$'.format(latex_float(p_value)), (.95, -.02),
                    xycoords='axes fraction', ha='right', va='bottom', size=14)
    elif ann == 'p':
        ax.annotate('p = {0:.1e}'.format(p_value), (.95, .02),
                    xycoords='axes fraction', ha='right', va='bottom', size=12)
    elif ann is not None:
        ax.annotate(ann, (.95, .02), xycoords='axes fraction', ha='right',
                    va='bottom', size=12)
    if filename is not None:
        fig.savefig(filename)
    return
Пример #17
0
 def test(hit_vec):
     # Runs f_oneway on self.response_vec grouped by the labels in hit_vec
     # and returns the first two result entries as 'stat' and 'p'.
     labels, response = match_series(hit_vec, self.response_vec)
     groups = [response[labels == value] for value in labels.unique()]
     outcome = f_oneway(*groups)
     return Series({'stat': outcome[0], 'p': outcome[1]})