Exemplo n.º 1
0
def cterm_dimotif_enrichment(df, thresh):
    """Test gapped two-residue C-terminal patterns for enrichment in the top rows.

    Each pattern is the six-character string ``'xxxxxx'`` with two positions,
    indexed from the C-terminal end as ``-i`` and ``-j`` (``1 <= i < j <= 6``),
    replaced by residues taken from ``degron_pred.vocab``. For every pattern
    the per-peptide counts returned by ``motif_count`` are summed over the
    first ``thresh`` rows and compared, with a one-sided binomial test, against
    the rate observed over the whole table.

    NOTE(review): ``'x'`` presumably acts as a wildcard inside ``motif_count``;
    confirm against that helper.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Peptide amino acid sequence'`` column. The first
        ``thresh`` rows form the "top" group, so the caller is presumably
        expected to pass the table already sorted by the score of interest.
    thresh : int
        Number of leading rows that form the top group; also used as the
        number of trials of the binomial test.

    Returns
    -------
    pandas.DataFrame
        One row per tested pattern with the columns ``motif``, ``pos1``,
        ``pos2``, ``aa1``, ``aa2``, ``top ct``, ``bot ct``, ``top total``,
        ``bot total``, ``pvalue``, ``log(OR)``, ``adjusted pvalue``
        (Bonferroni, capped at 1) and ``qvalue`` (from ``pvalue.bh_fdr``),
        sorted by ascending ``pvalue``.
    """

    def binom_pvalue(successes, trials, prob):
        # ``scipy.stats.binom_test`` is gone from recent SciPy releases;
        # ``binomtest`` is its replacement. Fall back to the old function
        # only when running against an older SciPy that lacks ``binomtest``.
        if hasattr(stats, 'binomtest'):
            return stats.binomtest(int(successes),
                                   n=int(trials),
                                   p=prob,
                                   alternative='greater').pvalue
        return stats.binom_test(successes,
                                n=trials,
                                p=prob,
                                alternative='greater')

    aa_vocab = degron_pred.vocab
    peptides = df['Peptide amino acid sequence']
    template = 'xxxxxx'
    rows = []
    for i in range(1, 6):
        for j in range(i + 1, 7):
            for aa1 in aa_vocab:
                for aa2 in aa_vocab:
                    # place the two residues at C-terminal offsets -i and -j
                    chars = list(template)
                    chars[-i] = aa1
                    chars[-j] = aa2
                    pattern = ''.join(chars)

                    counts = peptides.apply(motif_count,
                                            args=([pattern], 'cterm'))
                    top_ct = counts.iloc[:thresh].sum()
                    # The background rate is estimated from the whole table,
                    # i.e. the top rows are included in the background.
                    all_ct = counts.sum()
                    n_all = len(counts)
                    pval = binom_pvalue(top_ct, thresh, all_ct / n_all)
                    rows.append([
                        pattern, i, j, aa1, aa2, top_ct, all_ct, thresh,
                        n_all, pval
                    ])

    # compile results
    mycols = [
        'motif', 'pos1', 'pos2', 'aa1', 'aa2', 'top ct', 'bot ct', 'top total',
        'bot total', 'pvalue'
    ]
    result = pd.DataFrame(rows, columns=mycols)
    top_odds = result['top ct'] / (result['top total'] - result['top ct'])
    bot_odds = result['bot ct'] / (result['bot total'] - result['bot ct'])
    result['log(OR)'] = np.log2(top_odds / bot_odds)
    # Bonferroni correction; a probability cannot exceed 1, so cap it.
    result['adjusted pvalue'] = (result['pvalue'] * len(result)).clip(upper=1)
    result['qvalue'] = pvalue.bh_fdr(result['pvalue'])
    result.sort_values('pvalue', inplace=True)

    return result
Exemplo n.º 2
0
def cterm_quadmotif_enrichment(df, thresh):
    """Test every contiguous four-residue motif for enrichment in the top rows.

    For each four-letter combination of residues from ``degron_pred.vocab``
    the per-peptide counts returned by ``motif_count_nopos`` are summed over
    the first ``thresh`` rows. That count is compared, with a one-sided
    binomial test using ``thresh * 3`` trials, against a background
    probability built as a chain of conditional probabilities::

        P(a1) * P(a1a2)/P(a1) * P(a2a3)/P(a2) * P(a3a4)/P(a3)

    where single-residue frequencies are normalised by ``len(df) * 6`` and
    dipeptide frequencies by ``len(df) * 5``, both estimated over the whole
    table (top rows included).

    NOTE(review): the 6 / 5 / 3 factors are consistent with
    ``motif_count_nopos`` scanning a six-residue C-terminal window (6 single
    positions, 5 dipeptide positions, 3 four-mer positions); confirm against
    that helper.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Peptide amino acid sequence'`` column. The first
        ``thresh`` rows form the "top" group, so the caller is presumably
        expected to pass the table already sorted by the score of interest.
    thresh : int
        Number of leading rows that form the top group.

    Returns
    -------
    pandas.DataFrame
        One row per motif with the columns ``motif``, ``top ct``,
        ``background p``, ``top total``, ``bot total``, ``pvalue``,
        ``log(OR)``, ``adjusted pvalue`` (Bonferroni, capped at 1) and
        ``qvalue`` (from ``pvalue.bh_fdr``), sorted by ascending ``pvalue``.
    """

    def binom_pvalue(successes, trials, prob):
        # ``scipy.stats.binom_test`` is gone from recent SciPy releases;
        # ``binomtest`` is its replacement. Fall back to the old function
        # only when running against an older SciPy that lacks ``binomtest``.
        if hasattr(stats, 'binomtest'):
            return stats.binomtest(int(successes),
                                   n=int(trials),
                                   p=prob,
                                   alternative='greater').pvalue
        return stats.binom_test(successes,
                                n=trials,
                                p=prob,
                                alternative='greater')

    aa_vocab = degron_pred.vocab
    peptides = df['Peptide amino acid sequence']
    bot_n = len(df)

    # background probability of each single residue
    aa_bg = {}
    for aa in aa_vocab:
        aa_counts = peptides.apply(motif_count_nopos, args=([aa], 'cterm'))
        aa_bg[aa] = (aa_counts.sum()) / (bot_n * 6)

    # Dipeptide background probabilities depend only on the dipeptide, not on
    # the surrounding four-mer, so compute each one once and reuse it instead
    # of re-scanning the whole table three times per four-mer.
    di_bg = {}

    def dipeptide_prob(pair):
        if pair not in di_bg:
            pair_counts = peptides.apply(motif_count_nopos,
                                         args=([pair], 'cterm'))
            di_bg[pair] = (pair_counts.sum()) / (bot_n * 5)
        return di_bg[pair]

    top_n = thresh * 3
    rows = []
    for aa1 in aa_vocab:
        for aa2 in aa_vocab:
            for aa3 in aa_vocab:
                for aa4 in aa_vocab:
                    pattern = ''.join([aa1, aa2, aa3, aa4])
                    # observed count of the full motif in the top rows
                    counts = peptides.apply(motif_count_nopos,
                                            args=([pattern], 'cterm'))
                    top_ct = counts.iloc[:thresh].sum()

                    # background: P(a1) * P(a2|a1) * P(a3|a2) * P(a4|a3),
                    # each conditional being dipeptide prob / residue prob
                    prob_1 = aa_bg[pattern[0]]
                    prob_2 = dipeptide_prob(pattern[:2]) / prob_1
                    prob_3 = dipeptide_prob(pattern[1:3]) / aa_bg[pattern[1]]
                    prob_4 = dipeptide_prob(pattern[2:4]) / aa_bg[pattern[2]]
                    bottom_prob = prob_1 * prob_2 * prob_3 * prob_4

                    # measure significance
                    pval = binom_pvalue(top_ct, top_n, bottom_prob)
                    rows.append(
                        [pattern, top_ct, bottom_prob, thresh, bot_n, pval])

    # compile results
    mycols = [
        'motif', 'top ct', 'background p', 'top total', 'bot total', 'pvalue'
    ]
    result = pd.DataFrame(rows, columns=mycols)
    # NOTE(review): the odds below use 'top total' (= thresh) as the number of
    # trials, whereas the binomial test above uses thresh * 3; confirm which
    # denominator is intended before relying on 'log(OR)'.
    top_odds = result['top ct'] / (result['top total'] - result['top ct'])
    bg_odds = result['background p'] / (1 - result['background p'])
    result['log(OR)'] = np.log2(top_odds / bg_odds)
    # Bonferroni correction, capped at 1
    result['adjusted pvalue'] = result['pvalue'] * len(result)
    result.loc[result['adjusted pvalue'] > 1, 'adjusted pvalue'] = 1
    result['qvalue'] = pvalue.bh_fdr(result['pvalue'])
    result.sort_values('pvalue', inplace=True)

    return result