Exemplo n.º 1
0
def analyze_mi_tests(prok_tests, euk_tests):
    """Plot the proportion of significant column-pair tests against distance.

    For each of the two test collections the significance threshold is
    ``fdr(concat(tests))``.  Every test value is paired with the distance
    ``j - i`` between the two column indices it belongs to, and for each
    distance from 1 to 19 the fraction of tests at or below the threshold is
    drawn with a bootstrap error bar on the current matplotlib figure.

    The motifs are read from the module-level names ``prok_motifs`` and
    ``euk_motifs``; they are assumed to be aligned with ``prok_tests`` and
    ``euk_tests`` respectively (same motifs, same pair order) -- TODO confirm
    against the code that produces the tests.

    Distances with no tests are plotted as 0 with a zero-width error bar,
    matching ``analyze_mi_tests2``, instead of failing on an empty list.
    """
    def binom_ci(xs):
        """return width of error bar"""
        bs_means = sorted([mean(bs(xs)) for x in range(1000)])
        mu = mean(xs)
        # With 1000 sorted bootstrap means, indices 25 and 975 bracket the
        # central 95% of the resampled values.
        return (mu - bs_means[25], bs_means[975] - mu)

    def plot_series(tests, motifs, label):
        """Draw one error-bar series for a (tests, motifs) collection."""
        # Flatten once; the original re-flattened for every distance.
        flat_tests = concat(tests)
        q = fdr(flat_tests)
        flat_ds = concat([[j - i for (i, coli), (j, colj)
                           in choose2(list(enumerate(transpose(motif))))]
                          for motif in motifs])
        hits_by_dist = {dist: [t <= q for t, d in zip(flat_tests, flat_ds)
                               if d == dist]
                        for dist in range(1, 20)}
        # trange keeps the progress display over the slow bootstrap step.
        cis = [binom_ci(hits_by_dist[dist]) if hits_by_dist[dist] else (0, 0)
               for dist in trange(1, 20)]
        means = [mean(hits_by_dist[dist]) if hits_by_dist[dist] else 0
                 for dist in range(1, 20)]
        plt.errorbar(range(1, 20), means, yerr=transpose(cis),
                     label=label, capthick=1)

    plot_series(prok_tests, prok_motifs, "Prokaryotic Motifs")
    plot_series(euk_tests, euk_motifs, "Eukaryotic Motifs")
    plt.xlabel("Distance (bp)",fontsize="large")
    plt.ylabel("Proportion of Significant Correlations",fontsize="large")
    plt.legend(fontsize='large')
Exemplo n.º 2
0
def sanity_check_analyze_correlated_digrams(motifs):
    """Count digrams over all column pairs and over adjacent column pairs.

    For every motif, each pair of indexed columns produced by ``choose2`` is
    walked entry by entry (via ``transpose`` of the two columns) and the
    resulting ``(bi, bj)`` tuple is tallied.

    Returns a pair of ``defaultdict(int)``: counts over every column pair, and
    counts restricted to pairs whose column indices differ by exactly one.
    """
    all_pair_counts = defaultdict(int)
    adjacent_pair_counts = defaultdict(int)
    for sites in motifs:
        indexed_columns = list(enumerate(transpose(sites)))
        for (left_pos, left_col), (right_pos, right_col) in choose2(indexed_columns):
            neighbours = right_pos == left_pos + 1
            for base_left, base_right in transpose((left_col, right_col)):
                digram = (base_left, base_right)
                all_pair_counts[digram] += 1
                if neighbours:
                    adjacent_pair_counts[digram] += 1
    return all_pair_counts, adjacent_pair_counts
Exemplo n.º 3
0
def maxent_motif(N,
                 L,
                 desired_ic,
                 tolerance=10**-10,
                 beta=None,
                 verbose=False,
                 A=4):
    """sample motif from max ent distribution with mean desired_ic"""
    # first we adjust the desired ic upwards so that when motif_ic is
    # called with 1st order correction, we get the desired ic.
    if beta is None:
        if verbose:
            print "finding beta"
        correction_per_col = (A - 1) / (2 * log(2) * N)
        desired_ic += L * correction_per_col
        beta = find_beta_for_mean_motif_ic(N,
                                           L,
                                           desired_ic,
                                           tolerance=tolerance,
                                           verbose=verbose,
                                           A=A)
    ps = count_ps_from_beta(N, beta, A=A)
    count_sampler = inverse_cdf_sampler(enumerate_counts(N, A), ps)
    counts = [count_sampler() for i in range(L)]
    cols = [sample_col_from_count(count, A=A) for count in counts]
    return map(lambda site: "".join(site), transpose(cols))
Exemplo n.º 4
0
def uniform_motif(N,
                  L,
                  desired_ic,
                  epsilon=0.1,
                  beta=None,
                  ps=None,
                  count_sampler=None,
                  verbose=False):
    if verbose: print "uniform motif accept reject:", N, L, desired_ic, beta
    correction_per_col = 3 / (2 * log(2) * N)
    desired_ic_for_beta = desired_ic + L * correction_per_col
    if desired_ic_for_beta == 2 * L:  # if we reach the upper limit, things break down
        cols = [sample_col_from_count((0, 0, 0, N)) for _ in range(L)]
        motif_p = map(lambda site: "".join(site), transpose(cols))
        return motif_p
    if beta is None:
        beta = find_beta_for_mean_motif_ic(N, L, desired_ic_for_beta)
        if verbose:
            print "beta:", beta
    if ps is None:
        ps = count_ps_from_beta(N, beta)
    if count_sampler is None:
        count_sampler = inverse_cdf_sampler(enumerate_counts(N), ps)

    def rQ_raw():
        counts = [count_sampler() for i in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        motif_p = map(lambda site: "".join(site), transpose(cols))
        return motif_p

    def rQ():
        return sample_until(lambda M: inrange(M, desired_ic, epsilon),
                            rQ_raw,
                            1,
                            progress_bar=False)[0]

    def dQhat(motif):
        return exp(beta * motif_ic(motif))

    Imin = desired_ic - epsilon
    Imax = desired_ic + epsilon
    log_M = -beta * Imin
    if verbose: print "Imin, Imax, log_M:", Imin, Imax, log_M

    def dQ(motif):
        return exp(beta * motif_ic(motif) + log_M)

    def AR(motif):
        return 1.0 / dQ(motif)

    #M = exp(-beta*(desired_ic - epsilon)) # which ic? +/- correction
    trials = 0
    while True:
        trials += 1
        motif = rQ()
        r = random.random()
        if r < AR(motif):
            return motif
        if verbose and trials % 100 == 0:
            print trials, AR(motif)
Exemplo n.º 5
0
def analyze_mi_tests2(tests, motifs, q=None, label=None):
    """Plot the proportion of significant column-pair tests against distance.

    Each test value is paired with the distance ``j - i`` between the two
    column indices it belongs to.  For every distance from 1 to 19, the
    fraction of tests at or below ``q`` is drawn as one error-bar series on
    the current matplotlib figure.  ``tests`` and ``motifs`` are assumed to
    be aligned (same motifs, same pair order) -- TODO confirm against the
    code that produces the tests.

    q     -- significance threshold.  When ``None`` it is derived from the
             supplied tests with ``fdr``.  A caller-supplied value is now
             honoured; it used to be overwritten unconditionally.
    label -- legend label for the plotted series.

    Distances with no tests are plotted as 0 with a zero-width error bar.
    """
    flat_tests = concat(tests)
    if q is None:
        q = fdr(flat_tests)
    flat_ds = concat([[j - i for (i, coli), (j, colj)
                       in choose2(list(enumerate(transpose(motif))))]
                      for motif in motifs])

    def binom_ci(xs):
        """return width of error bar"""
        bs_means = sorted([mean(bs(xs)) for x in range(1000)])
        mu = mean(xs)
        # With 1000 sorted bootstrap means, indices 25 and 975 bracket the
        # central 95% of the resampled values.
        return (mu - bs_means[25], bs_means[975] - mu)

    tests_by_dist = [[t <= q for t, d in zip(flat_tests, flat_ds) if d == i]
                     for i in range(1, 20)]
    mean_vals = [mean(xs) if xs else 0 for xs in tests_by_dist]
    cis = [binom_ci(xs) if xs else (0, 0) for xs in tests_by_dist]
    plt.errorbar(range(1,20),
                 mean_vals,yerr=transpose(cis),label=label,capthick=1)
    plt.xlabel("Distance (bp)",fontsize="large")
    plt.ylabel("Proportion of Significant Correlations",fontsize="large")
    plt.legend()
Exemplo n.º 6
0
def make_correlation_structure_by_cluster_figure():
    """Plot significant-correlation proportions grouped by cluster count.

    Every motif in the module-level ``euk_motifs`` is clustered three times
    with ``cluster_motif``.  The rounded mean of the three result lengths is
    used to group motifs, and for each group size from 1 to 5 the matching
    tests and motifs are passed to ``analyze_mi_tests2`` with the group size
    as label and the threshold computed over all of ``euk_tests``.
    """
    from motif_clustering import cluster_motif
    q = fdr(concat(euk_tests))
    clustering_runs = []
    for _ in range(3):
        clustering_runs.append([cluster_motif(m) for m in tqdm(euk_motifs)])
    plt.close() # get rid of output from cluster_motif
    lengths_per_run = [[len(clusters) for clusters in run]
                       for run in clustering_runs]
    mean_lens = [round(mean(per_motif))
                 for per_motif in transpose(lengths_per_run)]
    jss = [indices_where(mean_lens, lambda x: x == size)
           for size in range(1, 5 + 1)]
    for position, js in tqdm(enumerate(jss)):
        analyze_mi_tests2(rslice(euk_tests, js),
                          rslice(euk_motifs, js),
                          label=str(position + 1),
                          q=q)
Exemplo n.º 7
0
def maxent_motif(N,L,desired_ic,tolerance=10**-10,beta=None,verbose=False, A=4):
    """sample motif from max ent distribution with mean desired_ic"""
    # first we adjust the desired ic upwards so that when motif_ic is
    # called with 1st order correction, we get the desired ic.
    if beta is None:
        if verbose:
            print "finding beta"
        correction_per_col = (A-1)/(2*log(2)*N)
        desired_ic += L * correction_per_col
        beta = find_beta_for_mean_motif_ic(N,L,desired_ic,tolerance=tolerance,verbose=verbose, A=A)
    ps = count_ps_from_beta(N, beta, A=A)
    count_sampler = inverse_cdf_sampler(enumerate_counts(N, A), ps)
    counts = [count_sampler() for i in range(L)]
    cols = [sample_col_from_count(count, A=A) for count in counts]
    return map(lambda site:"".join(site),transpose(cols))
Exemplo n.º 8
0
def uniform_motif(N,L,desired_ic,epsilon=0.1,beta=None,ps=None,count_sampler=None,verbose=False):
    if verbose:  print "uniform motif accept reject:",N,L,desired_ic,beta
    correction_per_col = 3/(2*log(2)*N)
    desired_ic_for_beta = desired_ic + L * correction_per_col
    if desired_ic_for_beta == 2*L: # if we reach the upper limit, things break down
        cols = [sample_col_from_count((0,0,0,N)) for _ in range(L)]
        motif_p = map(lambda site:"".join(site),transpose(cols))
        return motif_p
    if beta is None:
        beta = find_beta_for_mean_motif_ic(N,L,desired_ic_for_beta)
        if verbose:
            print "beta:",beta
    if ps is None:
        ps = count_ps_from_beta(N,beta)
    if count_sampler is None:
        count_sampler = inverse_cdf_sampler(enumerate_counts(N),ps)
    def rQ_raw():
        counts = [count_sampler() for i in range(L)]
        cols = [sample_col_from_count(count) for count in counts]
        motif_p = map(lambda site:"".join(site),transpose(cols))
        return motif_p
    def rQ():
        return sample_until(lambda M:inrange(M,desired_ic,epsilon),rQ_raw,1,progress_bar=False)[0]
    def dQhat(motif):
        return exp(beta*motif_ic(motif))
    Imin = desired_ic - epsilon
    Imax = desired_ic + epsilon
    log_M = -beta*Imin
    if verbose: print "Imin, Imax, log_M:",Imin, Imax, log_M
    def dQ(motif):
        return exp(beta*motif_ic(motif) + log_M)
    def AR(motif):
        return 1.0/dQ(motif)
    #M = exp(-beta*(desired_ic - epsilon)) # which ic? +/- correction
    trials = 0
    while True:
        trials +=1
        motif = rQ()
        r = random.random()
        if r < AR(motif):
            return motif
        if verbose and trials % 100 == 0:
            print trials, AR(motif)
Exemplo n.º 9
0
 def rQ_raw():
     """Build one unfiltered proposal motif from the enclosing sampler.

     Draws ``L`` count vectors with ``count_sampler``, converts each into a
     column, and joins the transposed columns into strings.
     """
     drawn = [count_sampler() for _ in range(L)]
     columns = [sample_col_from_count(c) for c in drawn]
     return map("".join, transpose(columns))
Exemplo n.º 10
0
def motif_mi_distances(motif, trials=1000):
    """Return index distances of column pairs that pass ``mi_test_cols``.

    The result is ``(distances, L)``: ``distances`` holds ``j - i`` for every
    column pair for which ``mi_test_cols`` is truthy, and ``L`` is the number
    of columns.  ``trials`` is accepted but not used here.
    """
    columns = transpose(motif)
    num_columns = len(columns)
    distances = []
    for (left, left_col), (right, right_col) in choose2(list(enumerate(columns))):
        if mi_test_cols(left_col, right_col):
            distances.append(right - left)
    return (distances, num_columns)
Exemplo n.º 11
0
 def rQ_raw():
     """Produce a raw proposal: one sampled column per motif position.

     Uses ``count_sampler`` and ``L`` from the enclosing scope.  The columns
     are transposed and each resulting row is joined into a string.
     """
     per_position_counts = [count_sampler() for _ in range(L)]
     sampled_cols = [sample_col_from_count(cnt) for cnt in per_position_counts]
     return map("".join, transpose(sampled_cols))
Exemplo n.º 12
0
def motif_mi_col_test(motif, trials=1000):
    """Return the fraction of column pairs for which ``mi_test_cols`` passes.

    The per-pair results are summed and divided by the number of pairs from
    ``choose2``.  ``trials`` is accepted but not used here.
    """
    columns = transpose(motif)
    passed = sum(mi_test_cols(first, second)
                 for first, second in choose2(columns))
    pair_total = float(len(choose2(columns)))
    return passed / pair_total
Exemplo n.º 13
0
def _count_canonical_digrams(tests_by_motif, motifs, q):
    """Tally canonical digrams over all, adjacent and significant pairs.

    Each digram ``(bi, bj)`` is replaced by ``tuple(wc((bi, bj)))`` when that
    compares smaller, so a digram and its ``wc`` image share one key.

    Returns ``(all_counts, adj_counts, corr_counts)`` as ``defaultdict(int)``:
    counts over every column pair, over pairs whose indices differ by one,
    and over pairs whose test value is at or below ``q``.
    """
    all_counts = defaultdict(int)
    adj_counts = defaultdict(int)
    corr_counts = defaultdict(int)
    for tests, motif in tqdm(zip(tests_by_motif, motifs)):
        col_pairs = choose2(list(enumerate(transpose(motif))))
        for test, ((i, coli), (j, colj)) in zip(tests, col_pairs):
            for bi, bj in transpose((coli, colj)):
                rev_comp = tuple(wc((bi, bj)))
                if (bi, bj) > rev_comp:
                    bi, bj = rev_comp
                all_counts[(bi, bj)] += 1
                if j == i + 1:
                    adj_counts[(bi, bj)] += 1
                if test <= q:
                    corr_counts[(bi, bj)] += 1
    return all_counts, adj_counts, corr_counts


def _digram_freqs_with_yerr(counts, canonical_digrams):
    """Return (frequencies, error-bar half-widths) in canonical order.

    The half-width is ``1.96 * sqrt(p * (1 - p) / N)`` with ``N`` the total
    count, i.e. 1.96 standard errors of a proportion.
    """
    total = float(sum(counts.values()))
    ps = normalize([counts[dg] for dg in canonical_digrams])
    yerr = [1.96*sqrt(1.0/total*p*(1-p)) for p in ps]
    return ps, yerr


def _plot_canonical_digram_panel(ax, canonical_digrams, palette,
                                 all_stats, adj_stats, corr_stats,
                                 ylabel, ymax):
    """Draw the three grouped bar series for one panel on the current axes.

    Each ``*_stats`` argument is a ``(frequencies, yerr)`` pair.  ``ax`` must
    be the current axes, because the bars are drawn through ``plt``.
    """
    positions = range(len(canonical_digrams))
    ps, yerr = all_stats
    adj_ps, adj_yerr = adj_stats
    corr_ps, corr_yerr = corr_stats
    plt.bar([x-0.2 for x in positions], ps, label="All Column-Pairs",
            width=0.2, yerr=yerr, color=palette[0])
    plt.bar([x for x in positions], adj_ps, label="Adj. Column-Pairs",
            width=0.2, yerr=adj_yerr, color=palette[1])
    # NOTE(review): this series counts every pair at or below the threshold,
    # not only adjacent ones, although the label says "Adj." -- confirm the
    # intended wording before changing the label.
    plt.bar([x+0.2 for x in positions], corr_ps, alpha=1,
            capstyle='butt', label="Corr. Adj. Column-Pairs", width=0.2,
            yerr=corr_yerr, color=palette[3])
    ax.set_xticks([x for x in positions])
    ax.set_xticklabels(["".join(dg) for dg in canonical_digrams],
                       fontsize='large')
    plt.xlim(-0.5,10.5)
    plt.ylim(0, ymax)
    plt.ylabel(ylabel, fontsize='large')
    plt.legend(loc='upper right')


def analyze_correlated_digrams_canonical(prok_tests, euk_tests, filename=None):
    """Plot canonical digram frequencies for both motif collections.

    For the prokaryotic and the eukaryotic collection, digram frequencies are
    computed over all column pairs, adjacent column pairs, and column pairs
    whose test value is at or below ``fdr(concat(tests))``.  They are drawn
    as grouped bars with error bars in two stacked subplots.

    The motifs are read from the module-level names ``prok_motifs`` and
    ``euk_motifs``.  ``filename`` is forwarded to ``maybesave``.
    """
    digrams = [(b1,b2) for b1 in "ACGT" for b2 in "ACGT"]
    canonical_digrams = sorted(list(set([min(dg,tuple(wc(dg))) for dg in digrams])))
    prok_q = fdr(concat(prok_tests))
    euk_q = fdr(concat(euk_tests))

    prok_all, prok_adj, prok_corr = _count_canonical_digrams(
        prok_tests, prok_motifs, prok_q)
    prok_series = [_digram_freqs_with_yerr(counts, canonical_digrams)
                   for counts in (prok_all, prok_adj, prok_corr)]

    euk_all, euk_adj, euk_corr = _count_canonical_digrams(
        euk_tests, euk_motifs, euk_q)
    euk_series = [_digram_freqs_with_yerr(counts, canonical_digrams)
                  for counts in (euk_all, euk_adj, euk_corr)]

    palette = sns.cubehelix_palette(4)
    ax = plt.subplot(211)
    _plot_canonical_digram_panel(ax, canonical_digrams, palette,
                                 prok_series[0], prok_series[1],
                                 prok_series[2],
                                 "Prokaryotic Frequency", 0.3)

    ax2 = plt.subplot(212)
    _plot_canonical_digram_panel(ax2, canonical_digrams, palette,
                                 euk_series[0], euk_series[1],
                                 euk_series[2],
                                 "Eukaryotic Frequency", 0.2)
    maybesave(filename)
Exemplo n.º 14
0
 def sample():
     """Draw one motif using the enclosing ``count_sampler``, ``L`` and ``A``.

     One count vector is drawn per column, each is converted to a column,
     and the transposed columns are joined into strings.
     """
     drawn_counts = [count_sampler() for _ in range(L)]
     columns = [sample_col_from_count(c, A=A) for c in drawn_counts]
     return map("".join, transpose(columns))
Exemplo n.º 15
0
def motif_test_cols(motif):
    """Return ``mi_test_cols(colA, colB, alpha=None)`` for every column pair.

    Pairs are taken from ``choose2`` over the transposed motif, and results
    are returned in that order.
    """
    results = []
    for first_col, second_col in choose2(transpose(motif)):
        results.append(mi_test_cols(first_col, second_col, alpha=None))
    return results
Exemplo n.º 16
0
def motif_mi_dist(motif):
    """Return ``dna_mi(colA, colB)`` for every pair of motif columns.

    Pairs are taken from ``choose2`` over the transposed motif, and results
    are returned in that order.
    """
    mi_values = []
    for first_col, second_col in choose2(transpose(motif)):
        mi_values.append(dna_mi(first_col, second_col))
    return mi_values
Exemplo n.º 17
0
 def sample():
     """Return one sampled motif as joined rows of the sampled columns.

     Relies on ``count_sampler``, ``L`` and ``A`` from the enclosing scope.
     """
     per_column_counts = [count_sampler() for _ in range(L)]
     sampled_cols = [sample_col_from_count(cnt, A=A)
                     for cnt in per_column_counts]
     return map("".join, transpose(sampled_cols))