示例#1
0
        # poisson binomial model
        footprint_pbm, gamma, pi, parameters = poisson_binomial_model(boundreads, model=model, restarts=1)
        logodds_pbm = logposteriorodds_poissonbinomial(undecidedreads, gamma, pi, parameters)
        logodds_pbm[logodds_pbm>=MAX] = logodds_pbm[logodds_pbm<MAX].max()
        logodds_pbm[logodds_pbm==-np.inf] = logodds_pbm[logodds_pbm!=-np.inf].min()
        print "learned models ..."

        Rmult = stats.pearsonr(logodds_mult, np.sqrt(chipreads))
        Rpbm = stats.pearsonr(logodds_pbm, np.sqrt(chipreads))
        R = stats.pearsonr(logodds_mult, logodds_pbm)
        print Rmult, Rpbm, R
 
        figure = viz.plot.figure()
        subplot = figure.add_subplot(111)
        subplot.scatter(logodds_mult, np.sqrt(chipreads), s=5, marker='.')
        handle.savefig(figure)

        figure = viz.plot.figure()
        subplot = figure.add_subplot(111)
        subplot.scatter(logodds_pbm, np.sqrt(chipreads), s=5, marker='.')
        handle.savefig(figure)

        if model=='modelB':
            footprints = (footprint_mult, footprint_pbm)
            figure = viz.plot_footprint(footprints, ['multinomial','poisson_binomial'], motif=motif, title='%d bp'%width)
            handle.savefig(figure)

    handle.close()
    dnaseobj.close()
        footprint_pbm, gamma, pi, parameters = poisson_binomial_model(boundreads, model=model, restarts=1)
        logodds_pbm = logposteriorodds_poissonbinomial(undecidedreads, gamma, pi, parameters)
        logodds_pbm[logodds_pbm >= MAX] = logodds_pbm[logodds_pbm < MAX].max()
        logodds_pbm[logodds_pbm == -np.inf] = logodds_pbm[logodds_pbm != -np.inf].min()
        print "learned models ..."

        Rmult = stats.pearsonr(logodds_mult, np.sqrt(chipreads))
        Rpbm = stats.pearsonr(logodds_pbm, np.sqrt(chipreads))
        R = stats.pearsonr(logodds_mult, logodds_pbm)
        print Rmult, Rpbm, R

        figure = viz.plot.figure()
        subplot = figure.add_subplot(111)
        subplot.scatter(logodds_mult, np.sqrt(chipreads), s=5, marker=".")
        handle.savefig(figure)

        figure = viz.plot.figure()
        subplot = figure.add_subplot(111)
        subplot.scatter(logodds_pbm, np.sqrt(chipreads), s=5, marker=".")
        handle.savefig(figure)

        if model == "modelB":
            footprints = (footprint_mult, footprint_pbm)
            figure = viz.plot_footprint(
                footprints, ["multinomial", "poisson_binomial"], motif=motif, title="%d bp" % width
            )
            handle.savefig(figure)

    handle.close()
    dnaseobj.close()
示例#3
0
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in binary mode and is closed even if unpickling fails.
    NOTE(review): unpickling executes code embedded in the file; only point
    this at trusted cache files.
    """
    with open(path, 'rb') as handle:
        return cPickle.load(handle)


def _finite_logodds(posterior):
    """Return ``log(posterior[:, 1] / posterior[:, 0])`` with infinities clipped.

    Entries equal to +inf are replaced by the largest remaining value and
    entries equal to -inf by the smallest remaining value.
    """
    logodds = np.log(posterior[:, 1] / posterior[:, 0])
    logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
    logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
    return logodds


def plotmodel(pwmid, sample=None, pwmbase='transfac'):
    """Write footprint, AUC and log-odds correlation figures for one motif.

    Loads the cached CentipedePBM fits ('modelA' and 'modelB') and the cached
    Centipede fits (plain, damped, no-footprint) for ``pwmid``, relates their
    posterior log odds to ChIP-seq and control read counts, and writes three
    files under ``<projpath>/fig``:

    * ``stats_short_*.txt``       -- negative-binomial means and one line of
      correlation/AUC statistics per model,
    * ``footprint_short_*.pdf``   -- the ``viz.plot_footprint`` and
      ``viz.plot_auc`` figures plus a gamma/pi scatter of the two PBM models,
    * ``logoddsCorr_short_*.pdf`` -- one ``viz.plot_correlation`` page per
      model and one for total DNase reads.

    Relies on names defined outside this function: ``L``, ``projpath``,
    ``loadutils``, ``utils``, ``viz``, ``stats``, ``np``, ``cPickle`` and
    ``compute_chip_auc``.

    Args:
        pwmid: motif accession; matched against each PWM's ``'AC'`` field and
            used as key into ``loadutils.factormap`` / ``controlmap``.
        sample: sample name forwarded to ``loadutils.Dnase`` and embedded in
            file names; ``None`` selects the "combined" PBM cache.
        pwmbase: ``'transfac'`` or ``'selex'``; selects the PWM collection.

    Raises:
        ValueError: if ``pwmbase`` is not one of the two supported values.
    """
    import centipede_pbm as centipede
    from matplotlib.backends.backend_pdf import PdfPages

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        # Fail before any file is touched instead of hitting an unbound
        # ``pwms`` after most of the work has been done.
        raise ValueError(
            "unknown pwmbase %r; expected 'transfac' or 'selex'" % (pwmbase,))

    pbm_models = ['modelA', 'modelB']
    meanfootprints = []
    stdfootprints = []
    Logodds = []

    # Peak calls: keep the first three columns (chrom, start, end) of every
    # non-blank line that falls on one of the first 22 chromosomes.
    peakfile = (
        '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed' %
        loadutils.factormap[pwmid])
    with open(peakfile, 'r') as handle:
        calls = [line.split()[:3] for line in handle]
    autosomes = utils.chromosomes[:22]
    macs = dict((chrom, []) for chrom in autosomes)
    for call in calls:
        if call and call[0] in autosomes:
            macs[call[0]].append([int(call[1]), int(call[2])])

    if sample is None:
        statsfile = "%s/fig/stats_short_%s.txt" % (projpath, pwmid)
    else:
        statsfile = "%s/fig/stats_short_%s_%s.txt" % (projpath, pwmid, sample)

    pis = []
    gammas = []
    cascade = None  # built once, from the first PBM model's locations
    outhandle = open(statsfile, 'w')
    try:
        for model in pbm_models:
            if sample is None:
                cachefile = ("%s/cache/combined/pbmcentipede_%s_short_%s.pkl" %
                             (projpath, model, pwmid))
            else:
                cachefile = (
                    "%s/cache/separate/pbmcentipede_%s_short_%s_%s.pkl" %
                    (projpath, model, pwmid, sample))
            output = _load_pickle(cachefile)
            footparams = output['footprint'][0]
            alpha, tau = output['negbin'][0]
            posterior = output['posterior'][0]
            Logodds.append(_finite_logodds(posterior))
            means = alpha * (1 - tau) / tau
            outhandle.write('%.2f %.2f\n' % (means[0], means[1]))

            if cascade is None:
                locs_tolearn = output['locations']
                dnaseobj = loadutils.Dnase(sample=sample)
                try:
                    dnasereads, _, _ = dnaseobj.getreads(
                        locs_tolearn, width=max([200, L // 2]))
                    if L < 400:
                        # Keep L/2 columns centred on column 100 and on
                        # column 300 -- presumably the two strand halves of
                        # a 200 bp window; TODO confirm against getreads.
                        quarter = L // 4
                        reads = np.hstack(
                            (dnasereads[:, 100 - quarter:100 + quarter],
                             dnasereads[:, 300 - quarter:300 + quarter]))
                    else:
                        reads = dnasereads
                    dnasereads = dnasereads.sum(1)
                finally:
                    dnaseobj.close()

                cascade = centipede.Cascade(L)
                cascade.setreads(reads)
                del reads

            # The two models store their parameters at different offsets and
            # pass a different extra parameter to the estimator.
            if model == 'modelA':
                gamma, pi = footparams[0], footparams[1]
                extra = {'B': footparams[2]}
            elif model == 'modelB':
                gamma, pi = footparams[1], footparams[2]
                extra = {'mu': footparams[3]}
            gammas.append(gamma)
            if isinstance(pi, centipede.Pi):
                pi = pi.estim
            pis.append(pi)
            M1, M2 = centipede.bayes_optimal_estimator(
                cascade, posterior, pi, model=model, **extra)
            meanfootprints.append(M1.inverse_transform())
            # The second moment (M2) is not plotted; no error band is drawn.
            stdfootprints.append(None)

        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
        controlobj.close()

        sqrt_dnase = np.sqrt(dnasereads)
        sqrt_chip = np.sqrt(chipreads)
        sqrt_control = np.sqrt(controlreads)
        corrC = stats.pearsonr(sqrt_dnase, sqrt_chip)
        corrD = stats.pearsonr(sqrt_dnase, sqrt_control)

        # Baseline Centipede fits; the no-footprint variant contributes log
        # odds only.
        # NOTE(review): these paths are formatted with ``sample`` even when
        # it is None (giving "..._None.pkl") -- confirm that is intended.
        baselines = (
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_short_%s_%s.pkl",
             True),
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_damped_short_%s_%s.pkl",
             True),
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_nofoot_short_%s_%s.pkl",
             False),
        )
        for template, has_footprint in baselines:
            output = _load_pickle(template % (pwmid, sample))
            Logodds.append(_finite_logodds(output['posterior'][0]))
            if has_footprint:
                meanfootprints.append(output['footprint'][0])
                stdfootprints.append(None)

        key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]
        if sample is None:
            title = pwms[key]['NA']
            footprintfile = "%s/fig/footprint_short_%s.pdf" % (projpath, pwmid)
            corrfile = "%s/fig/logoddsCorr_short_%s.pdf" % (projpath, pwmid)
        else:
            title = "%s / %s" % (pwms[key]['NA'], sample)
            footprintfile = "%s/fig/footprint_short_%s_%s.pdf" % (
                projpath, pwmid, sample)
            corrfile = "%s/fig/logoddsCorr_short_%s_%s.pdf" % (
                projpath, pwmid, sample)

        labels = [
            'CentipedePBM_M1', 'CentipedePBM_M2', 'Centipede',
            'CentipedeDamped'
        ]

        # plot footprints
        pdfhandle = PdfPages(footprintfile)
        try:
            figure = viz.plot_footprint(meanfootprints,
                                        labels=labels,
                                        stderr=stdfootprints,
                                        motif=pwms[key]['motif'],
                                        title=title)
            pdfhandle.savefig(figure)

            # The no-footprint model has log odds but no footprint, so its
            # label is added only after the footprint page.
            labels.append('CentipedeNoFoot')
            _, _, positive, negative = compute_chip_auc(
                chipreads, controlreads, Logodds[0], macs, locs_tolearn)
            figure = viz.plot_auc(Logodds,
                                  positive,
                                  negative,
                                  labels=labels,
                                  title=title)
            pdfhandle.savefig(figure)

            # Scatter the gamma and pi values of the two PBM models against
            # each other; marker area halves with each successive index.
            T = pis[0].size
            figure = viz.plot.figure()
            subplot = figure.add_subplot(111)
            for i in xrange(T):
                first = (i == 0)  # only the first marker gets a legend entry
                subplot.scatter(gammas[0].value[i],
                                gammas[1].value[i],
                                s=2**(T - i),
                                marker='o',
                                color=viz.colors[1],
                                label='gamma' if first else '_nolabel_',
                                alpha=0.5)
                subplot.scatter(pis[0][i],
                                pis[1][i],
                                s=2**(T - i),
                                marker='o',
                                color=viz.colors[0],
                                label='pi' if first else '_nolabel_',
                                alpha=0.5)
            xmin = min([pis[0].min(), pis[1].min()]) - 0.05
            xmax = max([pis[0].max(), pis[1].max()]) + 0.05
            subplot.axis([xmin, xmax, xmin, xmax])
            subplot.set_xlabel('PBM_M1')
            subplot.set_ylabel('PBM_M2')
            legend = subplot.legend(loc=1)
            for text in legend.texts:
                text.set_fontsize('8')
            legend.set_frame_on(False)
            pdfhandle.savefig(figure)
        finally:
            pdfhandle.close()

        pdfhandle = PdfPages(corrfile)
        try:
            lo = 0  # log-odds threshold for the restricted correlations
            for logodds, label in zip(Logodds, labels):
                auc, tpr, positive, negative = compute_chip_auc(
                    chipreads, controlreads, logodds, macs, locs_tolearn)
                above = logodds > lo
                corrA = stats.pearsonr(logodds, sqrt_chip)
                corrB = stats.pearsonr(logodds, sqrt_control)
                corra = stats.pearsonr(logodds[above], sqrt_chip[above])
                corrb = stats.pearsonr(logodds[above], sqrt_control[above])
                corrc = stats.pearsonr(sqrt_dnase[above], sqrt_chip[above])
                corrd = stats.pearsonr(sqrt_dnase[above], sqrt_control[above])
                towrite = [
                    pwmid, label, corrA, corrB, corrC, corrD, corra, corrb,
                    corrc, corrd, auc, tpr, logodds.size,
                    (logodds > np.log(99)).sum()
                ]
                outhandle.write(' '.join(map(str, towrite)) + '\n')
                figure = viz.plot_correlation(sqrt_chip, logodds, title=label)
                pdfhandle.savefig(figure)

            figure = viz.plot_correlation(sqrt_chip,
                                          sqrt_dnase,
                                          xlabel='sqrt(dnase reads)',
                                          title='Total Dnase reads')
            pdfhandle.savefig(figure)
        finally:
            pdfhandle.close()
    finally:
        outhandle.close()
示例#4
0
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in binary mode and is closed even if unpickling fails.
    NOTE(review): unpickling executes code embedded in the file; only point
    this at trusted cache files.
    """
    with open(path, 'rb') as handle:
        return cPickle.load(handle)


def _finite_logodds(posterior):
    """Return ``log(posterior[:, 1] / posterior[:, 0])`` with infinities clipped.

    Entries equal to +inf are replaced by the largest remaining value and
    entries equal to -inf by the smallest remaining value.
    """
    logodds = np.log(posterior[:, 1] / posterior[:, 0])
    logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
    logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
    return logodds


def plotmodel(pwmid, sample=None, pwmbase='transfac'):
    """Write footprint, AUC and log-odds correlation figures for one motif.

    Loads the cached CentipedePBM fits ('modelA' and 'modelB') and the cached
    Centipede fits (plain, damped, no-footprint) for ``pwmid``, relates their
    posterior log odds to ChIP-seq and control read counts, and writes three
    files under ``<projpath>/fig``:

    * ``stats_short_*.txt``       -- negative-binomial means and one line of
      correlation/AUC statistics per model,
    * ``footprint_short_*.pdf``   -- the ``viz.plot_footprint`` and
      ``viz.plot_auc`` figures plus a gamma/pi scatter of the two PBM models,
    * ``logoddsCorr_short_*.pdf`` -- one ``viz.plot_correlation`` page per
      model and one for total DNase reads.

    Relies on names defined outside this function: ``L``, ``projpath``,
    ``loadutils``, ``utils``, ``viz``, ``stats``, ``np``, ``cPickle`` and
    ``compute_chip_auc``.

    Args:
        pwmid: motif accession; matched against each PWM's ``'AC'`` field and
            used as key into ``loadutils.factormap`` / ``controlmap``.
        sample: sample name forwarded to ``loadutils.Dnase`` and embedded in
            file names; ``None`` selects the "combined" PBM cache.
        pwmbase: ``'transfac'`` or ``'selex'``; selects the PWM collection.

    Raises:
        ValueError: if ``pwmbase`` is not one of the two supported values.
    """
    import centipede_pbm as centipede
    from matplotlib.backends.backend_pdf import PdfPages

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        # Fail before any file is touched instead of hitting an unbound
        # ``pwms`` after most of the work has been done.
        raise ValueError(
            "unknown pwmbase %r; expected 'transfac' or 'selex'" % (pwmbase,))

    pbm_models = ['modelA', 'modelB']
    meanfootprints = []
    stdfootprints = []
    Logodds = []

    # Peak calls: keep the first three columns (chrom, start, end) of every
    # non-blank line that falls on one of the first 22 chromosomes.
    peakfile = (
        '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed' %
        loadutils.factormap[pwmid])
    with open(peakfile, 'r') as handle:
        calls = [line.split()[:3] for line in handle]
    autosomes = utils.chromosomes[:22]
    macs = dict((chrom, []) for chrom in autosomes)
    for call in calls:
        if call and call[0] in autosomes:
            macs[call[0]].append([int(call[1]), int(call[2])])

    if sample is None:
        statsfile = "%s/fig/stats_short_%s.txt" % (projpath, pwmid)
    else:
        statsfile = "%s/fig/stats_short_%s_%s.txt" % (projpath, pwmid, sample)

    pis = []
    gammas = []
    cascade = None  # built once, from the first PBM model's locations
    outhandle = open(statsfile, 'w')
    try:
        for model in pbm_models:
            if sample is None:
                cachefile = ("%s/cache/combined/pbmcentipede_%s_short_%s.pkl" %
                             (projpath, model, pwmid))
            else:
                cachefile = (
                    "%s/cache/separate/pbmcentipede_%s_short_%s_%s.pkl" %
                    (projpath, model, pwmid, sample))
            output = _load_pickle(cachefile)
            footparams = output['footprint'][0]
            alpha, tau = output['negbin'][0]
            posterior = output['posterior'][0]
            Logodds.append(_finite_logodds(posterior))
            means = alpha * (1 - tau) / tau
            outhandle.write('%.2f %.2f\n' % (means[0], means[1]))

            if cascade is None:
                locs_tolearn = output['locations']
                dnaseobj = loadutils.Dnase(sample=sample)
                try:
                    dnasereads, _, _ = dnaseobj.getreads(
                        locs_tolearn, width=max([200, L // 2]))
                    if L < 400:
                        # Keep L/2 columns centred on column 100 and on
                        # column 300 -- presumably the two strand halves of
                        # a 200 bp window; TODO confirm against getreads.
                        quarter = L // 4
                        reads = np.hstack(
                            (dnasereads[:, 100 - quarter:100 + quarter],
                             dnasereads[:, 300 - quarter:300 + quarter]))
                    else:
                        reads = dnasereads
                    dnasereads = dnasereads.sum(1)
                finally:
                    dnaseobj.close()

                cascade = centipede.Cascade(L)
                cascade.setreads(reads)
                del reads

            # The two models store their parameters at different offsets and
            # pass a different extra parameter to the estimator.
            if model == 'modelA':
                gamma, pi = footparams[0], footparams[1]
                extra = {'B': footparams[2]}
            elif model == 'modelB':
                gamma, pi = footparams[1], footparams[2]
                extra = {'mu': footparams[3]}
            gammas.append(gamma)
            if isinstance(pi, centipede.Pi):
                pi = pi.estim
            pis.append(pi)
            M1, M2 = centipede.bayes_optimal_estimator(
                cascade, posterior, pi, model=model, **extra)
            meanfootprints.append(M1.inverse_transform())
            # The second moment (M2) is not plotted; no error band is drawn.
            stdfootprints.append(None)

        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
        controlobj.close()

        sqrt_dnase = np.sqrt(dnasereads)
        sqrt_chip = np.sqrt(chipreads)
        sqrt_control = np.sqrt(controlreads)
        corrC = stats.pearsonr(sqrt_dnase, sqrt_chip)
        corrD = stats.pearsonr(sqrt_dnase, sqrt_control)

        # Baseline Centipede fits; the no-footprint variant contributes log
        # odds only.
        # NOTE(review): these paths are formatted with ``sample`` even when
        # it is None (giving "..._None.pkl") -- confirm that is intended.
        baselines = (
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_short_%s_%s.pkl",
             True),
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_damped_short_%s_%s.pkl",
             True),
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_nofoot_short_%s_%s.pkl",
             False),
        )
        for template, has_footprint in baselines:
            output = _load_pickle(template % (pwmid, sample))
            Logodds.append(_finite_logodds(output['posterior'][0]))
            if has_footprint:
                meanfootprints.append(output['footprint'][0])
                stdfootprints.append(None)

        key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]
        if sample is None:
            title = pwms[key]['NA']
            footprintfile = "%s/fig/footprint_short_%s.pdf" % (projpath, pwmid)
            corrfile = "%s/fig/logoddsCorr_short_%s.pdf" % (projpath, pwmid)
        else:
            title = "%s / %s" % (pwms[key]['NA'], sample)
            footprintfile = "%s/fig/footprint_short_%s_%s.pdf" % (
                projpath, pwmid, sample)
            corrfile = "%s/fig/logoddsCorr_short_%s_%s.pdf" % (
                projpath, pwmid, sample)

        labels = [
            'CentipedePBM_M1', 'CentipedePBM_M2', 'Centipede',
            'CentipedeDamped'
        ]

        # plot footprints
        pdfhandle = PdfPages(footprintfile)
        try:
            figure = viz.plot_footprint(meanfootprints,
                                        labels=labels,
                                        stderr=stdfootprints,
                                        motif=pwms[key]['motif'],
                                        title=title)
            pdfhandle.savefig(figure)

            # The no-footprint model has log odds but no footprint, so its
            # label is added only after the footprint page.
            labels.append('CentipedeNoFoot')
            _, _, positive, negative = compute_chip_auc(
                chipreads, controlreads, Logodds[0], macs, locs_tolearn)
            figure = viz.plot_auc(Logodds,
                                  positive,
                                  negative,
                                  labels=labels,
                                  title=title)
            pdfhandle.savefig(figure)

            # Scatter the gamma and pi values of the two PBM models against
            # each other; marker area halves with each successive index.
            T = pis[0].size
            figure = viz.plot.figure()
            subplot = figure.add_subplot(111)
            for i in xrange(T):
                first = (i == 0)  # only the first marker gets a legend entry
                subplot.scatter(gammas[0].value[i],
                                gammas[1].value[i],
                                s=2**(T - i),
                                marker='o',
                                color=viz.colors[1],
                                label='gamma' if first else '_nolabel_',
                                alpha=0.5)
                subplot.scatter(pis[0][i],
                                pis[1][i],
                                s=2**(T - i),
                                marker='o',
                                color=viz.colors[0],
                                label='pi' if first else '_nolabel_',
                                alpha=0.5)
            xmin = min([pis[0].min(), pis[1].min()]) - 0.05
            xmax = max([pis[0].max(), pis[1].max()]) + 0.05
            subplot.axis([xmin, xmax, xmin, xmax])
            subplot.set_xlabel('PBM_M1')
            subplot.set_ylabel('PBM_M2')
            legend = subplot.legend(loc=1)
            for text in legend.texts:
                text.set_fontsize('8')
            legend.set_frame_on(False)
            pdfhandle.savefig(figure)
        finally:
            pdfhandle.close()

        pdfhandle = PdfPages(corrfile)
        try:
            lo = 0  # log-odds threshold for the restricted correlations
            for logodds, label in zip(Logodds, labels):
                auc, tpr, positive, negative = compute_chip_auc(
                    chipreads, controlreads, logodds, macs, locs_tolearn)
                above = logodds > lo
                corrA = stats.pearsonr(logodds, sqrt_chip)
                corrB = stats.pearsonr(logodds, sqrt_control)
                corra = stats.pearsonr(logodds[above], sqrt_chip[above])
                corrb = stats.pearsonr(logodds[above], sqrt_control[above])
                corrc = stats.pearsonr(sqrt_dnase[above], sqrt_chip[above])
                corrd = stats.pearsonr(sqrt_dnase[above], sqrt_control[above])
                towrite = [
                    pwmid, label, corrA, corrB, corrC, corrD, corra, corrb,
                    corrc, corrd, auc, tpr, logodds.size,
                    (logodds > np.log(99)).sum()
                ]
                outhandle.write(' '.join(map(str, towrite)) + '\n')
                figure = viz.plot_correlation(sqrt_chip, logodds, title=label)
                pdfhandle.savefig(figure)

            figure = viz.plot_correlation(sqrt_chip,
                                          sqrt_dnase,
                                          xlabel='sqrt(dnase reads)',
                                          title='Total Dnase reads')
            pdfhandle.savefig(figure)
        finally:
            pdfhandle.close()
    finally:
        outhandle.close()