# poisson binomial model footprint_pbm, gamma, pi, parameters = poisson_binomial_model(boundreads, model=model, restarts=1) logodds_pbm = logposteriorodds_poissonbinomial(undecidedreads, gamma, pi, parameters) logodds_pbm[logodds_pbm>=MAX] = logodds_pbm[logodds_pbm<MAX].max() logodds_pbm[logodds_pbm==-np.inf] = logodds_pbm[logodds_pbm!=-np.inf].min() print "learned models ..." Rmult = stats.pearsonr(logodds_mult, np.sqrt(chipreads)) Rpbm = stats.pearsonr(logodds_pbm, np.sqrt(chipreads)) R = stats.pearsonr(logodds_mult, logodds_pbm) print Rmult, Rpbm, R figure = viz.plot.figure() subplot = figure.add_subplot(111) subplot.scatter(logodds_mult, np.sqrt(chipreads), s=5, marker='.') handle.savefig(figure) figure = viz.plot.figure() subplot = figure.add_subplot(111) subplot.scatter(logodds_pbm, np.sqrt(chipreads), s=5, marker='.') handle.savefig(figure) if model=='modelB': footprints = (footprint_mult, footprint_pbm) figure = viz.plot_footprint(footprints, ['multinomial','poisson_binomial'], motif=motif, title='%d bp'%width) handle.savefig(figure) handle.close() dnaseobj.close()
footprint_pbm, gamma, pi, parameters = poisson_binomial_model(boundreads, model=model, restarts=1) logodds_pbm = logposteriorodds_poissonbinomial(undecidedreads, gamma, pi, parameters) logodds_pbm[logodds_pbm >= MAX] = logodds_pbm[logodds_pbm < MAX].max() logodds_pbm[logodds_pbm == -np.inf] = logodds_pbm[logodds_pbm != -np.inf].min() print "learned models ..." Rmult = stats.pearsonr(logodds_mult, np.sqrt(chipreads)) Rpbm = stats.pearsonr(logodds_pbm, np.sqrt(chipreads)) R = stats.pearsonr(logodds_mult, logodds_pbm) print Rmult, Rpbm, R figure = viz.plot.figure() subplot = figure.add_subplot(111) subplot.scatter(logodds_mult, np.sqrt(chipreads), s=5, marker=".") handle.savefig(figure) figure = viz.plot.figure() subplot = figure.add_subplot(111) subplot.scatter(logodds_pbm, np.sqrt(chipreads), s=5, marker=".") handle.savefig(figure) if model == "modelB": footprints = (footprint_mult, footprint_pbm) figure = viz.plot_footprint( footprints, ["multinomial", "poisson_binomial"], motif=motif, title="%d bp" % width ) handle.savefig(figure) handle.close() dnaseobj.close()
def _clipped_logodds(posterior):
    """Return log(posterior[:, 1] / posterior[:, 0]) with infinities replaced.

    Entries equal to +inf are set to the maximum of the remaining entries and
    entries equal to -inf are set to the minimum of the remaining entries.
    """
    logodds = np.log(posterior[:, 1] / posterior[:, 0])
    logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
    logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
    return logodds


def _load_pickle(path):
    """Unpickle and return the object stored at *path*; the file is always closed."""
    # NOTE: unpickling can execute arbitrary code -- only use on trusted cache files.
    # Binary mode is what pickle expects and is equivalent to 'r' on POSIX.
    with open(path, 'rb') as handle:
        return cPickle.load(handle)


def _read_peak_intervals(path, chromosomes):
    """Group [start, end] pairs from a whitespace-delimited peak file by chromosome.

    Only the first three columns of each line are used.  Blank lines and lines
    whose first column is not in *chromosomes* are skipped.
    """
    intervals = dict((chrom, []) for chrom in chromosomes)
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()[:3]
            if not fields or fields[0] not in intervals:
                continue
            intervals[fields[0]].append([int(fields[1]), int(fields[2])])
    return intervals


def _scatter_model_parameters(gammas, pis):
    """Return a figure scattering modelA against modelB gamma and pi estimates.

    Marker area shrinks as 2**(T - i) with the index i of the parameter, where
    T is the number of entries in pis[0].
    """
    T = pis[0].size
    figure = viz.plot.figure()
    subplot = figure.add_subplot(111)
    for i in xrange(T):
        # Only the first pair gets a real legend label; matplotlib leaves
        # labels that start with an underscore out of the legend.
        subplot.scatter(gammas[0].value[i], gammas[1].value[i], s=2**(T - i),
                        marker='o', color=viz.colors[1],
                        label='gamma' if i == 0 else '_nolabel_', alpha=0.5)
        subplot.scatter(pis[0][i], pis[1][i], s=2**(T - i),
                        marker='o', color=viz.colors[0],
                        label='pi' if i == 0 else '_nolabel_', alpha=0.5)
    xmin = min([pis[0].min(), pis[1].min()]) - 0.05
    xmax = max([pis[0].max(), pis[1].max()]) + 0.05
    subplot.axis([xmin, xmax, xmin, xmax])
    subplot.set_xlabel('PBM_M1')
    subplot.set_ylabel('PBM_M2')
    legend = subplot.legend(loc=1)
    for text in legend.texts:
        text.set_fontsize('8')
    legend.set_frame_on(False)
    return figure


def plotmodel(pwmid, sample=None, pwmbase='transfac'):
    """Plot footprints and log-odds diagnostics for cached fits of one motif.

    Loads the cached 'modelA' and 'modelB' PBM fits plus three cached baseline
    fits (centipede, centipede_damped, centipede_nofoot), then writes:

    * ``<projpath>/fig/stats_short_<pwmid>[_<sample>].txt`` -- one line of
      negative-binomial means per PBM model, then one line of correlations /
      AUC per model;
    * ``<projpath>/fig/footprint_short_<pwmid>[_<sample>].pdf`` -- footprint,
      AUC and parameter-scatter pages;
    * ``<projpath>/fig/logoddsCorr_short_<pwmid>[_<sample>].pdf`` -- one
      correlation page per model plus one for total DNase reads.

    Relies on module-level names defined elsewhere in this file: ``projpath``,
    ``L``, ``loadutils``, ``utils``, ``viz``, ``stats``, ``cPickle``, ``np``
    and ``compute_chip_auc``.

    Args:
        pwmid: motif identifier; matched against the 'AC' field of the PWM
            table and used as key into loadutils.factormap / controlmap.
        sample: sample name, or None to use the 'combined' PBM cache.
        pwmbase: 'transfac' or 'selex'; selects which PWM table is loaded.

    Returns:
        None.

    Raises:
        ValueError: if *pwmbase* is neither 'transfac' nor 'selex'.
    """
    import centipede_pbm as centipede
    from matplotlib.backends.backend_pdf import PdfPages

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        # Previously this fell through and failed much later with a NameError.
        raise ValueError("unknown pwmbase %r; expected 'transfac' or 'selex'" % (pwmbase,))

    models = ['modelA', 'modelB']
    meanfootprints = []
    stdfootprints = []
    Logodds = []
    pis = []
    gammas = []

    macs = _read_peak_intervals(
        '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
        % loadutils.factormap[pwmid],
        utils.chromosomes[:22])

    if sample is None:
        statsfile = "%s/fig/stats_short_%s.txt" % (projpath, pwmid)
    else:
        statsfile = "%s/fig/stats_short_%s_%s.txt" % (projpath, pwmid, sample)

    # The DNase reads are loaded once, on the first model, and shared afterwards.
    cascade = None

    with open(statsfile, 'w') as outhandle:
        for model in models:
            if sample is None:
                fitfile = "%s/cache/combined/pbmcentipede_%s_short_%s.pkl" % (
                    projpath, model, pwmid)
            else:
                fitfile = "%s/cache/separate/pbmcentipede_%s_short_%s_%s.pkl" % (
                    projpath, model, pwmid, sample)
            output = _load_pickle(fitfile)
            footparams = output['footprint'][0]
            alpha, tau = output['negbin'][0]
            posterior = output['posterior'][0]
            Logodds.append(_clipped_logodds(posterior))
            means = alpha * (1 - tau) / tau
            outhandle.write('%.2f %.2f\n' % (means[0], means[1]))

            if cascade is None:
                locs_tolearn = output['locations']
                dnaseobj = loadutils.Dnase(sample=sample)
                dnasereads, _, _ = dnaseobj.getreads(locs_tolearn, width=max([200, L // 2]))
                if L < 400:
                    # Keep L/2 columns centred on column 100 and L/2 centred on
                    # column 300 of the read matrix.  Floor division keeps the
                    # slice bounds integral.
                    reads = np.hstack((dnasereads[:, 100 - L // 4:100 + L // 4],
                                       dnasereads[:, 300 - L // 4:300 + L // 4]))
                else:
                    reads = dnasereads
                dnasereads = dnasereads.sum(1)
                dnaseobj.close()
                cascade = centipede.Cascade(L)
                cascade.setreads(reads)
                del reads

            # The two models store gamma / pi at different positions of the
            # footprint tuple and pass a different extra parameter (B vs. mu).
            if model == 'modelA':
                gamma, pi_param = footparams[0], footparams[1]
                extra = {'B': footparams[2]}
            elif model == 'modelB':
                gamma, pi_param = footparams[1], footparams[2]
                extra = {'mu': footparams[3]}
            gammas.append(gamma)
            pi = pi_param.estim if isinstance(pi_param, centipede.Pi) else pi_param
            pis.append(pi)
            M1, _ = centipede.bayes_optimal_estimator(
                cascade, posterior, pi, model=model, **extra)
            meanfootprints.append(M1.inverse_transform())
            # Standard errors are not plotted for any model.
            stdfootprints.append(None)

        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
        controlobj.close()

        sqrt_chip = np.sqrt(chipreads)
        sqrt_control = np.sqrt(controlreads)
        sqrt_dnase = np.sqrt(dnasereads)
        corrC = stats.pearsonr(sqrt_dnase, sqrt_chip)
        corrD = stats.pearsonr(sqrt_dnase, sqrt_control)

        # NOTE(review): these baseline paths are formatted with `sample` even
        # when it is None (yielding "..._None.pkl") -- confirm that calling
        # with sample=None is actually supported for the baselines.
        baselines = (
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_short_%s_%s.pkl", True),
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_damped_short_%s_%s.pkl", True),
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_nofoot_short_%s_%s.pkl", False),
        )
        for template, has_footprint in baselines:
            output = _load_pickle(template % (pwmid, sample))
            if has_footprint:
                meanfootprints.append(output['footprint'][0])
                stdfootprints.append(None)
            Logodds.append(_clipped_logodds(output['posterior'][0]))

        key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]
        if sample is None:
            title = pwms[key]['NA']
            footprintfile = "%s/fig/footprint_short_%s.pdf" % (projpath, pwmid)
            corrfile = "%s/fig/logoddsCorr_short_%s.pdf" % (projpath, pwmid)
        else:
            title = "%s / %s" % (pwms[key]['NA'], sample)
            footprintfile = "%s/fig/footprint_short_%s_%s.pdf" % (projpath, pwmid, sample)
            corrfile = "%s/fig/logoddsCorr_short_%s_%s.pdf" % (projpath, pwmid, sample)

        labels = ['CentipedePBM_M1', 'CentipedePBM_M2', 'Centipede', 'CentipedeDamped']

        # Footprints, AUC curves and the parameter scatter share one PDF.
        pdfhandle = PdfPages(footprintfile)
        try:
            figure = viz.plot_footprint(meanfootprints, labels=labels,
                                        stderr=stdfootprints,
                                        motif=pwms[key]['motif'], title=title)
            pdfhandle.savefig(figure)

            # The no-footprint baseline has log-odds but no footprint, so its
            # label is added only after the footprint page.
            labels.append('CentipedeNoFoot')
            auc, tpr, positive, negative = compute_chip_auc(
                chipreads, controlreads, Logodds[0], macs, locs_tolearn)
            figure = viz.plot_auc(Logodds, positive, negative, labels=labels, title=title)
            pdfhandle.savefig(figure)

            pdfhandle.savefig(_scatter_model_parameters(gammas, pis))
        finally:
            pdfhandle.close()

        pdfhandle = PdfPages(corrfile)
        try:
            lo = 0
            for logodds, label in zip(Logodds, labels):
                auc, tpr, positive, negative = compute_chip_auc(
                    chipreads, controlreads, logodds, macs, locs_tolearn)
                above = logodds > lo
                corrA = stats.pearsonr(logodds, sqrt_chip)
                corrB = stats.pearsonr(logodds, sqrt_control)
                # Lower-case variants restrict to sites with log-odds above `lo`.
                corra = stats.pearsonr(logodds[above], sqrt_chip[above])
                corrb = stats.pearsonr(logodds[above], sqrt_control[above])
                corrc = stats.pearsonr(sqrt_dnase[above], sqrt_chip[above])
                corrd = stats.pearsonr(sqrt_dnase[above], sqrt_control[above])
                towrite = [
                    pwmid, label, corrA, corrB, corrC, corrD,
                    corra, corrb, corrc, corrd, auc, tpr,
                    logodds.size, (logodds > np.log(99)).sum(),
                ]
                outhandle.write(' '.join(map(str, towrite)) + '\n')
                figure = viz.plot_correlation(sqrt_chip, logodds, title=label)
                pdfhandle.savefig(figure)

            figure = viz.plot_correlation(sqrt_chip, sqrt_dnase,
                                          xlabel='sqrt(dnase reads)',
                                          title='Total Dnase reads')
            pdfhandle.savefig(figure)
        finally:
            pdfhandle.close()
def _clipped_logodds(posterior):
    """Return log(posterior[:, 1] / posterior[:, 0]) with infinities replaced.

    Entries equal to +inf are set to the maximum of the remaining entries and
    entries equal to -inf are set to the minimum of the remaining entries.
    """
    logodds = np.log(posterior[:, 1] / posterior[:, 0])
    logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
    logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
    return logodds


def _load_pickle(path):
    """Unpickle and return the object stored at *path*; the file is always closed."""
    # NOTE: unpickling can execute arbitrary code -- only use on trusted cache files.
    # Binary mode is what pickle expects and is equivalent to 'r' on POSIX.
    with open(path, 'rb') as handle:
        return cPickle.load(handle)


def _read_peak_intervals(path, chromosomes):
    """Group [start, end] pairs from a whitespace-delimited peak file by chromosome.

    Only the first three columns of each line are used.  Blank lines and lines
    whose first column is not in *chromosomes* are skipped.
    """
    intervals = dict((chrom, []) for chrom in chromosomes)
    with open(path, 'r') as handle:
        for line in handle:
            fields = line.split()[:3]
            if not fields or fields[0] not in intervals:
                continue
            intervals[fields[0]].append([int(fields[1]), int(fields[2])])
    return intervals


def _scatter_model_parameters(gammas, pis):
    """Return a figure scattering modelA against modelB gamma and pi estimates.

    Marker area shrinks as 2**(T - i) with the index i of the parameter, where
    T is the number of entries in pis[0].
    """
    T = pis[0].size
    figure = viz.plot.figure()
    subplot = figure.add_subplot(111)
    for i in xrange(T):
        # Only the first pair gets a real legend label; matplotlib leaves
        # labels that start with an underscore out of the legend.
        subplot.scatter(gammas[0].value[i], gammas[1].value[i], s=2**(T - i),
                        marker='o', color=viz.colors[1],
                        label='gamma' if i == 0 else '_nolabel_', alpha=0.5)
        subplot.scatter(pis[0][i], pis[1][i], s=2**(T - i),
                        marker='o', color=viz.colors[0],
                        label='pi' if i == 0 else '_nolabel_', alpha=0.5)
    xmin = min([pis[0].min(), pis[1].min()]) - 0.05
    xmax = max([pis[0].max(), pis[1].max()]) + 0.05
    subplot.axis([xmin, xmax, xmin, xmax])
    subplot.set_xlabel('PBM_M1')
    subplot.set_ylabel('PBM_M2')
    legend = subplot.legend(loc=1)
    for text in legend.texts:
        text.set_fontsize('8')
    legend.set_frame_on(False)
    return figure


def plotmodel(pwmid, sample=None, pwmbase='transfac'):
    """Plot footprints and log-odds diagnostics for cached fits of one motif.

    Loads the cached 'modelA' and 'modelB' PBM fits plus three cached baseline
    fits (centipede, centipede_damped, centipede_nofoot), then writes:

    * ``<projpath>/fig/stats_short_<pwmid>[_<sample>].txt`` -- one line of
      negative-binomial means per PBM model, then one line of correlations /
      AUC per model;
    * ``<projpath>/fig/footprint_short_<pwmid>[_<sample>].pdf`` -- footprint,
      AUC and parameter-scatter pages;
    * ``<projpath>/fig/logoddsCorr_short_<pwmid>[_<sample>].pdf`` -- one
      correlation page per model plus one for total DNase reads.

    Relies on module-level names defined elsewhere in this file: ``projpath``,
    ``L``, ``loadutils``, ``utils``, ``viz``, ``stats``, ``cPickle``, ``np``
    and ``compute_chip_auc``.

    Args:
        pwmid: motif identifier; matched against the 'AC' field of the PWM
            table and used as key into loadutils.factormap / controlmap.
        sample: sample name, or None to use the 'combined' PBM cache.
        pwmbase: 'transfac' or 'selex'; selects which PWM table is loaded.

    Returns:
        None.

    Raises:
        ValueError: if *pwmbase* is neither 'transfac' nor 'selex'.
    """
    import centipede_pbm as centipede
    from matplotlib.backends.backend_pdf import PdfPages

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        # Previously this fell through and failed much later with a NameError.
        raise ValueError("unknown pwmbase %r; expected 'transfac' or 'selex'" % (pwmbase,))

    models = ['modelA', 'modelB']
    meanfootprints = []
    stdfootprints = []
    Logodds = []
    pis = []
    gammas = []

    macs = _read_peak_intervals(
        '/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
        % loadutils.factormap[pwmid],
        utils.chromosomes[:22])

    if sample is None:
        statsfile = "%s/fig/stats_short_%s.txt" % (projpath, pwmid)
    else:
        statsfile = "%s/fig/stats_short_%s_%s.txt" % (projpath, pwmid, sample)

    # The DNase reads are loaded once, on the first model, and shared afterwards.
    cascade = None

    with open(statsfile, 'w') as outhandle:
        for model in models:
            if sample is None:
                fitfile = "%s/cache/combined/pbmcentipede_%s_short_%s.pkl" % (
                    projpath, model, pwmid)
            else:
                fitfile = "%s/cache/separate/pbmcentipede_%s_short_%s_%s.pkl" % (
                    projpath, model, pwmid, sample)
            output = _load_pickle(fitfile)
            footparams = output['footprint'][0]
            alpha, tau = output['negbin'][0]
            posterior = output['posterior'][0]
            Logodds.append(_clipped_logodds(posterior))
            means = alpha * (1 - tau) / tau
            outhandle.write('%.2f %.2f\n' % (means[0], means[1]))

            if cascade is None:
                locs_tolearn = output['locations']
                dnaseobj = loadutils.Dnase(sample=sample)
                dnasereads, _, _ = dnaseobj.getreads(locs_tolearn, width=max([200, L // 2]))
                if L < 400:
                    # Keep L/2 columns centred on column 100 and L/2 centred on
                    # column 300 of the read matrix.  Floor division keeps the
                    # slice bounds integral.
                    reads = np.hstack((dnasereads[:, 100 - L // 4:100 + L // 4],
                                       dnasereads[:, 300 - L // 4:300 + L // 4]))
                else:
                    reads = dnasereads
                dnasereads = dnasereads.sum(1)
                dnaseobj.close()
                cascade = centipede.Cascade(L)
                cascade.setreads(reads)
                del reads

            # The two models store gamma / pi at different positions of the
            # footprint tuple and pass a different extra parameter (B vs. mu).
            if model == 'modelA':
                gamma, pi_param = footparams[0], footparams[1]
                extra = {'B': footparams[2]}
            elif model == 'modelB':
                gamma, pi_param = footparams[1], footparams[2]
                extra = {'mu': footparams[3]}
            gammas.append(gamma)
            pi = pi_param.estim if isinstance(pi_param, centipede.Pi) else pi_param
            pis.append(pi)
            M1, _ = centipede.bayes_optimal_estimator(
                cascade, posterior, pi, model=model, **extra)
            meanfootprints.append(M1.inverse_transform())
            # Standard errors are not plotted for any model.
            stdfootprints.append(None)

        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
        controlobj.close()

        sqrt_chip = np.sqrt(chipreads)
        sqrt_control = np.sqrt(controlreads)
        sqrt_dnase = np.sqrt(dnasereads)
        corrC = stats.pearsonr(sqrt_dnase, sqrt_chip)
        corrD = stats.pearsonr(sqrt_dnase, sqrt_control)

        # NOTE(review): these baseline paths are formatted with `sample` even
        # when it is None (yielding "..._None.pkl") -- confirm that calling
        # with sample=None is actually supported for the baselines.
        baselines = (
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_short_%s_%s.pkl", True),
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_damped_short_%s_%s.pkl", True),
            ("/mnt/lustre/home/anilraj/histmod/cache/separate/centipede_nofoot_short_%s_%s.pkl", False),
        )
        for template, has_footprint in baselines:
            output = _load_pickle(template % (pwmid, sample))
            if has_footprint:
                meanfootprints.append(output['footprint'][0])
                stdfootprints.append(None)
            Logodds.append(_clipped_logodds(output['posterior'][0]))

        key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]
        if sample is None:
            title = pwms[key]['NA']
            footprintfile = "%s/fig/footprint_short_%s.pdf" % (projpath, pwmid)
            corrfile = "%s/fig/logoddsCorr_short_%s.pdf" % (projpath, pwmid)
        else:
            title = "%s / %s" % (pwms[key]['NA'], sample)
            footprintfile = "%s/fig/footprint_short_%s_%s.pdf" % (projpath, pwmid, sample)
            corrfile = "%s/fig/logoddsCorr_short_%s_%s.pdf" % (projpath, pwmid, sample)

        labels = ['CentipedePBM_M1', 'CentipedePBM_M2', 'Centipede', 'CentipedeDamped']

        # Footprints, AUC curves and the parameter scatter share one PDF.
        pdfhandle = PdfPages(footprintfile)
        try:
            figure = viz.plot_footprint(meanfootprints, labels=labels,
                                        stderr=stdfootprints,
                                        motif=pwms[key]['motif'], title=title)
            pdfhandle.savefig(figure)

            # The no-footprint baseline has log-odds but no footprint, so its
            # label is added only after the footprint page.
            labels.append('CentipedeNoFoot')
            auc, tpr, positive, negative = compute_chip_auc(
                chipreads, controlreads, Logodds[0], macs, locs_tolearn)
            figure = viz.plot_auc(Logodds, positive, negative, labels=labels, title=title)
            pdfhandle.savefig(figure)

            pdfhandle.savefig(_scatter_model_parameters(gammas, pis))
        finally:
            pdfhandle.close()

        pdfhandle = PdfPages(corrfile)
        try:
            lo = 0
            for logodds, label in zip(Logodds, labels):
                auc, tpr, positive, negative = compute_chip_auc(
                    chipreads, controlreads, logodds, macs, locs_tolearn)
                above = logodds > lo
                corrA = stats.pearsonr(logodds, sqrt_chip)
                corrB = stats.pearsonr(logodds, sqrt_control)
                # Lower-case variants restrict to sites with log-odds above `lo`.
                corra = stats.pearsonr(logodds[above], sqrt_chip[above])
                corrb = stats.pearsonr(logodds[above], sqrt_control[above])
                corrc = stats.pearsonr(sqrt_dnase[above], sqrt_chip[above])
                corrd = stats.pearsonr(sqrt_dnase[above], sqrt_control[above])
                towrite = [
                    pwmid, label, corrA, corrB, corrC, corrD,
                    corra, corrb, corrc, corrd, auc, tpr,
                    logodds.size, (logodds > np.log(99)).sum(),
                ]
                outhandle.write(' '.join(map(str, towrite)) + '\n')
                figure = viz.plot_correlation(sqrt_chip, logodds, title=label)
                pdfhandle.savefig(figure)

            figure = viz.plot_correlation(sqrt_chip, sqrt_dnase,
                                          xlabel='sqrt(dnase reads)',
                                          title='Total Dnase reads')
            pdfhandle.savefig(figure)
        finally:
            pdfhandle.close()