return None, maxgamma, maxpi, parameters if __name__=="__main__": pwmid = sys.argv[2] sample = 'NA18505' model = sys.argv[1] location_file = "/mnt/lustre/home/anilraj/pbm_dnase_profile/cache/%s_0_short_bound.bed.gz"%(pwmid) handle = loadutils.ZipFile(location_file) locations = handle.read(threshold=11) print pwmid, sample, model print "read in locations ..." if pwmid[0]=='M': pwms = loadutils.transfac_pwms() elif pwmid[0]=='S': pwms = loadutils.selex_pwms() motif = [val['motif'] for val in pwms.itervalues() if val['AC']==pwmid][0] print "selected motif model ..." bound = [loc for loc in locations if int(loc[-1])>50] undecided = [loc for loc in locations if int(loc[-1])>0] dnaseobj = loadutils.Dnase(sample=sample) reads, undecided, ig = dnaseobj.getreads(undecided, remove_outliers=False, width=200) totalreads = reads.sum(1) print "extracted total reads ..." handle = PdfPages('/mnt/lustre/home/anilraj/pbm_dnase_profile/fig/compare_models_%s_%s.pdf'%(model,pwmid)) for width in [64,128,256]: boundreads, ig, ig = dnaseobj.getreads(bound, remove_outliers=True, width=width)
def infer(pwmid, sample, pwm_thresh=8, pwmbase='transfac', chipseq=False):
    """Fit the 'modelC' PBM-Centipede model for one motif and pickle the result.

    Candidate sites are read from the cached locations file with ``pwm_thresh``
    as the initial reader threshold; the threshold is lowered by one until at
    least 100 sites are returned.  At most the 100000 highest-scoring sites are
    kept.  DNase reads at those sites are passed to ``centipede.EM`` and the
    fitted parameters are compared against ChIP-seq reads and peak calls; the
    comparison statistics are printed and the model is pickled under
    ``projpath``.

    Args:
        pwmid: motif accession; matched against the ``'AC'`` field of each PWM.
        sample: sample name, or None / 'Gm12878' / 'Gm12878All' for the
            reference inputs (no per-sample rescoring of sites).
        pwm_thresh: initial PWM threshold used when reading sites.
        pwmbase: 'transfac' or 'selex'; selects the PWM collection.
        chipseq: unused, kept for backward compatibility.  The ChIP-seq reads
            it used to request were overwritten before anything read them.

    Raises:
        ValueError: if ``pwmbase`` is not recognised or the locations file
            contains no lines.
    """
    import centipede_pbm as centipede

    # NOTE: print(...) with a single argument behaves identically as a
    # Python 2 print statement, which is what this file uses.
    model = 'modelC'
    min_sites = 100
    max_sites = 100000
    is_reference = sample in [None, 'Gm12878', 'Gm12878All']
    width = max([200, L // 2])

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        raise ValueError("unknown pwmbase %r" % (pwmbase,))
    motif = [val['motif'] for val in pwms.itervalues() if val['AC'] == pwmid][0]

    if sample in ['Gm12878', 'Gm12878All']:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (pwmid, dhs)
    else:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locations_Q%.1f.txt.gz" % (pwmid, dhs)

    if is_reference:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    readobj = None
    try:
        # Number of lines in the locations file.
        pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                                stdout=subprocess.PIPE, shell=True)
        Ns = int(pipe.communicate()[0].strip())
        if Ns == 0:
            raise ValueError("no candidate sites in %s (missing or empty file)"
                             % location_file)

        # Relax the PWM cutoff until enough sites are available.  The target is
        # capped at the file size so a small file cannot loop forever.
        # NOTE(review): assumes each line of the file yields one site -- confirm
        # the file has no header or blank lines.
        target = min(min_sites, Ns)
        alllocations = []
        pwm_cutoff = pwm_thresh + 1
        while len(alllocations) < target:
            pwm_cutoff -= 1
            handle = loadutils.ZipFile(location_file)
            try:
                alllocations = handle.read(threshold=pwm_cutoff)
            finally:
                handle.close()
        print("PWM Cutoff = %d" % pwm_cutoff)

        # Keep only the highest-scoring sites when there are too many.
        if len(alllocations) > max_sites:
            scores = np.array([loc[-1] for loc in alllocations]).astype(float)
            keep = np.argsort(scores)[-max_sites:]
            alllocations = [alllocations[index] for index in keep]
        print("Num of sites for learning, with pwm threshold of %d for %s = %d"
              % (pwm_thresh, pwmid, len(alllocations)))

        if is_reference:
            locs_tolearn = alllocations
        else:
            # Rescore the sites against this sample's own sequence.
            starttime = time.time()
            locs_tolearn = sequence.get_scores(alllocations, motif)
            print(" ".join(map(str, (len(locs_tolearn), time.time() - starttime))))

        print("filtering out unmappable sites ...")
        locs_tolearn = sequence.filter_mappability(locs_tolearn, width=width)

        print("loading dnase reads ...")
        readobj = loadutils.Dnase(sample=sample)
        dnasereads, locs_tolearn, subscores = readobj.getreads(
            locs_tolearn, remove_outliers=True, width=width)
        subscores = np.array(subscores)
        subscores = subscores.reshape(subscores.size, 1)
        dnasetotal = dnasereads.sum(1)
        print("Num of mappable sites for learning for %s = %d"
              % (pwmid, len(locs_tolearn)))

        if L < 400:
            # Keep L/4 columns either side of columns 100 and 300.
            dnasereads = np.hstack((dnasereads[:, 100 - L // 4:100 + L // 4],
                                    dnasereads[:, 300 - L // 4:300 + L // 4]))
        locs_tolearn = [list(loc) for loc in locs_tolearn]

        # Uniform null footprint over the L positions.
        null = np.ones((1, L), dtype=float) * 1. / L
        posterior, footprint, negbinparams, prior = centipede.EM(
            dnasereads, dnasetotal, subscores, null, model=model, restarts=2)

        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
        # NOTE(review): ChIP reads use width=400 but control reads width=200 --
        # confirm the asymmetry is intended.
        chipreads = chipobj.get_total_reads(locs_tolearn, width=400)
        controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
        controlobj.close()

        # Posterior log-odds, with infinities clipped to the finite extremes.
        logodds = np.log(posterior[:, 1] / posterior[:, 0])
        logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
        logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
        R = stats.pearsonr(logodds, np.sqrt(chipreads))
        R2 = stats.pearsonr(np.sqrt(dnasetotal), np.sqrt(chipreads))

        # Peak calls grouped by chromosome (first 22 chromosomes only).
        peak_file = ('/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
                     % loadutils.factormap[pwmid])
        with open(peak_file, 'r') as handle:
            calls = [line.strip().split()[:3] for line in handle]
        macs = dict((chrom, []) for chrom in utils.chromosomes[:22])
        for call in calls:
            if call[0] in macs:
                macs[call[0]].append([int(call[1]), int(call[2])])

        bsites = [loc for loc, p in zip(locs_tolearn, posterior[:, 1]) if p > 0.99]
        F, precision, sensitivity, _ = Fscore.Fscore(bsites, macs)
        chipauc, tpr, positive, negative = compute_chip_auc(
            chipreads, controlreads, logodds, macs, locs_tolearn)
        print(" ".join(map(str, (pwmid, model, sample, R, R2, chipauc, tpr,
                                 F, precision, sensitivity))))

        # Each parameter is stored in a one-element list (one entry per fit).
        output = {'footprint': [footprint],
                  'negbin': [negbinparams],
                  'prior': [prior],
                  'posterior': [posterior],
                  'locations': locs_tolearn}
        if sample is None:
            out_file = "%s/cache/combined/pbmcentipede_%s_%s.pkl" % (projpath, model, pwmid)
        else:
            out_file = "%s/cache/separate/pbmcentipede_%s_%s_%s.pkl" % (projpath, model, pwmid, sample)
        # Protocol 2 is a binary format, so the file must be opened in binary mode.
        with open(out_file, 'wb') as handle:
            cPickle.Pickler(handle, protocol=2).dump(output)
    finally:
        if readobj is not None:
            readobj.close()
        sequence.close()
def decode(pwmid, sample, cutk=0, pwmbase='transfac', pos_threshold=np.log10(99), chipseq=False):
    """Score every cached motif site with a previously learned Centipede model.

    Loads the pickled model parameters, walks the locations file in chunks of
    ``batch`` lines (plus one final remainder chunk), runs
    ``centipede.decode`` on the DNase reads of each chunk and writes one
    tab-separated row per site to a gzipped ``*_short_bound.bed.gz`` file.

    Args:
        pwmid: motif accession; matched against the ``'AC'`` field of each PWM.
        sample: sample name, or None / 'Gm12878' / 'Gm12878All' for the
            reference inputs (no per-sample rescoring of sites).
        cutk: 0, 2 or 4; selects which stored parameter set to use (index 0,
            1 or 2) and, when non-zero, a sequence-derived null footprint.
        pwmbase: 'transfac' or 'selex'; only consulted for non-reference
            samples, where the motif is needed to rescore sites.
        pos_threshold: unused, kept for backward compatibility.
        chipseq: when True, ChIP-seq reads are passed into ``centipede.decode``.

    Raises:
        ValueError: if ``cutk`` or ``pwmbase`` is unsupported, or if
            ``chipseq`` is True but the ChIP-seq data could not be opened.
    """
    import centipede

    model_index = {0: 0, 2: 1, 4: 2}
    if cutk not in model_index:
        raise ValueError("cutk must be 0, 2 or 4; got %r" % (cutk,))
    idx = model_index[cutk]
    is_reference = sample in [None, 'Gm12878', 'Gm12878All']
    width = max([200, L // 2])

    motif = None
    if is_reference:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        if pwmbase == 'transfac':
            pwms = loadutils.transfac_pwms()
        elif pwmbase == 'selex':
            pwms = loadutils.selex_pwms()
        else:
            raise ValueError("unknown pwmbase %r" % (pwmbase,))
        motif = [val['motif'] for val in pwms.itervalues() if val['AC'] == pwmid][0]
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    chipobj = readobj = readhandle = handle = None
    try:
        if cutk != 0:
            sequence.set_cutrate(sample=sample, k=cutk)

        # Parameters learned by an earlier Centipede run.
        if sample is None:
            model_file = "%s/cache/combined/pbmcentipede_short_%s.pkl" % (projpath, pwmid)
        else:
            model_file = "%s/cache/separate/pbmcentipede_short_%s_%s.pkl" % (projpath, pwmid, sample)
        with open(model_file, 'rb') as pkl:
            output = cPickle.load(pkl)
        footprint = output['footprint'][idx]
        negbinparams = output['negbin'][idx]
        prior = output['prior'][idx][0]
        dhsprior = output['prior'][idx][1]

        if sample in ['Gm12878', 'Gm12878All']:
            location_file = "%s/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (projpath, pwmid, dhs)
        else:
            location_file = "%s/cache/%s_locations_Q%.1f.txt.gz" % (projpath, pwmid, dhs)

        # Number of lines in the locations file.
        pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                                stdout=subprocess.PIPE, shell=True)
        Ns = int(pipe.communicate()[0].strip())

        # ChIP-seq data is optional: without it the output rows simply lack
        # the trailing read-count column.
        try:
            chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        except Exception:
            chipobj = None
        if chipseq and chipobj is None:
            raise ValueError("chipseq=True but ChIP-seq data for %s could not be opened"
                             % pwmid)

        readobj = loadutils.Dnase(sample=sample)
        readhandle = loadutils.ZipFile(location_file)

        if sample is None:
            out_file = "%s/cache/combined/%s_short_bound.bed.gz" % (projpath, pwmid)
        else:
            out_file = "%s/cache/separate/%s_%d_%s_short_bound.bed.gz" % (projpath, pwmid, cutk, sample)
        handle = gzip.open(out_file, 'wb')
        towrite = ['Chr', 'Start', 'Stop', 'Strand', 'PwmScore', 'LogPosOdds',
                   'LogPriorOdds', 'MultLikeRatio', 'NegBinLikeRatio', 'ChipseqReads']
        handle.write('\t'.join(towrite) + '\n')

        # Full-size chunks followed by one remainder chunk.
        loops = Ns // batch
        chunks = [(batch, True)] * loops + [(Ns - loops * batch, False)]
        for chunk, full_chunk in chunks:
            starttime = time.time()
            locations = readhandle.read(chunk=chunk)
            if not is_reference:
                # Rescore the sites against this sample's own sequence.
                locations = sequence.get_scores(locations, motif)
            locations = sequence.filter_mappability(locations, width=width)

            dnasereads, locations, subscores = readobj.getreads(locations, width=width)
            # The float cast is applied to every chunk; the remainder chunk
            # previously skipped it.
            subscores = np.array(subscores).astype('float')
            subscores = subscores.reshape(subscores.size, 1)
            dnasetotal = dnasereads.sum(1)
            if full_chunk:
                print(len(locations))

            if not chipseq:
                chipreads = None
            elif full_chunk:
                chipreads = chipobj.getreads(locations, width=width)
            else:
                # NOTE(review): the remainder chunk reads totals at width=200
                # while full chunks use getreads(); kept as found -- confirm
                # which one centipede.decode expects.
                chipreads = chipobj.get_total_reads(locations, width=200)

            # Null footprint: uniform for cutk == 0, else taken from sequence.
            if cutk == 0:
                null = np.ones((1, L), dtype=float) / L
            else:
                null = sequence.getnull(locations, width=L // 2)

            if L < 400:
                # Keep L/4 columns either side of columns 100 and 300.
                dnasereads = np.hstack((dnasereads[:, 100 - L // 4:100 + L // 4],
                                        dnasereads[:, 300 - L // 4:300 + L // 4]))

            logodds = centipede.decode(dnasereads, dnasetotal, null, subscores,
                                       footprint, negbinparams[0], negbinparams[1],
                                       prior, dhsprior, chipreads=chipreads, damp=damp)

            if not chipseq:
                if chipobj is not None:
                    totals = chipobj.get_total_reads(locations, width=400)
                    for loc, pos, count in zip(locations, logodds, totals):
                        loc.extend(['%.3f' % pos[k] for k in range(4)] + ['%d' % count])
                else:
                    for loc, pos in zip(locations, logodds):
                        loc.extend(['%.3f' % pos[k] for k in range(4)])
            # NOTE(review): with chipseq=True the rows are never extended, so
            # this filter looks like it drops every site -- confirm intent.
            locations = [loc for loc in locations if len(loc) > 5]
            for row in locations:
                handle.write('\t'.join(row) + '\n')
            if full_chunk:
                print(time.time() - starttime)
    finally:
        for resource in (readobj, chipobj, readhandle, handle, sequence):
            if resource is not None:
                resource.close()
def plotmodel(pwmid, sample=None, pwmbase='transfac'):
    """Plot and tabulate a comparison of learned binding models for one motif.

    Loads the pickled 'modelA' and 'modelB' PBM-Centipede fits plus three
    Centipede baseline fits (plain, damped, no-footprint), then writes:

    * a stats text file: per-model negative-binomial means, then one line of
      correlations / AUC per model;
    * a footprint PDF: mean footprints, AUC curves and a gamma/pi scatter;
    * a correlation PDF: log-odds vs sqrt(ChIP reads) for each model, and
      sqrt(DNase reads) vs sqrt(ChIP reads).

    Args:
        pwmid: motif accession; matched against the ``'AC'`` field of each PWM.
        sample: sample name, or None for the combined model files.
        pwmbase: 'transfac' or 'selex'; selects the PWM collection.

    Raises:
        ValueError: if ``pwmbase`` is not recognised.
    """
    import centipede_pbm as centipede
    from matplotlib.backends.backend_pdf import PdfPages

    def load_pickle(path):
        # Pickles may be binary, so always read them in binary mode.
        with open(path, 'rb') as pkl:
            return cPickle.load(pkl)

    def clipped_logodds(posterior):
        # Posterior log-odds with infinities clipped to the finite extremes.
        logodds = np.log(posterior[:, 1] / posterior[:, 0])
        logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
        logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
        return logodds

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        raise ValueError("unknown pwmbase %r" % (pwmbase,))
    key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]

    models = ['modelA', 'modelB']
    meanfootprints = []
    stdfootprints = []
    Logodds = []
    pis = []
    gammas = []

    # Peak calls grouped by chromosome (first 22 chromosomes only).
    peak_file = ('/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
                 % loadutils.factormap[pwmid])
    with open(peak_file, 'r') as handle:
        calls = [line.strip().split()[:3] for line in handle]
    macs = dict((chrom, []) for chrom in utils.chromosomes[:22])
    for call in calls:
        if call[0] in macs:
            macs[call[0]].append([int(call[1]), int(call[2])])

    if sample is None:
        statsfile = "%s/fig/stats_short_%s.txt" % (projpath, pwmid)
        title = pwms[key]['NA']
        footprintfile = "%s/fig/footprint_short_%s.pdf" % (projpath, pwmid)
        corrfile = "%s/fig/logoddsCorr_short_%s.pdf" % (projpath, pwmid)
    else:
        statsfile = "%s/fig/stats_short_%s_%s.txt" % (projpath, pwmid, sample)
        title = "%s / %s" % (pwms[key]['NA'], sample)
        footprintfile = "%s/fig/footprint_short_%s_%s.pdf" % (projpath, pwmid, sample)
        corrfile = "%s/fig/logoddsCorr_short_%s_%s.pdf" % (projpath, pwmid, sample)

    outhandle = open(statsfile, 'w')
    try:
        cascade = None
        for model in models:
            if sample is None:
                model_file = "%s/cache/combined/pbmcentipede_%s_short_%s.pkl" % (projpath, model, pwmid)
            else:
                model_file = "%s/cache/separate/pbmcentipede_%s_short_%s_%s.pkl" % (projpath, model, pwmid, sample)
            output = load_pickle(model_file)
            footparams = output['footprint'][0]
            alpha, tau = output['negbin'][0]
            posterior = output['posterior'][0]
            Logodds.append(clipped_logodds(posterior))
            means = alpha * (1 - tau) / tau
            outhandle.write('%.2f %.2f\n' % (means[0], means[1]))

            if cascade is None:
                # Reads are loaded once, from the first model's locations, and
                # reused for every model.
                locs_tolearn = output['locations']
                dnaseobj = loadutils.Dnase(sample=sample)
                dnasereads, _, _ = dnaseobj.getreads(locs_tolearn, width=max([200, L // 2]))
                if L < 400:
                    # Keep L/4 columns either side of columns 100 and 300.
                    reads = np.hstack((dnasereads[:, 100 - L // 4:100 + L // 4],
                                       dnasereads[:, 300 - L // 4:300 + L // 4]))
                else:
                    reads = dnasereads
                dnasereads = dnasereads.sum(1)
                dnaseobj.close()
                cascade = centipede.Cascade(L)
                cascade.setreads(reads)
                del reads

            # The two models store gamma / pi / extra parameter at different
            # positions of the footprint tuple.
            if model == 'modelA':
                gamma, pi = footparams[0], footparams[1]
                if isinstance(pi, centipede.Pi):
                    pi = pi.estim
                M1, _ = centipede.bayes_optimal_estimator(
                    cascade, posterior, pi, B=footparams[2], model=model)
            elif model == 'modelB':
                gamma, pi = footparams[1], footparams[2]
                if isinstance(pi, centipede.Pi):
                    pi = pi.estim
                M1, _ = centipede.bayes_optimal_estimator(
                    cascade, posterior, pi, mu=footparams[3], model=model)
            gammas.append(gamma)
            pis.append(pi)
            meanfootprints.append(M1.inverse_transform())
            stdfootprints.append(None)

        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
        controlobj.close()

        corrC = stats.pearsonr(np.sqrt(dnasereads), np.sqrt(chipreads))
        corrD = stats.pearsonr(np.sqrt(dnasereads), np.sqrt(controlreads))

        # Baseline Centipede fits; the no-footprint variant contributes only
        # log-odds, not a footprint.
        baselines = [('centipede_short', True),
                     ('centipede_damped_short', True),
                     ('centipede_nofoot_short', False)]
        for variant, has_footprint in baselines:
            output = load_pickle(
                "/mnt/lustre/home/anilraj/histmod/cache/separate/%s_%s_%s.pkl"
                % (variant, pwmid, sample))
            Logodds.append(clipped_logodds(output['posterior'][0]))
            if has_footprint:
                meanfootprints.append(output['footprint'][0])
                stdfootprints.append(None)

        labels = ['CentipedePBM_M1', 'CentipedePBM_M2', 'Centipede', 'CentipedeDamped']

        pdfhandle = PdfPages(footprintfile)
        try:
            figure = viz.plot_footprint(meanfootprints, labels=labels,
                                        stderr=stdfootprints,
                                        motif=pwms[key]['motif'], title=title)
            pdfhandle.savefig(figure)

            labels.append('CentipedeNoFoot')
            auc, tpr, positive, negative = compute_chip_auc(
                chipreads, controlreads, Logodds[0], macs, locs_tolearn)
            figure = viz.plot_auc(Logodds, positive, negative, labels=labels, title=title)
            pdfhandle.savefig(figure)

            # gamma and pi of model A (x) against model B (y); marker size
            # halves with each successive index.
            T = pis[0].size
            figure = viz.plot.figure()
            subplot = figure.add_subplot(111)
            for i in range(T):
                first = (i == 0)
                subplot.scatter(gammas[0].value[i], gammas[1].value[i], s=2**(T - i),
                                marker='o', color=viz.colors[1],
                                label='gamma' if first else '_nolabel_', alpha=0.5)
                subplot.scatter(pis[0][i], pis[1][i], s=2**(T - i),
                                marker='o', color=viz.colors[0],
                                label='pi' if first else '_nolabel_', alpha=0.5)
            xmin = min([pis[0].min(), pis[1].min()]) - 0.05
            xmax = max([pis[0].max(), pis[1].max()]) + 0.05
            subplot.axis([xmin, xmax, xmin, xmax])
            subplot.set_xlabel('PBM_M1')
            subplot.set_ylabel('PBM_M2')
            legend = subplot.legend(loc=1)
            for text in legend.texts:
                text.set_fontsize('8')
            legend.set_frame_on(False)
            pdfhandle.savefig(figure)
        finally:
            pdfhandle.close()

        pdfhandle = PdfPages(corrfile)
        try:
            lo = 0
            for logodds, model in zip(Logodds, labels):
                auc, tpr, positive, negative = compute_chip_auc(
                    chipreads, controlreads, logodds, macs, locs_tolearn)
                above = logodds > lo
                corrA = stats.pearsonr(logodds, np.sqrt(chipreads))
                corrB = stats.pearsonr(logodds, np.sqrt(controlreads))
                corra = stats.pearsonr(logodds[above], np.sqrt(chipreads)[above])
                corrb = stats.pearsonr(logodds[above], np.sqrt(controlreads)[above])
                corrc = stats.pearsonr(np.sqrt(dnasereads)[above], np.sqrt(chipreads)[above])
                corrd = stats.pearsonr(np.sqrt(dnasereads)[above], np.sqrt(controlreads)[above])
                towrite = [pwmid, model, corrA, corrB, corrC, corrD,
                           corra, corrb, corrc, corrd, auc, tpr,
                           logodds.size, (logodds > np.log(99)).sum()]
                outhandle.write(' '.join(map(str, towrite)) + '\n')
                figure = viz.plot_correlation(np.sqrt(chipreads), logodds, title=model)
                pdfhandle.savefig(figure)

            figure = viz.plot_correlation(np.sqrt(chipreads), np.sqrt(dnasereads),
                                          xlabel='sqrt(dnase reads)',
                                          title='Total Dnase reads')
            pdfhandle.savefig(figure)
        finally:
            pdfhandle.close()
    finally:
        outhandle.close()
def plotbound(pwmid, sample=None, cutk=0, pwmbase='transfac'):
    """Plot DNase/MNase profiles and ChIP-seq summaries for decoded sites.

    Sites are processed in four PWM-score bins.  In each bin, sites from the
    decoded ``*_bound_*.bed.gz`` file whose sixth column is at least
    ``log10(99)`` are treated as bound; a random sample of the full locations
    file (at most twice the number of bound sites) minus the bound sites is
    treated as unbound.  Four PDFs are saved under ``projpath/fig``: DNase
    profile, MNase profile, ChIP-seq read distribution (bound vs unbound) and
    a ChIP-reads / log-odds / score scatter.

    Args:
        pwmid: motif accession; matched against the ``'AC'`` field of each PWM.
        sample: sample name, or None for the combined decoded file.
        cutk: value embedded in the decoded file name and output names.
        pwmbase: 'transfac' or 'selex'; selects the PWM collection.

    Raises:
        ValueError: if ``pwmbase`` is not recognised.
    """
    import random

    bounds = [(1, 5), (5, 9), (9, 13), (13, np.inf)]
    labels = ['1 - 5', '5 - 9', '9 - 13', '>13']
    bound_cutoff = np.log10(99)

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        raise ValueError("unknown pwmbase %r" % (pwmbase,))
    key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]

    all_file = "%s/cache/%s_locations_Q%.1f.txt.gz" % (projpath, pwmid, 95.0)
    if sample is None:
        bound_file = "%s/cache/combined/%s_%d_bound_Q%.1f.bed.gz" % (projpath, pwmid, cutk, dhs)
        title = pwms[key]['NA']
        suffix = "_%s_%d_Q%.1f.pdf" % (pwmid, cutk, dhs)
    else:
        bound_file = "%s/cache/separate/%s_%d_%s_bound_Q%.1f.bed.gz" % (projpath, pwmid, cutk, sample, dhs)
        title = "%s / %s" % (pwms[key]['NA'], sample)
        suffix = "_short_%s_%d_%s_Q%.1f" % (pwmid, cutk, sample, dhs) + ".pdf"
    dnaseprofilefile = "%s/fig/dnaseprofile%s" % (projpath, suffix)
    mnaseprofilefile = "%s/fig/mnaseprofile%s" % (projpath, suffix)
    chipdistfile = "%s/fig/chipdist%s" % (projpath, suffix)
    scatterfile = "%s/fig/scatter%s" % (projpath, suffix)

    bound_chipreads = []
    unbound_chipreads = []
    dnasemean_bound = []
    mnasemean_bound = []
    chiptotalreads = []
    logodds = []
    score = []

    dnaseobj = chipseqobj = mnaseobj = None
    try:
        dnaseobj = loadutils.Dnase(sample=sample)
        chipseqobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        mnaseobj = loadutils.Mnase(sample=sample)

        for bound in bounds:
            # Both files are reopened for every bin and closed once read.
            all_handle = loadutils.ZipFile(all_file)
            try:
                all_locations = all_handle.read(threshold=bound)
            finally:
                all_handle.close()
            bound_handle = loadutils.ZipFile(bound_file)
            try:
                blocs = bound_handle.read(threshold=bound)
            finally:
                bound_handle.close()

            bound_locations = [loc[:5] for loc in blocs if float(loc[5]) >= bound_cutoff]
            if len(all_locations) > 2 * len(bound_locations):
                all_locations = random.sample(all_locations, 2 * len(bound_locations))
            # NOTE(review): set() requires hashable location records -- confirm
            # ZipFile.read returns tuples here.
            unbound_locations = list(set(all_locations).difference(set(bound_locations)))

            chiptotalreads.extend([int(loc[-1]) for loc in blocs])
            logodds.extend([float(loc[-2]) for loc in blocs])
            score.extend([float(loc[-3]) for loc in blocs])

            print(" ".join(map(str, (bound, len(bound_locations), len(unbound_locations)))))

            # NOTE(review): the DNase list keeps only the first value returned
            # by aggregate() while the MNase list keeps the whole return value;
            # kept as found -- confirm against the viz plotting functions.
            x, _ = aggregate(bound_locations, dnaseobj)
            dnasemean_bound.append(x)
            mnasemean_bound.append(aggregate(bound_locations, mnaseobj))

            bound_chipreads.extend(chipseqobj.getreads(bound_locations))
            unbound_chipreads.extend(chipseqobj.getreads(unbound_locations))

        chiptotalreads = np.array(chiptotalreads)
        logodds = np.array(logodds)
        score = np.array(score)

        motiflen = len(pwms[key]['motif'])
        figure = viz.plot_dnaseprofile(dnasemean_bound, labels, motiflen=motiflen, title=title)
        figure.savefig(dnaseprofilefile, dpi=300, format='pdf')

        figure = viz.plot_mnaseprofile(mnasemean_bound, labels, motiflen=motiflen, title=title)
        figure.savefig(mnaseprofilefile, dpi=300, format='pdf')

        figure = viz.plot_chipseq_distribution(bound_chipreads, unbound_chipreads, title=title)
        figure.savefig(chipdistfile, bbox_inches=0, dpi=300, format='pdf')

        figure = viz.plot_chipseq_posterior_correlation(chiptotalreads, logodds, score, title=title)
        figure.savefig(scatterfile, bbox_inches=0, dpi=300, format='pdf')
    finally:
        for resource in (dnaseobj, mnaseobj, chipseqobj):
            if resource is not None:
                resource.close()
return None, maxgamma, maxpi, parameters if __name__ == "__main__": pwmid = sys.argv[2] sample = "NA18505" model = sys.argv[1] location_file = "/mnt/lustre/home/anilraj/pbm_dnase_profile/cache/%s_0_short_bound.bed.gz" % (pwmid) handle = loadutils.ZipFile(location_file) locations = handle.read(threshold=11) print pwmid, sample, model print "read in locations ..." if pwmid[0] == "M": pwms = loadutils.transfac_pwms() elif pwmid[0] == "S": pwms = loadutils.selex_pwms() motif = [val["motif"] for val in pwms.itervalues() if val["AC"] == pwmid][0] print "selected motif model ..." bound = [loc for loc in locations if int(loc[-1]) > 50] undecided = [loc for loc in locations if int(loc[-1]) > 0] dnaseobj = loadutils.Dnase(sample=sample) reads, undecided, ig = dnaseobj.getreads(undecided, remove_outliers=False, width=200) totalreads = reads.sum(1) print "extracted total reads ..." handle = PdfPages("/mnt/lustre/home/anilraj/pbm_dnase_profile/fig/compare_models_%s_%s.pdf" % (model, pwmid)) for width in [64, 128, 256]: boundreads, ig, ig = dnaseobj.getreads(bound, remove_outliers=True, width=width)
def decode(pwmid, sample, cutk=0, pwmbase='transfac', pos_threshold=np.log10(99), chipseq=False):
    """Score every cached motif site with a previously learned Centipede model.

    Loads the pickled model parameters, walks the locations file in chunks of
    ``batch`` lines (plus one final remainder chunk), runs
    ``centipede.decode`` on the DNase reads of each chunk and writes one
    tab-separated row per site to a gzipped ``*_short_bound.bed.gz`` file.

    Args:
        pwmid: motif accession; matched against the ``'AC'`` field of each PWM.
        sample: sample name, or None / 'Gm12878' / 'Gm12878All' for the
            reference inputs (no per-sample rescoring of sites).
        cutk: 0, 2 or 4; selects which stored parameter set to use (index 0,
            1 or 2) and, when non-zero, a sequence-derived null footprint.
        pwmbase: 'transfac' or 'selex'; only consulted for non-reference
            samples, where the motif is needed to rescore sites.
        pos_threshold: unused, kept for backward compatibility.
        chipseq: when True, ChIP-seq reads are passed into ``centipede.decode``.

    Raises:
        ValueError: if ``cutk`` or ``pwmbase`` is unsupported, or if
            ``chipseq`` is True but the ChIP-seq data could not be opened.
    """
    import centipede

    model_index = {0: 0, 2: 1, 4: 2}
    if cutk not in model_index:
        raise ValueError("cutk must be 0, 2 or 4; got %r" % (cutk,))
    idx = model_index[cutk]
    is_reference = sample in [None, 'Gm12878', 'Gm12878All']
    width = max([200, L // 2])

    motif = None
    if is_reference:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        if pwmbase == 'transfac':
            pwms = loadutils.transfac_pwms()
        elif pwmbase == 'selex':
            pwms = loadutils.selex_pwms()
        else:
            raise ValueError("unknown pwmbase %r" % (pwmbase,))
        motif = [val['motif'] for val in pwms.itervalues() if val['AC'] == pwmid][0]
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    chipobj = readobj = readhandle = handle = None
    try:
        if cutk != 0:
            sequence.set_cutrate(sample=sample, k=cutk)

        # Parameters learned by an earlier Centipede run.
        if sample is None:
            model_file = "%s/cache/combined/pbmcentipede_short_%s.pkl" % (projpath, pwmid)
        else:
            model_file = "%s/cache/separate/pbmcentipede_short_%s_%s.pkl" % (projpath, pwmid, sample)
        with open(model_file, 'rb') as pkl:
            output = cPickle.load(pkl)
        footprint = output['footprint'][idx]
        negbinparams = output['negbin'][idx]
        prior = output['prior'][idx][0]
        dhsprior = output['prior'][idx][1]

        if sample in ['Gm12878', 'Gm12878All']:
            location_file = "%s/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (projpath, pwmid, dhs)
        else:
            location_file = "%s/cache/%s_locations_Q%.1f.txt.gz" % (projpath, pwmid, dhs)

        # Number of lines in the locations file.
        pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                                stdout=subprocess.PIPE, shell=True)
        Ns = int(pipe.communicate()[0].strip())

        # ChIP-seq data is optional: without it the output rows simply lack
        # the trailing read-count column.
        try:
            chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        except Exception:
            chipobj = None
        if chipseq and chipobj is None:
            raise ValueError("chipseq=True but ChIP-seq data for %s could not be opened"
                             % pwmid)

        readobj = loadutils.Dnase(sample=sample)
        readhandle = loadutils.ZipFile(location_file)

        if sample is None:
            out_file = "%s/cache/combined/%s_short_bound.bed.gz" % (projpath, pwmid)
        else:
            out_file = "%s/cache/separate/%s_%d_%s_short_bound.bed.gz" % (projpath, pwmid, cutk, sample)
        handle = gzip.open(out_file, 'wb')
        towrite = ['Chr', 'Start', 'Stop', 'Strand', 'PwmScore', 'LogPosOdds',
                   'LogPriorOdds', 'MultLikeRatio', 'NegBinLikeRatio', 'ChipseqReads']
        handle.write('\t'.join(towrite) + '\n')

        # Full-size chunks followed by one remainder chunk.
        loops = Ns // batch
        chunks = [(batch, True)] * loops + [(Ns - loops * batch, False)]
        for chunk, full_chunk in chunks:
            starttime = time.time()
            locations = readhandle.read(chunk=chunk)
            if not is_reference:
                # Rescore the sites against this sample's own sequence.
                locations = sequence.get_scores(locations, motif)
            locations = sequence.filter_mappability(locations, width=width)

            dnasereads, locations, subscores = readobj.getreads(locations, width=width)
            # The float cast is applied to every chunk; the remainder chunk
            # previously skipped it.
            subscores = np.array(subscores).astype('float')
            subscores = subscores.reshape(subscores.size, 1)
            dnasetotal = dnasereads.sum(1)
            if full_chunk:
                print(len(locations))

            if not chipseq:
                chipreads = None
            elif full_chunk:
                chipreads = chipobj.getreads(locations, width=width)
            else:
                # NOTE(review): the remainder chunk reads totals at width=200
                # while full chunks use getreads(); kept as found -- confirm
                # which one centipede.decode expects.
                chipreads = chipobj.get_total_reads(locations, width=200)

            # Null footprint: uniform for cutk == 0, else taken from sequence.
            if cutk == 0:
                null = np.ones((1, L), dtype=float) / L
            else:
                null = sequence.getnull(locations, width=L // 2)

            if L < 400:
                # Keep L/4 columns either side of columns 100 and 300.
                dnasereads = np.hstack((dnasereads[:, 100 - L // 4:100 + L // 4],
                                        dnasereads[:, 300 - L // 4:300 + L // 4]))

            logodds = centipede.decode(dnasereads, dnasetotal, null, subscores,
                                       footprint, negbinparams[0], negbinparams[1],
                                       prior, dhsprior, chipreads=chipreads, damp=damp)

            if not chipseq:
                if chipobj is not None:
                    totals = chipobj.get_total_reads(locations, width=400)
                    for loc, pos, count in zip(locations, logodds, totals):
                        loc.extend(['%.3f' % pos[k] for k in range(4)] + ['%d' % count])
                else:
                    for loc, pos in zip(locations, logodds):
                        loc.extend(['%.3f' % pos[k] for k in range(4)])
            # NOTE(review): with chipseq=True the rows are never extended, so
            # this filter looks like it drops every site -- confirm intent.
            locations = [loc for loc in locations if len(loc) > 5]
            for row in locations:
                handle.write('\t'.join(row) + '\n')
            if full_chunk:
                print(time.time() - starttime)
    finally:
        for resource in (readobj, chipobj, readhandle, handle, sequence):
            if resource is not None:
                resource.close()
def infer(pwmid, sample, pwm_thresh=8, pwmbase='transfac', chipseq=False):
    """Fit the 'modelC' PBM-Centipede model for one motif and pickle the result.

    Candidate sites are read from the cached locations file with ``pwm_thresh``
    as the initial reader threshold; the threshold is lowered by one until at
    least 100 sites are returned.  At most the 100000 highest-scoring sites are
    kept.  DNase reads at those sites are passed to ``centipede.EM`` and the
    fitted parameters are compared against ChIP-seq reads and peak calls; the
    comparison statistics are printed and the model is pickled under
    ``projpath``.

    Args:
        pwmid: motif accession; matched against the ``'AC'`` field of each PWM.
        sample: sample name, or None / 'Gm12878' / 'Gm12878All' for the
            reference inputs (no per-sample rescoring of sites).
        pwm_thresh: initial PWM threshold used when reading sites.
        pwmbase: 'transfac' or 'selex'; selects the PWM collection.
        chipseq: unused, kept for backward compatibility.  The ChIP-seq reads
            it used to request were overwritten before anything read them.

    Raises:
        ValueError: if ``pwmbase`` is not recognised or the locations file
            contains no lines.
    """
    import centipede_pbm as centipede

    # NOTE: print(...) with a single argument behaves identically as a
    # Python 2 print statement, which is what this file uses.
    model = 'modelC'
    min_sites = 100
    max_sites = 100000
    is_reference = sample in [None, 'Gm12878', 'Gm12878All']
    width = max([200, L // 2])

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        raise ValueError("unknown pwmbase %r" % (pwmbase,))
    motif = [val['motif'] for val in pwms.itervalues() if val['AC'] == pwmid][0]

    if sample in ['Gm12878', 'Gm12878All']:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locationsGm12878_Q%.1f.txt.gz" % (pwmid, dhs)
    else:
        location_file = "/mnt/lustre/home/anilraj/histmod/cache/%s_locations_Q%.1f.txt.gz" % (pwmid, dhs)

    if is_reference:
        sequence = loadutils.Sequence(sample)
    else:
        indiv_idx = loadutils.read_individuals()
        sequence = loadutils.Sequence(sample, sample_idx=indiv_idx[sample])

    readobj = None
    try:
        # Number of lines in the locations file.
        pipe = subprocess.Popen("zcat %s | wc -l" % location_file,
                                stdout=subprocess.PIPE, shell=True)
        Ns = int(pipe.communicate()[0].strip())
        if Ns == 0:
            raise ValueError("no candidate sites in %s (missing or empty file)"
                             % location_file)

        # Relax the PWM cutoff until enough sites are available.  The target is
        # capped at the file size so a small file cannot loop forever.
        # NOTE(review): assumes each line of the file yields one site -- confirm
        # the file has no header or blank lines.
        target = min(min_sites, Ns)
        alllocations = []
        pwm_cutoff = pwm_thresh + 1
        while len(alllocations) < target:
            pwm_cutoff -= 1
            handle = loadutils.ZipFile(location_file)
            try:
                alllocations = handle.read(threshold=pwm_cutoff)
            finally:
                handle.close()
        print("PWM Cutoff = %d" % pwm_cutoff)

        # Keep only the highest-scoring sites when there are too many.
        if len(alllocations) > max_sites:
            scores = np.array([loc[-1] for loc in alllocations]).astype(float)
            keep = np.argsort(scores)[-max_sites:]
            alllocations = [alllocations[index] for index in keep]
        print("Num of sites for learning, with pwm threshold of %d for %s = %d"
              % (pwm_thresh, pwmid, len(alllocations)))

        if is_reference:
            locs_tolearn = alllocations
        else:
            # Rescore the sites against this sample's own sequence.
            starttime = time.time()
            locs_tolearn = sequence.get_scores(alllocations, motif)
            print(" ".join(map(str, (len(locs_tolearn), time.time() - starttime))))

        print("filtering out unmappable sites ...")
        locs_tolearn = sequence.filter_mappability(locs_tolearn, width=width)

        print("loading dnase reads ...")
        readobj = loadutils.Dnase(sample=sample)
        dnasereads, locs_tolearn, subscores = readobj.getreads(
            locs_tolearn, remove_outliers=True, width=width)
        subscores = np.array(subscores)
        subscores = subscores.reshape(subscores.size, 1)
        dnasetotal = dnasereads.sum(1)
        print("Num of mappable sites for learning for %s = %d"
              % (pwmid, len(locs_tolearn)))

        if L < 400:
            # Keep L/4 columns either side of columns 100 and 300.
            dnasereads = np.hstack((dnasereads[:, 100 - L // 4:100 + L // 4],
                                    dnasereads[:, 300 - L // 4:300 + L // 4]))
        locs_tolearn = [list(loc) for loc in locs_tolearn]

        # Uniform null footprint over the L positions.
        null = np.ones((1, L), dtype=float) * 1. / L
        posterior, footprint, negbinparams, prior = centipede.EM(
            dnasereads, dnasetotal, subscores, null, model=model, restarts=2)

        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
        # NOTE(review): ChIP reads use width=400 but control reads width=200 --
        # confirm the asymmetry is intended.
        chipreads = chipobj.get_total_reads(locs_tolearn, width=400)
        controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
        controlobj.close()

        # Posterior log-odds, with infinities clipped to the finite extremes.
        logodds = np.log(posterior[:, 1] / posterior[:, 0])
        logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
        logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
        R = stats.pearsonr(logodds, np.sqrt(chipreads))
        R2 = stats.pearsonr(np.sqrt(dnasetotal), np.sqrt(chipreads))

        # Peak calls grouped by chromosome (first 22 chromosomes only).
        peak_file = ('/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
                     % loadutils.factormap[pwmid])
        with open(peak_file, 'r') as handle:
            calls = [line.strip().split()[:3] for line in handle]
        macs = dict((chrom, []) for chrom in utils.chromosomes[:22])
        for call in calls:
            if call[0] in macs:
                macs[call[0]].append([int(call[1]), int(call[2])])

        bsites = [loc for loc, p in zip(locs_tolearn, posterior[:, 1]) if p > 0.99]
        F, precision, sensitivity, _ = Fscore.Fscore(bsites, macs)
        chipauc, tpr, positive, negative = compute_chip_auc(
            chipreads, controlreads, logodds, macs, locs_tolearn)
        print(" ".join(map(str, (pwmid, model, sample, R, R2, chipauc, tpr,
                                 F, precision, sensitivity))))

        # Each parameter is stored in a one-element list (one entry per fit).
        output = {'footprint': [footprint],
                  'negbin': [negbinparams],
                  'prior': [prior],
                  'posterior': [posterior],
                  'locations': locs_tolearn}
        if sample is None:
            out_file = "%s/cache/combined/pbmcentipede_%s_%s.pkl" % (projpath, model, pwmid)
        else:
            out_file = "%s/cache/separate/pbmcentipede_%s_%s_%s.pkl" % (projpath, model, pwmid, sample)
        # Protocol 2 is a binary format, so the file must be opened in binary mode.
        with open(out_file, 'wb') as handle:
            cPickle.Pickler(handle, protocol=2).dump(output)
    finally:
        if readobj is not None:
            readobj.close()
        sequence.close()
def _read_zip_locations(path, threshold):
    """Read the records of the location file at *path*, then close it.

    `threshold` is forwarded unchanged to `loadutils.ZipFile.read`.
    """
    handle = loadutils.ZipFile(path)
    try:
        return handle.read(threshold=threshold)
    finally:
        handle.close()


def plotbound(pwmid, sample=None, cutk=0, pwmbase='transfac'):
    """Plot read profiles and ChIP-seq summaries for bound motif sites.

    Sites are processed in four PWM-score bins: (1, 5), (5, 9), (9, 13)
    and (13, inf).  For every bin the function reads the candidate sites
    and the sites called bound from the cache, aggregates DNase and MNase
    reads over the bound sites, and collects ChIP-seq reads for bound and
    unbound sites.  Four figures are then written under `<projpath>/fig`:
    `dnaseprofile`, `mnaseprofile`, `chipdist` and `scatter`.

    Relies on the module-level names `projpath`, `dhs`, `np`, `loadutils`,
    `viz` and `aggregate`.

    Arguments:
        pwmid: value of the 'AC' field of the motif to plot; also a key of
            `loadutils.factormap` and part of the cache/figure file names.
        sample: sample name, or None to use the `combined` cache files.
        cutk: integer that is part of the bound-site cache file name and
            of the figure file names.
        pwmbase: 'transfac' or 'selex'; selects the PWM collection.

    Raises:
        ValueError: if `pwmbase` is neither 'transfac' nor 'selex'.
        IndexError: if no PWM in the collection has 'AC' equal to `pwmid`.
    """
    import random

    bounds = [(1, 5), (5, 9), (9, 13), (13, np.inf)]
    labels = ['1 - 5', '5 - 9', '9 - 13', '>13']

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        raise ValueError("unknown pwmbase: %r" % (pwmbase,))
    key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]

    # The cache file names do not depend on the score bin.
    all_path = "%s/cache/%s_locations_Q%.1f.txt.gz" % (projpath, pwmid, 95.0)
    if sample is None:
        bound_path = "%s/cache/combined/%s_%d_bound_Q%.1f.bed.gz" % (
            projpath, pwmid, cutk, dhs)
    else:
        bound_path = "%s/cache/separate/%s_%d_%s_bound_Q%.1f.bed.gz" % (
            projpath, pwmid, cutk, sample, dhs)

    # A site counts as bound when field 5 of its record is >= log10(99);
    # presumably that field is the log10 posterior odds -- TODO confirm.
    min_bound_value = np.log10(99)

    bound_chipreads = []
    unbound_chipreads = []
    dnasemean_bound = []
    mnasemean_bound = []
    chiptotalreads = []
    logodds = []
    score = []

    dnaseobj = loadutils.Dnase(sample=sample)
    chipseqobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
    mnaseobj = loadutils.Mnase(sample=sample)
    try:
        for bound in bounds:
            all_locations = _read_zip_locations(all_path, bound)
            blocs = _read_zip_locations(bound_path, bound)
            bound_locations = [loc[:5] for loc in blocs
                               if float(loc[5]) >= min_bound_value]

            # Keep at most twice as many candidate sites as bound sites.
            if len(all_locations) > 2 * len(bound_locations):
                all_locations = random.sample(all_locations,
                                              2 * len(bound_locations))
            # NOTE(review): bound sites are 5-field slices; if the records in
            # all_locations carry a different number of fields, nothing is
            # removed by this set difference -- confirm against the cache
            # file format.
            unbound_locations = list(
                set(all_locations).difference(set(bound_locations)))

            # The last three fields of every record in the bound file.
            chiptotalreads.extend([int(loc[-1]) for loc in blocs])
            logodds.extend([float(loc[-2]) for loc in blocs])
            score.extend([float(loc[-3]) for loc in blocs])

            # Single-argument print: same output as the Python 2 statement
            # `print bound, n_bound, n_unbound`.
            print("%s %d %d" % (bound, len(bound_locations),
                                len(unbound_locations)))

            # Only the first element returned by aggregate() is kept for
            # DNase, whereas the complete return value is kept for MNase.
            dnase_profile, _unused = aggregate(bound_locations, dnaseobj)
            dnasemean_bound.append(dnase_profile)
            mnasemean_bound.append(aggregate(bound_locations, mnaseobj))

            # ChIP-seq reads at bound and at unbound sites.
            bound_chipreads.extend(chipseqobj.getreads(bound_locations))
            unbound_chipreads.extend(chipseqobj.getreads(unbound_locations))

        chiptotalreads = np.array(chiptotalreads)
        logodds = np.array(logodds)
        score = np.array(score)

        if sample is None:
            title = pwms[key]['NA']
            tag = "_%s_%d_Q%.1f.pdf" % (pwmid, cutk, dhs)
        else:
            title = "%s / %s" % (pwms[key]['NA'], sample)
            tag = "_short_%s_%d_%s_Q%.1f.pdf" % (pwmid, cutk, sample, dhs)
        dnaseprofilefile = "%s/fig/dnaseprofile%s" % (projpath, tag)
        mnaseprofilefile = "%s/fig/mnaseprofile%s" % (projpath, tag)
        chipdistfile = "%s/fig/chipdist%s" % (projpath, tag)
        scatterfile = "%s/fig/scatter%s" % (projpath, tag)

        motiflen = len(pwms[key]['motif'])
        figure = viz.plot_dnaseprofile(dnasemean_bound, labels,
                                       motiflen=motiflen, title=title)
        figure.savefig(dnaseprofilefile, dpi=300, format='pdf')

        figure = viz.plot_mnaseprofile(mnasemean_bound, labels,
                                       motiflen=motiflen, title=title)
        figure.savefig(mnaseprofilefile, dpi=300, format='pdf')

        figure = viz.plot_chipseq_distribution(bound_chipreads,
                                               unbound_chipreads, title=title)
        figure.savefig(chipdistfile, bbox_inches=0, dpi=300, format='pdf')

        figure = viz.plot_chipseq_posterior_correlation(chiptotalreads,
                                                        logodds, score,
                                                        title=title)
        figure.savefig(scatterfile, bbox_inches=0, dpi=300, format='pdf')
    finally:
        # Release the track objects even when reading or plotting fails.
        dnaseobj.close()
        mnaseobj.close()
        chipseqobj.close()
def _read_pickle(path):
    """Return the object pickled in the file at *path*.

    The file is opened in binary mode and is closed even when unpickling
    fails.
    """
    handle = open(path, 'rb')
    try:
        return cPickle.load(handle)
    finally:
        handle.close()


def _posterior_logodds(posterior):
    """Return log(posterior[:, 1] / posterior[:, 0]) without infinities.

    +inf entries are replaced by the largest remaining value and -inf
    entries by the smallest remaining value, in that order.
    """
    logodds = np.log(posterior[:, 1] / posterior[:, 0])
    logodds[logodds == np.inf] = logodds[logodds != np.inf].max()
    logodds[logodds == -np.inf] = logodds[logodds != -np.inf].min()
    return logodds


def plotmodel(pwmid, sample=None, pwmbase='transfac'):
    """Compare fitted binding models for one motif and write the figures.

    The pickled results of 'modelA' and 'modelB' are loaded, followed by the
    pickled results named `centipede_short`, `centipede_damped_short` and
    `centipede_nofoot_short`.  For every model the log posterior odds are
    compared with ChIP-seq and control read counts.

    Files written:
        * `<projpath>/fig/stats_short_<pwmid>[_<sample>].txt`: one line with
          the two values of alpha*(1-tau)/tau per PBM model, then one line
          of correlation/AUC statistics per model;
        * `<projpath>/fig/footprint_short_...pdf`: footprints, the AUC plot
          and a scatter of the gamma/pi parameters of the two PBM models;
        * `<projpath>/fig/logoddsCorr_short_...pdf`: one correlation plot
          per model plus one for total DNase reads.

    Relies on the module-level names `projpath`, `L`, `np`, `stats`,
    `cPickle`, `loadutils`, `utils`, `viz` and `compute_chip_auc`.

    Arguments:
        pwmid: value of the 'AC' field of the motif; also a key of
            `loadutils.factormap` / `loadutils.controlmap` and part of the
            cache/figure file names.
        sample: sample name, or None to use the `combined` PBM results.
        pwmbase: 'transfac' or 'selex'; selects the PWM collection.

    Raises:
        ValueError: if `pwmbase` is neither 'transfac' nor 'selex'.
        IndexError: if no PWM in the collection has 'AC' equal to `pwmid`.
    """
    import centipede_pbm as centipede
    from matplotlib.backends.backend_pdf import PdfPages

    if pwmbase == 'transfac':
        pwms = loadutils.transfac_pwms()
    elif pwmbase == 'selex':
        pwms = loadutils.selex_pwms()
    else:
        raise ValueError("unknown pwmbase: %r" % (pwmbase,))
    key = [k for k, pwm in pwms.iteritems() if pwm['AC'] == pwmid][0]

    # Peak calls grouped by chromosome, restricted to the first 22 entries
    # of utils.chromosomes.
    macs = dict([(chrom, []) for chrom in utils.chromosomes[:22]])
    handle = open('/mnt/lustre/home/anilraj/histmod/cache/chipseq_peaks/%s_peaks.bed'
                  % loadutils.factormap[pwmid], 'r')
    try:
        calls = [line.strip().split()[:3] for line in handle]
    finally:
        handle.close()
    for call in calls:
        if call[0] in macs:
            macs[call[0]].append([int(call[1]), int(call[2])])

    if sample is None:
        statsfile = "%s/fig/stats_short_%s.txt" % (projpath, pwmid)
    else:
        statsfile = "%s/fig/stats_short_%s_%s.txt" % (projpath, pwmid, sample)

    pbm_models = ['modelA', 'modelB']
    meanfootprints = []
    # Standard-error footprints are not plotted; the None placeholders keep
    # this list aligned with meanfootprints.
    stdfootprints = []
    Logodds = []
    pis = []
    gammas = []

    outhandle = open(statsfile, 'w')
    try:
        cascade = None
        for model in pbm_models:
            if sample is None:
                picklefile = "%s/cache/combined/pbmcentipede_%s_short_%s.pkl" % (
                    projpath, model, pwmid)
            else:
                picklefile = "%s/cache/separate/pbmcentipede_%s_short_%s_%s.pkl" % (
                    projpath, model, pwmid, sample)
            output = _read_pickle(picklefile)

            footparams = output['footprint'][0]
            alpha, tau = output['negbin'][0]
            posterior = output['posterior'][0]
            Logodds.append(_posterior_logodds(posterior))

            means = alpha * (1 - tau) / tau
            outhandle.write('%.2f %.2f\n' % (means[0], means[1]))

            if cascade is None:
                # Reads are loaded only once, at the locations stored with
                # the first model, and shared by all later computations.
                locs_tolearn = output['locations']
                dnaseobj = loadutils.Dnase(sample=sample)
                try:
                    dnasereads, ig, ig = dnaseobj.getreads(
                        locs_tolearn, width=max([200, L // 2]))
                finally:
                    dnaseobj.close()
                if L < 400:
                    # Keep L/2 columns centred on column 100 and L/2 columns
                    # centred on column 300 of the read matrix.
                    reads = np.hstack((
                        dnasereads[:, 100 - L // 4:100 + L // 4],
                        dnasereads[:, 300 - L // 4:300 + L // 4]))
                else:
                    reads = dnasereads
                # From here on only the per-site read totals are needed.
                dnasereads = dnasereads.sum(1)
                cascade = centipede.Cascade(L)
                cascade.setreads(reads)
                del reads

            # The two models store their parameters at different positions
            # and pass a different extra parameter to the estimator.
            if model == 'modelA':
                gamma = footparams[0]
                pi = footparams[1]
                extra = {'B': footparams[2]}
            elif model == 'modelB':
                gamma = footparams[1]
                pi = footparams[2]
                extra = {'mu': footparams[3]}
            if isinstance(pi, centipede.Pi):
                pi = pi.estim
            gammas.append(gamma)
            pis.append(pi)

            M1, _unused = centipede.bayes_optimal_estimator(
                cascade, posterior, pi, model=model, **extra)
            meanfootprints.append(M1.inverse_transform())
            stdfootprints.append(None)

        chipobj = loadutils.ChipSeq('Gm12878', loadutils.factormap[pwmid])
        controlobj = loadutils.ChipSeq('Gm12878', loadutils.controlmap[pwmid])
        chipreads = chipobj.get_total_reads(locs_tolearn, width=200)
        controlreads = controlobj.get_total_reads(locs_tolearn, width=200)
        chipobj.close()
        controlobj.close()

        sqrt_dnase = np.sqrt(dnasereads)
        sqrt_chip = np.sqrt(chipreads)
        sqrt_control = np.sqrt(controlreads)
        corrC = stats.pearsonr(sqrt_dnase, sqrt_chip)
        corrD = stats.pearsonr(sqrt_dnase, sqrt_control)

        # Results of the three other model variants.  The "nofoot" variant
        # contributes log odds only, no footprint.
        # NOTE(review): these paths are formatted with `sample` even when it
        # is None (giving "..._None.pkl") -- confirm that is intended.
        variants = [('centipede_short', True),
                    ('centipede_damped_short', True),
                    ('centipede_nofoot_short', False)]
        for prefix, has_footprint in variants:
            output = _read_pickle(
                "/mnt/lustre/home/anilraj/histmod/cache/separate/%s_%s_%s.pkl"
                % (prefix, pwmid, sample))
            Logodds.append(_posterior_logodds(output['posterior'][0]))
            if has_footprint:
                meanfootprints.append(output['footprint'][0])
                stdfootprints.append(None)

        if sample is None:
            title = pwms[key]['NA']
            footprintfile = "%s/fig/footprint_short_%s.pdf" % (projpath, pwmid)
            corrfile = "%s/fig/logoddsCorr_short_%s.pdf" % (projpath, pwmid)
        else:
            title = "%s / %s" % (pwms[key]['NA'], sample)
            footprintfile = "%s/fig/footprint_short_%s_%s.pdf" % (
                projpath, pwmid, sample)
            corrfile = "%s/fig/logoddsCorr_short_%s_%s.pdf" % (
                projpath, pwmid, sample)

        # Four labels for the footprint plot; the fifth is added afterwards
        # because the "nofoot" variant has no footprint.
        model_labels = ['CentipedePBM_M1', 'CentipedePBM_M2', 'Centipede',
                        'CentipedeDamped']

        pdfhandle = PdfPages(footprintfile)
        figure = viz.plot_footprint(meanfootprints, labels=model_labels,
                                    stderr=stdfootprints,
                                    motif=pwms[key]['motif'], title=title)
        pdfhandle.savefig(figure)

        model_labels.append('CentipedeNoFoot')
        auc, tpr, positive, negative = compute_chip_auc(
            chipreads, controlreads, Logodds[0], macs, locs_tolearn)
        figure = viz.plot_auc(Logodds, positive, negative,
                              labels=model_labels, title=title)
        pdfhandle.savefig(figure)

        # Scatter of the gamma and pi parameters of the two PBM models;
        # the marker area halves with every index, and only the first pair
        # of markers gets a legend entry.
        T = pis[0].size
        figure = viz.plot.figure()
        subplot = figure.add_subplot(111)
        for i in xrange(T):
            if i == 0:
                gamma_label, pi_label = 'gamma', 'pi'
            else:
                gamma_label, pi_label = '_nolabel_', '_nolabel_'
            subplot.scatter(gammas[0].value[i], gammas[1].value[i],
                            s=2 ** (T - i), marker='o', color=viz.colors[1],
                            label=gamma_label, alpha=0.5)
            subplot.scatter(pis[0][i], pis[1][i], s=2 ** (T - i), marker='o',
                            color=viz.colors[0], label=pi_label, alpha=0.5)
        xmin = min([pis[0].min(), pis[1].min()]) - 0.05
        xmax = max([pis[0].max(), pis[1].max()]) + 0.05
        subplot.axis([xmin, xmax, xmin, xmax])
        subplot.set_xlabel('PBM_M1')
        subplot.set_ylabel('PBM_M2')
        legend = subplot.legend(loc=1)
        for text in legend.texts:
            text.set_fontsize('8')
        legend.set_frame_on(False)
        pdfhandle.savefig(figure)
        pdfhandle.close()

        pdfhandle = PdfPages(corrfile)
        lo = 0
        for logodds, model in zip(Logodds, model_labels):
            auc, tpr, positive, negative = compute_chip_auc(
                chipreads, controlreads, logodds, macs, locs_tolearn)
            # Upper-case correlations use all sites, lower-case ones only
            # the sites whose log odds exceed `lo`.
            above = logodds > lo
            corrA = stats.pearsonr(logodds, sqrt_chip)
            corrB = stats.pearsonr(logodds, sqrt_control)
            corra = stats.pearsonr(logodds[above], sqrt_chip[above])
            corrb = stats.pearsonr(logodds[above], sqrt_control[above])
            corrc = stats.pearsonr(sqrt_dnase[above], sqrt_chip[above])
            corrd = stats.pearsonr(sqrt_dnase[above], sqrt_control[above])
            towrite = [pwmid, model, corrA, corrB, corrC, corrD,
                       corra, corrb, corrc, corrd, auc, tpr,
                       logodds.size, (logodds > np.log(99)).sum()]
            outhandle.write(' '.join(map(str, towrite)) + '\n')
            figure = viz.plot_correlation(sqrt_chip, logodds, title=model)
            pdfhandle.savefig(figure)

        figure = viz.plot_correlation(sqrt_chip, sqrt_dnase,
                                      xlabel='sqrt(dnase reads)',
                                      title='Total Dnase reads')
        pdfhandle.savefig(figure)
        pdfhandle.close()
    finally:
        # Flush and close the statistics file even if a step above fails.
        outhandle.close()