def __init__(self, ip_bam, control_bam, dbfn=None):
    """
    Build a :class:`Chipseq` object from an IP/control pair of BAM files.

    :param ip_bam: filename of the BAM file with the ChIP (IP) data
    :param control_bam: filename of the BAM file with the control data
    :param dbfn: optional filename of a gffutils database; when it is
        falsy, ``self.db`` stays ``None``
    """
    # The IP signal is created first, then the control signal.
    bam_kwargs = dict(kind='bam')
    self.ip = metaseq.genomic_signal(ip_bam, **bam_kwargs)
    self.control = metaseq.genomic_signal(control_bam, **bam_kwargs)

    # The annotation database is only opened when a filename was given.
    self.dbfn = dbfn
    self.db = gffutils.FeatureDB(dbfn) if self.dbfn else None

    # Signal arrays start out unset.
    self.ip_array = None
    self.control_array = None

    # Keyword arguments kept on the instance for plotting calls.
    self._strip_kwargs = {
        'color': '.5',
        'markeredgewidth': 0,
        'marker': 'o',
        'linestyle': 'None',
        'picker': 5,
    }
    self.browser_plotting_kwargs = [
        {'color': 'r', 'label': 'IP'},
        {'color': 'k', 'linestyle': ':', 'label': 'control'},
    ]
def __init__(self, ip_bam, control_bam, dbfn=None):
    """
    Build a :class:`Chipseq` object from an IP/control pair of BAM files.

    :param ip_bam: filename of the BAM file with the ChIP (IP) data
    :param control_bam: filename of the BAM file with the control data
    :param dbfn: optional filename of a gffutils database; when it is
        falsy, ``self.db`` stays ``None``
    """
    # The IP signal is created first, then the control signal.
    bam_kwargs = dict(kind='bam')
    self.ip = metaseq.genomic_signal(ip_bam, **bam_kwargs)
    self.control = metaseq.genomic_signal(control_bam, **bam_kwargs)

    # The annotation database is only opened when a filename was given.
    self.dbfn = dbfn
    self.db = gffutils.FeatureDB(dbfn) if self.dbfn else None

    # Signal arrays start out unset.
    self.ip_array = None
    self.control_array = None

    # Keyword arguments kept on the instance for plotting calls.
    self._strip_kwargs = {
        'color': '.5',
        'markeredgewidth': 0,
        'marker': 'o',
        'linestyle': 'None',
        'picker': 5,
    }
    self.browser_plotting_kwargs = [
        {'color': 'r', 'label': 'IP'},
        {'color': 'k', 'linestyle': ':', 'label': 'control'},
    ]
def metaseq_heatmap(conditions, bed, counts, window, controls, threads, name):
    """
    Plot one heatmap per sample plus a combined average profile over ``bed``.

    For every BAM file named in ``conditions`` the signal over the ``bed``
    windows is binned into 100 bins and normalised.  A heatmap is saved as
    ``<label>_heatmap_<name>.png`` for each sample and the mean profiles of
    all samples are saved together as ``Average_profile_<name>.png``.

    :param conditions: mapping of IP BAM filename -> label; the label is
        used in the legend and in the heatmap filename
    :param bed: windows handed to ``genomic_signal(...).array``
    :param counts: optional mapping of IP BAM filename -> argument for
        ``read_counts``; when truthy, arrays are scaled by
        ``1000 / read_counts(...)`` instead of by mapped reads per million
    :param window: total window width; the x axis spans
        ``-window/2 .. +window/2``
    :param controls: optional mapping of IP BAM filename -> control BAM
        filename; when truthy, the normalised control array is subtracted
        from the IP array before the heatmap is drawn
    :param threads: number of processes for ``array`` (converted with ``int``)
    :param name: suffix used in both output filenames
    """
    n_bins = 100
    threads = int(threads)
    half_window = float(window) / 2
    # The x axis is identical for every sample, so build it once.
    x = np.linspace(-half_window, half_window, n_bins)

    fig = plt.figure()
    ax = fig.add_subplot(111)

    for key in sorted(conditions):
        ip_signal = metaseq.genomic_signal(key, 'bam')
        # Create arrays in parallel
        ip_array = ip_signal.array(bed, bins=n_bins, processes=threads)

        # Normalize either to the supplied read count or to library size.
        scale = None
        if counts:
            scale = 1000 / float(read_counts(counts[key]))
            ip_array *= scale
        else:
            ip_array /= ip_signal.mapped_read_count() / 1e6

        input_array = None
        if controls:
            input_signal = metaseq.genomic_signal(controls[key], 'bam')
            input_array = input_signal.array(bed, bins=n_bins,
                                             processes=threads)
            if counts:
                # Same scaling factor as the IP array.
                input_array *= scale
            else:
                # The mapped read count lives on the signal object, not on
                # the numpy array returned by ``array``.
                input_array /= input_signal.mapped_read_count() / 1e6

        # The average profile uses the IP signal before control subtraction.
        ax.plot(x, ip_array.mean(axis=0), label=conditions[key])

        if controls:
            ip_array = ip_array - input_array

        fig2 = metaseq.plotutils.imshow(
            ip_array,
            x=x,
            figsize=(7, 10),
            vmin=5, vmax=99, percentile=True,
            line_kwargs=dict(color='k', label='All'),
            fill_kwargs=dict(color='k', alpha=0.3),
            sort_by=ip_array.mean(axis=1))
        fig2.line_axes.set_ylabel('Average enrichment')
        fig2.line_axes.set_xlabel('Distance from Center (bp)')
        fig2.array_axes.set_ylabel('Peaks')
        # Dotted vertical line at the window centre on both panels.
        fig2.array_axes.axvline(0, linestyle=':', color='k')
        fig2.line_axes.axvline(0, linestyle=':', color='k')
        fig2.savefig('{}_heatmap_{}.png'.format(conditions[key], name))
        plt.close(fig2)

    ax.axvline(0, linestyle=':', color='k')
    ax.set_xlabel('Distance from Center (bp)')
    ax.set_ylabel('Average read coverage (per million mapped reads)')
    ax.legend(loc=1, fancybox=True, framealpha=0.5, prop={'size': 7})
    fig.savefig('Average_profile_{}.png'.format(name))
    plt.close(fig)
def setup(self):
    """Open the example ``gdc.bam`` as a BAM signal and print a timestamped line naming this class."""
    bam_path = metaseq.example_filename('gdc.bam')
    self.m = metaseq.genomic_signal(bam_path, kind='bam')

    # "[timestamp] ClassName" followed by a newline, flushed immediately.
    stamp = datetime.datetime.now()
    line = '[%s] %s\n' % (stamp, self.__class__.__name__)
    print(line)
    sys.stdout.flush()
def calculateDependentProfile(otherMarkFile, filteredPositives,
                              metaIntersectingIntervals, bins, metaMaxima1,
                              metaMaxima2, metaMinima, pp, opPrefix):
    """
    Write meta and asymmetric profiles for every mark listed in a file.

    ``otherMarkFile`` is a tab-separated listing: the first column is the
    mark name and the second the bigWig filename.  For each mark the
    signal is opened with ``metaseq.genomic_signal`` and the matching
    intervals are fetched with ``getCorrespondingIntervals``.  Then
    ``calculateMetaProfile`` is called with those intervals as
    ``dependent`` and writes to ``<opPrefix>_<mark>_metaProfile.dat`` and
    ``<opPrefix>_<mark>_asymProfile.dat``.

    All other parameters are forwarded unchanged to
    ``calculateMetaProfile``.  Blank lines in the listing are ignored.

    Returns:
        None
    """
    markFiles = OrderedDict()
    corrIntervals = OrderedDict()

    # Context managers guarantee the handles are released on errors too.
    with open(otherMarkFile, "r") as markListing:
        for line in markListing:
            fields = line.strip().split("\t")
            if fields == [""]:
                # Blank line: nothing to register.
                continue
            markName, bigWigPath = fields[0], fields[1]
            markFiles[markName] = metaseq.genomic_signal(bigWigPath, "bigWig")
            corrIntervals[markName] = getCorrespondingIntervals(
                markFiles[markName], filteredPositives)

    for currMark in markFiles:
        markPrefix = opPrefix + "_" + currMark
        with open(markPrefix + "_metaProfile.dat", "w") as op:
            with open(markPrefix + "_asymProfile.dat", "w") as op2:
                calculateMetaProfile(filteredPositives,
                                     metaIntersectingIntervals,
                                     bins,
                                     metaMaxima1,
                                     metaMaxima2,
                                     metaMinima,
                                     pp,
                                     op,
                                     op2,
                                     dependent=corrIntervals[currMark])
    return
def getSignals(bigWigFiles, bigWigList, currRegions, currSignal):
    """
    Return the binned signal of ``currSignal`` around every region.

    Each feature in ``currRegions`` has its start raised to at least 500
    (the feature is modified in place) and is then widened by 500 on both
    sides.  The widened interval is split into ``binSize`` (25) wide bins,
    rounding the bin count up, and ``bigWigList[currSignal].array`` is
    queried for it; the first row of each result is collected.

    After every 1000 regions the ``bigWigList[currSignal]`` entry is
    deleted and re-created from ``bigWigFiles[currSignal]``.

    Returns:
        list with one signal row per region, in input order
    """
    binSize = 25
    flank = 500
    reopenEvery = 1000
    signalList = []
    for idx, currFeature in enumerate(currRegions, 1):
        # Keep ``start - flank`` from going negative.
        if currFeature.start < flank:
            currFeature.start = flank
        newFeature = pbt.BedTool(
            currFeature.chrom + " " + str(currFeature.start - flank) + " " +
            str(currFeature.end + flank),
            from_string=True)[0]
        numBP = newFeature.end - newFeature.start
        # Explicit floor division: the bin count must stay an integer even
        # when true division is in effect.
        numBins = numBP // binSize
        if numBP % binSize != 0:
            numBins += 1
            # NOTE(review): the padding is derived from ``end % binSize``,
            # not from ``numBP % binSize`` — confirm this is intended.
            newFeature.end += binSize - (newFeature.end % binSize)
        signal = bigWigList[currSignal].array([newFeature], bins=numBins)
        signalList.append(signal[0])
        if idx % reopenEvery == 0:
            # Periodically replace the signal object with a fresh one.
            del bigWigList[currSignal]
            bigWigList[currSignal] = metaseq.genomic_signal(
                bigWigFiles[currSignal], "bigWig")
    return signalList
def setup(self):
    """Open the example ``gdc.bam`` as a BAM signal and print a timestamped line naming this class."""
    bam_path = metaseq.example_filename('gdc.bam')
    self.m = metaseq.genomic_signal(bam_path, kind='bam')

    # "[timestamp] ClassName" followed by a newline, flushed immediately.
    stamp = datetime.datetime.now()
    line = '[%s] %s\n' % (stamp, self.__class__.__name__)
    print(line)
    sys.stdout.flush()
def draw_snapshot(sites, bamfiles, color="black", min_y=30, Nsite=5):
    """
    Draw a grid of coverage bar plots: one row per BAM file, one column per site.

    :param sites: indexable collection of intervals; ``str(site)`` must be a
        tab-separated line starting with chrom, start and end
    :param bamfiles: list of BAM filenames; the basename up to the first
        ``.`` labels each panel
    :param color: face and edge colour of the bars
    :param min_y: accepted for compatibility; not used by this function
    :param Nsite: maximum number of sites (columns) to draw
    :return: the matplotlib figure holding the grid

    Every panel in a row shares the same upper y limit, the largest one
    found in that row, and that limit is printed in the panel annotation.
    """
    import metaseq
    import matplotlib.pyplot as plt
    from mpl_toolkits.axes_grid1 import Grid

    Nsites_use = min(Nsite, len(sites))
    n_bams = len(bamfiles)

    # take read counts from samples
    ip_arrays = []
    for bamfile in bamfiles:
        ip_signal = metaseq.genomic_signal(bamfile, 'bam')
        ip_arrays.append(
            [ip_signal.local_coverage(sites[k]) for k in range(Nsites_use)])

    # draw figure
    fig = plt.figure(figsize=(100, 20))
    grid = Grid(fig, 142, nrows_ncols=(n_bams, Nsites_use),
                axes_pad=0.05, direction="row", add_all=True,
                share_all=False, label_mode="all")

    # First pass: draw the bars and record the largest upper limit per row.
    # Start from -inf rather than None so the comparison does not depend on
    # how None orders against numbers.
    ymaxs = [float('-inf')] * n_bams
    for i in range(n_bams):
        for k in range(Nsites_use):
            ax = grid[i * Nsites_use + k]
            ax.bar(ip_arrays[i][k][0], ip_arrays[i][k][1],
                   color=color, edgecolor=color)
            xmin, xmax, ymin, ymax = ax.axis()
            ymaxs[i] = max(ymaxs[i], ymax)

    # Second pass: apply the shared limit, hide the axes and annotate.
    for i in range(n_bams):
        sample_label = bamfiles[i].split('/')[-1].split('.')[0]
        for k in range(Nsites_use):
            ax = grid[i * Nsites_use + k]
            xmin, xmax, ymin, ymax = ax.axis()
            ax.axis([xmin, xmax, ymin, ymaxs[i]])
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)
            ax.annotate(
                sample_label + " [0-" + str(ymaxs[i]) + "]",
                xy=(0, 1), xytext=(10, -10), va='top',
                xycoords='axes fraction', textcoords='offset points',
                fontsize=25)
            if i == 0:
                # Only the top row carries the genomic location as title.
                fields = str(sites[k]).split('\t')
                chrom = fields[0]
                start = fields[1]
                end = fields[2].split('\n')[0]
                ax.set_title("Location: " + chrom + " " + start + "-" + end)
    return fig
def readBigWigList(ip):
    """
    Read a tab-separated listing of signal types and bigWig filenames.

    ``ip`` is iterated line by line.  The first tab-separated field of each
    raw line is the signal type; the second field of the stripped line is
    the bigWig filename.

    Returns:
        tuple of two ``OrderedDict`` objects keyed by signal type: the
        first maps to ``metaseq.genomic_signal(filename, "bigWig")``
        objects, the second to the filenames themselves
    """
    signalsByType = OrderedDict()
    pathsByType = OrderedDict()
    for record in ip:
        # The key comes from the unstripped line, the path from the stripped one.
        signalType = record.split("\t")[0]
        bigWigPath = record.strip().split("\t")[1]
        signalsByType[signalType] = metaseq.genomic_signal(bigWigPath,
                                                           "bigWig")
        pathsByType[signalType] = bigWigPath
    return signalsByType, pathsByType
def plot_np(bigwig_file, bigbed_file, sizes_file, output_file):
    """Plot read coverage and positive window calls of one NP across all chromosomes.

    :param bigwig_file: Path to bigwig file for NP read coverage
    :param bigbed_file: Path to bigbed file for NP positive windows
    :param sizes_file: Path to chromosome sizes file
    :param output_file: Path to save image to
    """
    row_names, row_sizes = parse_sizes_file(sizes_file)

    # Scale every row percentage by 30 and round to an integer axis size.
    axis_sizes = []
    for pct_row in get_row_pct(row_sizes):
        axis_sizes.append([int(round(pct * 30)) for pct in pct_row])

    read_coverage = genomic_signal(bigwig_file, 'bigwig')
    positive_windows = genomic_signal(bigbed_file, 'bed')

    plot_genome(axis_sizes, row_names, row_sizes,
                read_coverage, positive_windows,
                output_file)
def readBigWigList(ip):
    """
    Read a tab-separated listing of signal types and bigWig filenames.

    ``ip`` is iterated line by line.  The first tab-separated field of each
    raw line is the signal type; the second field of the stripped line is
    the bigWig filename.

    Returns:
        tuple of two ``OrderedDict`` objects keyed by signal type: the
        first maps to ``metaseq.genomic_signal(filename, "bigWig")``
        objects, the second to the filenames themselves
    """
    signalsByType = OrderedDict()
    pathsByType = OrderedDict()
    for record in ip:
        # The key comes from the unstripped line, the path from the stripped one.
        signalType = record.split("\t")[0]
        bigWigPath = record.strip().split("\t")[1]
        signalsByType[signalType] = metaseq.genomic_signal(bigWigPath,
                                                           "bigWig")
        pathsByType[signalType] = bigWigPath
    return signalsByType, pathsByType
def coverage(Bedfile, Bamfiles, Nproc, bins=None, fragSize=None):
    """
    Stack the signal arrays of several BAM files over the same windows.

    :param Bedfile: windows passed to ``genomic_signal(...).array``
    :param Bamfiles: iterable of BAM filenames, processed in order
    :param Nproc: value passed as ``processes``
    :param bins: value passed as ``bins``
    :param fragSize: value passed as ``fragment_size``
    :return: ``np.asarray`` of the per-file arrays, in ``Bamfiles`` order
    """
    import metaseq
    import numpy as np

    # Identical keyword arguments are used for every BAM file.
    array_kwargs = dict(bins=bins, fragment_size=fragSize, processes=Nproc)

    per_bam = []
    for bam_path in Bamfiles:
        print("Calculating coverages from : " + bam_path)
        signal = metaseq.genomic_signal(bam_path, 'bam')
        per_bam.append(signal.array(Bedfile, **array_kwargs))
    return np.asarray(per_bam)
# getRelevantSignals: collect one binned signal row per region.
#
# For each feature in ``currRegions`` the length (``end - start``) is divided
# by ``binSize`` (25) to get a bin count, with one extra bin when the length
# is not an exact multiple.  ``bigWigList[currSignal].array([feature],
# bins=numBins)`` is then queried and the first row of the result is appended
# to ``signalList``.
# The ``bigWigList[currSignal]`` entry is deleted and re-created from
# ``bigWigFiles[currSignal]`` with ``metaseq.genomic_signal(..., "bigWig")``.
# Returns the tuple ``(signalList, currRegions)``.
#
# NOTE(review): ``numBP`` is padded after ``numBins`` has been computed and is
# not read again afterwards, so the padding appears to have no effect — confirm.
# NOTE(review): ``numBP/binSize`` relies on integer division to produce an
# integer bin count — confirm the interpreter / ``__future__`` imports.
def getRelevantSignals(bigWigFiles, bigWigList, currRegions, currSignal): binSize = 25 signalList = [] for currFeature in currRegions: numBP = currFeature.end - currFeature.start numBins = numBP/binSize if numBP % binSize != 0: numBins += 1 numBP += 25 - (numBP % binSize) signal = bigWigList[currSignal].array([currFeature], bins=numBins) signalList.append(signal[0]) del bigWigList[currSignal] bigWigList[currSignal] = metaseq.genomic_signal(bigWigFiles[currSignal], "bigWig") return signalList, currRegions
# getRelevantSignals: collect one binned signal row per region.
#
# For each feature in ``currRegions`` the length (``end - start``) is divided
# by ``binSize`` (25) to get a bin count, with one extra bin when the length
# is not an exact multiple.  ``bigWigList[currSignal].array([feature],
# bins=numBins)`` is then queried and the first row of the result is appended
# to ``signalList``.
# The ``bigWigList[currSignal]`` entry is deleted and re-created from
# ``bigWigFiles[currSignal]`` with ``metaseq.genomic_signal(..., "bigWig")``.
# Returns the tuple ``(signalList, currRegions)``.
#
# NOTE(review): ``numBP`` is padded after ``numBins`` has been computed and is
# not read again afterwards, so the padding appears to have no effect — confirm.
# NOTE(review): ``numBP / binSize`` relies on integer division to produce an
# integer bin count — confirm the interpreter / ``__future__`` imports.
def getRelevantSignals(bigWigFiles, bigWigList, currRegions, currSignal): binSize = 25 signalList = [] for currFeature in currRegions: numBP = currFeature.end - currFeature.start numBins = numBP / binSize if numBP % binSize != 0: numBins += 1 numBP += 25 - (numBP % binSize) signal = bigWigList[currSignal].array([currFeature], bins=numBins) signalList.append(signal[0]) del bigWigList[currSignal] bigWigList[currSignal] = metaseq.genomic_signal(bigWigFiles[currSignal], "bigWig") return signalList, currRegions
def calculateDependentProfile(otherMarkFile, filteredPositives,
                              metaIntersectingIntervals, bins, metaMaxima1,
                              metaMaxima2, metaMinima, pp, opPrefix):
    """
    Write meta and asymmetric profiles for every mark listed in a file.

    ``otherMarkFile`` is a tab-separated listing: the first column is the
    mark name and the second the bigWig filename.  For each mark the
    signal is opened with ``metaseq.genomic_signal`` and the matching
    intervals are fetched with ``getCorrespondingIntervals``.  Then
    ``calculateMetaProfile`` is called with those intervals as
    ``dependent`` and writes to ``<opPrefix>_<mark>_metaProfile.dat`` and
    ``<opPrefix>_<mark>_asymProfile.dat``.

    All other parameters are forwarded unchanged to
    ``calculateMetaProfile``.  Blank lines in the listing are ignored.

    Returns:
        None
    """
    markFiles = OrderedDict()
    corrIntervals = OrderedDict()

    # Context managers guarantee the handles are released on errors too.
    with open(otherMarkFile, "r") as markListing:
        for line in markListing:
            fields = line.strip().split("\t")
            if fields == [""]:
                # Blank line: nothing to register.
                continue
            markName, bigWigPath = fields[0], fields[1]
            markFiles[markName] = metaseq.genomic_signal(bigWigPath, "bigWig")
            corrIntervals[markName] = getCorrespondingIntervals(
                markFiles[markName], filteredPositives)

    for currMark in markFiles:
        markPrefix = opPrefix + "_" + currMark
        with open(markPrefix + "_metaProfile.dat", "w") as op:
            with open(markPrefix + "_asymProfile.dat", "w") as op2:
                calculateMetaProfile(filteredPositives,
                                     metaIntersectingIntervals,
                                     bins,
                                     metaMaxima1,
                                     metaMaxima2,
                                     metaMinima,
                                     pp,
                                     op,
                                     op2,
                                     dependent=corrIntervals[currMark])
    return
def run_metaseq(): # set up a BamSignal object m = metaseq.genomic_signal(metaseq.example_filename("wgEncodeUwTfbsK562CtcfStdAlnRep1.bam"), kind="bam") print "metaseq starting...", sys.stdout.flush() t0 = time.time() # Tweak processes and chunksize as needed to balance CPUs and I/O. PROCESSES = 6 CHUNKSIZE = 100 # the trick is to use a single bin... ms_array = m.array(windows, processes=PROCESSES, chunksize=CHUNKSIZE, bins=1) t1 = time.time() print "completed in %.2fs" % (t1 - t0) sys.stdout.flush() return ms_array.ravel()
def getSignals(bigWigFiles, bigWigList, currRegions, currSignal):
    """
    Return the binned signal of ``currSignal`` around every region.

    Each feature in ``currRegions`` has its start raised to at least 500
    (the feature is modified in place) and is then widened by 500 on both
    sides.  The widened interval is split into ``binSize`` (25) wide bins,
    rounding the bin count up, and ``bigWigList[currSignal].array`` is
    queried for it; the first row of each result is collected.

    After every 1000 regions the ``bigWigList[currSignal]`` entry is
    deleted and re-created from ``bigWigFiles[currSignal]``.

    Returns:
        list with one signal row per region, in input order
    """
    binSize = 25
    flank = 500
    reopenEvery = 1000
    signalList = []
    for idx, currFeature in enumerate(currRegions, 1):
        # Keep ``start - flank`` from going negative.
        if currFeature.start < flank:
            currFeature.start = flank
        newFeature = pbt.BedTool(
            currFeature.chrom + " " + str(currFeature.start - flank) + " " +
            str(currFeature.end + flank),
            from_string=True)[0]
        numBP = newFeature.end - newFeature.start
        # Explicit floor division: the bin count must stay an integer even
        # when true division is in effect.
        numBins = numBP // binSize
        if numBP % binSize != 0:
            numBins += 1
            # NOTE(review): the padding is derived from ``end % binSize``,
            # not from ``numBP % binSize`` — confirm this is intended.
            newFeature.end += binSize - (newFeature.end % binSize)
        signal = bigWigList[currSignal].array([newFeature], bins=numBins)
        signalList.append(signal[0])
        if idx % reopenEvery == 0:
            # Periodically replace the signal object with a fresh one.
            del bigWigList[currSignal]
            bigWigList[currSignal] = metaseq.genomic_signal(
                bigWigFiles[currSignal], "bigWig")
    return signalList
def run_metaseq(): # set up a BamSignal object m = metaseq.genomic_signal(bam_fn, kind='bam') print 'metaseq starting...', sys.stdout.flush() t0 = time.time() # Tweak processes and chunksize as needed to balance CPUs and I/O. PROCESSES = 6 CHUNKSIZE = 100 # the trick is to use a single bin... ms_array = m.array(windows, processes=PROCESSES, chunksize=CHUNKSIZE, bins=1) t1 = time.time() print 'completed in %.2fs' % (t1 - t0) sys.stdout.flush() return ms_array.ravel()
import multiprocessing

import metaseq
from gffutils.helpers import asinterval
from pybedtools.featurefuncs import TSS


def tss_generator():
    """Yield ``TSS(..., upstream=1, downstream=0)`` for every ``mRNA`` feature in ``db``."""
    # Other feature types (CDS, gene, ...) could be requested here instead.
    for mrna in db.features_of_type('mRNA'):
        yield TSS(asinterval(mrna), upstream=1, downstream=0)


# Save the TSSs, then widen each by 1000 bp on both sides (hg19 sizes).
tsses = pybedtools.BedTool(tss_generator()).saveas('tsses.gtf')
tsses_1kb = tsses.slop(b=1000, genome='hg19', output='tsses-1kb.gtf')

ip_signal = metaseq.genomic_signal('WT_H2A_Z.sort.bam', 'bam')

# Use multiple CPUs. Dramatically speeds up run time.
processes = multiprocessing.cpu_count()

# Signal over the widened TSS windows, 100 bins per window.
ip_array = ip_signal.array(tsses_1kb, bins=100, processes=processes)
import multiprocessing from matplotlib import pyplot as plt import matplotlib import numpy as np import metaseq import pybedtools # Use example data and generate some random features gs = metaseq.genomic_signal(metaseq.example_filename('x.bam'), 'bam') features = pybedtools.BedTool()\ .window_maker( b=pybedtools.BedTool('chr2L 0 500000', from_string=True).fn, w=1000)\ .shuffle(seed=1, genome={'chr2L': (0, 5000000)}) genes = [] for i, f in enumerate(features): genes.append('gene_%s' % i) genes = np.array(genes) arr = gs.array(features, processes=multiprocessing.cpu_count(), bins=100) # At this point, each item in `genes` corresponds to the same row in `arr` ind, breaks = metaseq.plotutils.clustered_sortind(arr, k=5) # Boundaries of clusters are provided in `breaks`. # So the first cluster's original indices into `arr` are: cluster_1_inds = ind[0:breaks[0]] # Which means the genes in the first cluster are:
""" module for testing the larger files (x.bam, x.bed.gz, etc) """ import multiprocessing import metaseq import pybedtools CPUS = multiprocessing.cpu_count() gs = {} for kind in ['bam', 'bigwig', 'bed', 'bigbed']: if kind == 'bed': ext = 'bed.gz' else: ext = kind gs[kind] = metaseq.genomic_signal(metaseq.example_filename('x.%s' % ext), kind) # generate the test features features = pybedtools.BedTool()\ .window_maker( b=pybedtools.BedTool('chr2L 0 500000', from_string=True).fn, w=1000)\ .shuffle(seed=1, genome={'chr2L': (0, 5000000)}) args = (features, ) kwargs = dict(processes=CPUS, bins=100) bam_array = gs['bam'].array(*args, **kwargs) bed_array = gs['bed'].array(*args, **kwargs) bw_array = gs['bigwig'].array(*args, method='get_as_array', **kwargs)
'tracks/ChiP-seq_tracks/TLX3_H3K36me3_FE.bw' ] fig = plt.figure(1, (26., 12.)) grid = ImageGrid( fig, 111, nrows_ncols=(1, len(tracks)), # nrows_ncols=(1, 5), axes_pad=0.1, add_all=True, label_mode="R", aspect=False) for i, tr in enumerate(tracks): tit = '_'.join(tr.split('/')[-1].split('_')[:2]) tr_sig = metaseq.genomic_signal(tr, 'bigwig') arr = tr_sig.array(sm_Nkb, bins=bn, processes=processes) if i == 0: k = abs(np.max(arr) - np.min(arr)) vmin = np.min(arr) vmax = np.max(arr) print tit, vmin, vmax i_a = arr.max(axis=1).argsort() #i_a = arr.mean(axis=1).argsort() arr = arr[i_a, :] else: arr = abs(arr[i_a, :]) #arr = k*arr[i_a,:]/(abs(np.max(arr)-np.min(arr))) vmin_e = np.min(arr) vmax_e = np.max(arr) print tit, vmin_e, vmax_e
def signalfy(d, kind):
    """
    Return a new dict with every value of ``d`` turned into a genomic signal.

    Each value ``v`` is replaced by ``metaseq.genomic_signal(v, kind)``;
    keys are kept and ``d`` itself is not modified.
    """
    return dict(
        (label, metaseq.genomic_signal(filename, kind))
        for label, filename in d.items()
    )
# # pip install numpy # pip install cython # # 4. Install metaseq using pip. If you don't have a lot of the scientific # Python packages or genomic Python packages, this may take a while. # # pip install . import metaseq import pybedtools import numpy as np from matplotlib import pyplot as plt bam = metaseq.genomic_signal('Mcf7Max.sorted.bam', 'bam') cpg = pybedtools.BedTool('cpg.bed') tss = pybedtools.BedTool('HIF_sites_invovled_in_looping_not_at_promoter.bed') # extend by 5 kb up/downstream tss = tss.slop(b=5000, g=pybedtools.chromsizes('hg19')) tss_with_cpg = tss.intersect(cpg, u=True) tss_without_cpg = tss.intersect(cpg, v=True) # change this to as many CPUs as you have in order to run in parallel processes = 1 # each read will be extended 3' to a total size of this many bp fragment_size = 200
import numpy as np import os import metaseq ip_filename = metaseq.helpers.example_filename("wgEncodeHaibTfbsK562Atf3V0416101AlnRep1_chr17.bam") input_filename = metaseq.helpers.example_filename("wgEncodeHaibTfbsK562RxlchV0416101AlnRep1_chr17.bam") ip_signal = metaseq.genomic_signal(ip_filename, "bam") input_signal = metaseq.genomic_signal(input_filename, "bam") # If you already have TSSs, skip this part. import gffutils db = gffutils.FeatureDB(metaseq.example_filename("Homo_sapiens.GRCh37.66_chr17.gtf.db")) import pybedtools from pybedtools.featurefuncs import TSS from gffutils.helpers import asinterval def tss_generator(): for transcript in db.features_of_type("transcript"): yield TSS(asinterval(transcript), upstream=1000, downstream=1000) if not os.path.exists("tsses.gtf"): tsses = pybedtools.BedTool(tss_generator()).saveas("tsses.gtf") tsses = pybedtools.BedTool("tsses.gtf") from metaseq import persistence
def setup(self):
    """Open the example ``gdc.bigbed`` file as a bigBed signal in ``self.m``."""
    bigbed_path = metaseq.example_filename('gdc.bigbed')
    self.m = metaseq.genomic_signal(bigbed_path, kind='bigbed')
if __name__ == "__main__": import sys choices = ['xcorr', 'chipseq'] try: examples = sys.argv[1:] except IndexError: print 'Choices are: ', choices examples = [] for ex in examples: if ex not in choices: raise ValueError('%s not in %s' % (ex, choices)) if 'xcorr' in examples: ip = metaseq.genomic_signal( metaseq.example_filename('wgEncodeUwTfbsK562CtcfStdAlnRep1.bam'), 'bam') NWINDOWS = 5000 FRAGMENT_SIZE = 1 WINDOWSIZE = 5000 THRESH = FRAGMENT_SIZE / float(WINDOWSIZE) * 10 lags, shift = estimate_shift(ip, nwindows=NWINDOWS, maxlag=500, thresh=THRESH, array_kwargs=dict( processes=8, chunksize=100, fragment_size=FRAGMENT_SIZE), verbose=True)
def main(data_dir, peak_file, results, sortBy, list_name): regions=np.loadtxt(peak_file,dtype={'names': ('chr','start','stop','file'),'formats':('S15','int','int','S15')})['file'] peaks=BedTool(peak_file) # if not os.path.exists(results+"_".join(list_name)+'.npz'): print "generating" ip_name1 = peak_file.split("/")[-1].split("_")[0]+".bw" ip_name2 = peak_file.split("/")[-1].split("_")[1].split("_processed")[0]+".bw" print(ip_name1,ip_name2) ip_signal_1 = metaseq.genomic_signal(os.path.join(data_dir,ip_name1),'bigwig') ip_signal_2 = metaseq.genomic_signal(os.path.join(data_dir,ip_name2),'bigwig') ip_array_1 = ip_signal_1.array( peaks, bins=800,processes=processes,method="get_as_array") ip_array_2 = ip_signal_2.array( peaks, bins=800,processes=processes,method="get_as_array") metaseq.persistence.save_features_and_arrays(features=peaks,arrays={'ip1': ip_array_1,'ip2': ip_array_2},prefix=results+"_".join(list_name),link_features=True,overwrite=True) features, arrays = metaseq.persistence.load_features_and_arrays(prefix=results+"_".join(list_name)) ip1_mean=np.apply_along_axis(np.sum,1,arrays['ip1'][::,300:500]) ip2_mean=np.apply_along_axis(np.sum,1,arrays['ip2'][::,300:500]) #print(arrays['ip2']) #CFR=(ip1_mean)/(ip2_mean) #print(CFR) if sortBy == 1: ip1_mean_order=np.argsort(ip1_mean) path_fig = results+"heatmaps_"+list_name[0]+"_"+list_name[1]+"_sorted_on_"+list_name[0]+".png" elif sortBy == 2: ip1_mean_order=np.argsort(ip2_mean) path_fig = results+"heatmaps_"+list_name[0]+"_"+list_name[1]+"_sorted_on_"+list_name[1]+".png" #elif sortBy == 3: #ip1_mean_order=np.argsort(CFR) #path_fig = results+"heatmaps_"+list_name[0]+"_"+list_name[1]+"_sorted_on_CFR.png" #print(ip1_mean_order) ip1=arrays['ip1'][ip1_mean_order,:][::1] ip2=arrays['ip2'][ip1_mean_order,:][::1] #CFR_ordered=CFR[ip1_mean_order][::1] x = np.linspace(-400, 400, 800) regions_ordered=regions[ip1_mean_order][::1] Vmax=max([ip1.mean(),ip2.mean()])*2 Vmin=min([ip1.min(),ip2.min()]) gs = gridspec.GridSpec(2, 2, 
width_ratios=[8,8],height_ratios=([8,2])) gs.update(wspace=0.02, hspace=0.02) plt.rcParams['font.family'] = 'Arial' plt.rcParams['font.size'] = 10 fig=plt.figure(figsize=(9,5)) ax1=fig.add_subplot(gs[0,0]) ax1.pcolormesh(ip1,vmin=Vmin,vmax=Vmax,cmap=cm.Reds) ax1.set_title(list_name[0]+' coverage') ax1.xaxis.set_ticklabels([]) ax1.xaxis.set_ticks_position('none') ax2=fig.add_subplot(gs[0,1]) ax2.pcolormesh(ip2,vmin=Vmin,vmax=Vmax,cmap=cm.Reds) ax2.set_title(list_name[1]+' coverage') ax2.yaxis.tick_right() ax2.xaxis.set_ticklabels([]) ax2.xaxis.set_ticks_position('none') #ax3=fig.add_subplot(gs[2]) ## CFR_min=CFR_interval[0] ## CFR_max=CFR_interval[1] #CFR_min=CFR.min() #CFR_max=CFR.max() #pcm=ax3.pcolormesh(np.column_stack(((CFR_ordered),(CFR_ordered))),vmin=CFR_min,vmax=3,cmap=cm.Blues) #ax3.set_title('CFR (W='+str(round(CFR_min,2))+', B='+str(round(CFR_max,2))+')') #ax3.yaxis.set_ticklabels([]) #ax3.xaxis.set_ticklabels([]) # ax4=fig.add_subplot(gs[3]) # V_min=Dnase_leaf_ordered.min() # V_max=Dnase_leaf_ordered.max() # pcm=ax4.pcolormesh(np.column_stack(((Dnase_leaf_ordered),(Dnase_leaf_ordered))),vmin=V_min,vmax=V_max,cmap=cm.Blues) # ax4.set_xlabel('Dnase Leaf (W='+str(round(V_min,2))+', B='+str(round(V_max,2))+')') # ax4.yaxis.set_ticklabels([]) # ax4.xaxis.set_ticklabels([]) ax3=fig.add_subplot(gs[1,0]) ax3.plot(x,ip1.mean(axis=0)) ax4=fig.add_subplot(gs[1,1]) ax4.plot(x,ip2.mean(axis=0)) ax4.yaxis.tick_right() #fig.colorbar(pcm, extend='max') #plt.tight_layout() plt.savefig(path_fig)
import multiprocessing from matplotlib import pyplot as plt import matplotlib import numpy as np import metaseq import pybedtools # Use example data and generate some random features gs = metaseq.genomic_signal(metaseq.example_filename('x.bam'), 'bam') features = pybedtools.BedTool()\ .window_maker( b=pybedtools.BedTool('chr2L 0 500000', from_string=True).fn, w=1000)\ .shuffle(seed=1, genome={'chr2L': (0, 5000000)}) genes = [] for i, f in enumerate(features): genes.append('gene_%s' % i) genes = np.array(genes) arr = gs.array(features, processes=multiprocessing.cpu_count(), bins=100) # At this point, each item in `genes` corresponds to the same row in `arr` ind, breaks = metaseq.plotutils.clustered_sortind(arr, k=5) # Boundaries of clusters are provided in `breaks`. # So the first cluster's original indices into `arr` are: cluster_1_inds = ind[0:breaks[0]] # Which means the genes in the first cluster are:
# RAG rag_TLX3 = rel_path + rag_lst["tracks"]["TLX3"][1] rag_H3K4me1 = rel_path + rag_lst["tracks"]["H3K4me1"][1] rag_H3K4me2 = rel_path + rag_lst["tracks"]["H3K4me2"][0] rag_H3K4me3 = rel_path + rag_lst["tracks"]["H3K4me3"][0] rag_H3K9ac = rel_path + rag_lst["tracks"]["H3K9ac"][0] rag_H3K9me3 = rel_path + rag_lst["tracks"]["H3K9me3"][0] rag_H3K27ac = rel_path + rag_lst["tracks"]["H3K27ac"][1] rag_H3K36me3 = rel_path + rag_lst["tracks"]["H3K36me3"][0] rag_H3K27me3 = rel_path + rag_lst["tracks"]["H3K27me3"][0] rag_POLII = rel_path + rag_lst["tracks"]["POLII"][0] # In[8]: tlx_TLX3_sig = metaseq.genomic_signal(tlx_TLX3, 'bigwig') tlx_H3K4me1_sig = metaseq.genomic_signal(tlx_H3K4me1, 'bigwig') tlx_H3K4me2_sig = metaseq.genomic_signal(tlx_H3K4me2, 'bigwig') tlx_H3K4me3_sig = metaseq.genomic_signal(tlx_H3K4me3, 'bigwig') tlx_H3K9ac_sig = metaseq.genomic_signal(tlx_H3K9ac, 'bigwig') tlx_H3K9me3_sig = metaseq.genomic_signal(tlx_H3K9me3, 'bigwig') tlx_H3K27ac_sig = metaseq.genomic_signal(tlx_H3K27ac, 'bigwig') tlx_H3K36me3_sig = metaseq.genomic_signal(tlx_H3K36me3, 'bigwig') tlx_H3K27me3_sig = metaseq.genomic_signal(tlx_H3K27me3, 'bigwig') tlx_POLII_sig = metaseq.genomic_signal(tlx_POLII, 'bigwig') # In[9]: rag_TLX3_sig = metaseq.genomic_signal(rag_TLX3, 'bigwig') rag_H3K4me1_sig = metaseq.genomic_signal(rag_H3K4me1, 'bigwig') rag_H3K4me2_sig = metaseq.genomic_signal(rag_H3K4me2, 'bigwig')
import numpy as np import os import metaseq ip_filename = metaseq.helpers.example_filename( 'wgEncodeHaibTfbsK562Atf3V0416101AlnRep1_chr17.bam') input_filename = metaseq.helpers.example_filename( 'wgEncodeHaibTfbsK562RxlchV0416101AlnRep1_chr17.bam') ip_signal = metaseq.genomic_signal(ip_filename, 'bam') input_signal = metaseq.genomic_signal(input_filename, 'bam') # If you already have TSSs, skip this part. import gffutils db = gffutils.FeatureDB( metaseq.example_filename('Homo_sapiens.GRCh37.66_chr17.gtf.db')) import pybedtools from pybedtools.featurefuncs import TSS from gffutils.helpers import asinterval def tss_generator(): for transcript in db.features_of_type('transcript'): yield TSS(asinterval(transcript), upstream=1000, downstream=1000) if not os.path.exists('tsses.gtf'): tsses = pybedtools.BedTool(tss_generator()).saveas('tsses.gtf') tsses = pybedtools.BedTool('tsses.gtf') from metaseq import persistence
# filterPossibleMatches: turn matched-filter positives for ``currSignal`` into
# a final, de-nested BED file.
#
# Inputs read:
#   * ``<opPrefix>_<currSignal>_MFpositives.bed`` — candidate regions; their
#     signals are fetched with ``getSignals`` and the ``bigWigList[currSignal]``
#     entry is re-created afterwards.
#   * ``<opPrefix>_pValue_<width>.bed`` for every width from ``minWidth`` to
#     ``maxWidth`` in steps of 25 (``binSize``).  Lines containing "#" are
#     treated as headers and give the column index of ``currSignal``.
# Processing visible in the code:
#   * each signal is smoothed (``smoothSignals(..., win=10)``), its maxima are
#     found (``getMaxima``) and paired (``findPossiblePairings``); regions with
#     fewer than two maxima are skipped;
#   * for each pairing whose width lies within [minWidth, maxWidth] the
#     matching p-value file is scanned and an interval is written to
#     ``<opPrefix>_<currSignal>_tentPositives.bed`` when its ``currSignal``
#     column is <= 0.001;
#   * the tentative file is sorted with the external ``sortBed`` command and a
#     second pass writes ``<opPrefix>_<currSignal>_finalPositives.bed``;
#   * both temporary files are removed with ``rm``.
# Progress (the signal count and every 1000th index) is printed to stdout.
# Returns None.
#
# NOTE(review): ``os.system`` commands are built by string concatenation from
# ``opPrefix`` and ``currSignal``; values with spaces or shell metacharacters
# would break or alter the command — confirm the inputs are trusted.
# NOTE(review): the p-value file handles are iterated across regions without
# being rewound, which presumably relies on sorted input — confirm.
# NOTE(review): ``currChr`` is assigned but not read in this function, so the
# chromosome of the p-value line does not appear to be compared with the
# region's chromosome — confirm.
# NOTE(review): no ``op.write`` follows the final merge loop, so the last
# accumulated interval appears never to be written — confirm.
def filterPossibleMatches(bigWigFiles, bigWigList, metaprofile, opPrefix, currSignal, minWidth, maxWidth): currRegions = pbt.BedTool(opPrefix + "_" + currSignal + "_MFpositives.bed") signalList = getSignals(bigWigFiles, bigWigList, currRegions, currSignal) del bigWigList[currSignal] bigWigList[currSignal] = metaseq.genomic_signal(bigWigFiles[currSignal], "bigWig") print len(signalList) binSize = 25 op = open(opPrefix + "_" + currSignal + "_tentPositives.bed", "w") newWidth = minWidth ipFiles = OrderedDict() while newWidth <= maxWidth: ipFiles[newWidth] = open( opPrefix + "_pValue_" + str(newWidth) + ".bed", "r") newWidth += binSize for idx in range(0, len(signalList)): if idx % 1000 == 0: print idx region = currRegions[idx] smoothedSignal = smoothSignals(signalList[idx], win=10) maximaIndices = getMaxima(smoothedSignal) if len(maximaIndices) < 2: continue pairings = findPossiblePairings(maximaIndices, binSize, minWidth, maxWidth) for currPairing in pairings: newWidth = binSize * (currPairing[1] - currPairing[0]) if newWidth < minWidth or newWidth > maxWidth: continue currStart = np.floor((region.start - 500) + (currPairing[0] * binSize) - 5.0 / (len(metaprofile) - 10) * newWidth) + 25 currEnd = np.floor(region.start - 500 + currPairing[1] * binSize + 5.0 / (len(metaprofile) - 10) * newWidth) currChr = region.chrom for line in ipFiles[newWidth]: if "#" in line: headerFields = line.strip().split("\t") currSignalIdx = headerFields.index(currSignal) continue fields = line.strip().split("\t") start = int(fields[1]) chrom = fields[0] end = int(fields[2]) if start > currStart + binSize: break if start <= currStart and end >= currEnd: #accepting this enhancer if it passes cutoff if float(fields[currSignalIdx]) <= 0.001: op.write(chrom + "\t" + str(start) + "\t" + str(end) + "\t" + str(fields[currSignalIdx]) + "\t" + str(newWidth) + "\n") break newWidth = minWidth while newWidth <= maxWidth: ipFiles[newWidth].close() newWidth += binSize op.close() os.system("sortBed -i " + 
opPrefix + "_" + currSignal + "_tentPositives.bed > " + opPrefix + "_" + currSignal + "_tentPositives2.bed") ip = open(opPrefix + "_" + currSignal + "_tentPositives2.bed", "r") op = open(opPrefix + "_" + currSignal + "_finalPositives.bed", "w") currStart = currEnd = currPvalue = currWidth = 0 currChrom = "" for line in ip: fields = line.strip().split("\t") chrom = fields[0] start = int(fields[1]) end = int(fields[2]) pValue = float(fields[3]) width = int(fields[-1]) if chrom == currChrom and start <= currStart and end >= currEnd: currStart = start currEnd = end currPvalue = pValue currWidth = width currChrom = chrom elif chrom == currChrom and currStart <= start and currEnd >= end: continue else: if currChrom != "": op.write(currChrom + "\t" + str(currStart) + "\t" + str(currEnd) + "\t" + str(currPvalue) + "\t" + str(currWidth) + "\n") currStart = start currEnd = end currPvalue = pValue currWidth = width currChrom = chrom op.close() ip.close() os.system("rm " + opPrefix + "_" + currSignal + "_tentPositives2.bed " + opPrefix + "_" + currSignal + "_tentPositives.bed") return
#~ Metaseq # # --------------------------------------------------------- #~ metaseq works with the concepts of signal and windows. In this example, the signal is ChIP data, and the windows are TSS +/- 1kb. #~ The first step is to create “genomic signal” objects out of the data. Since our example files are BAM files, we specify the kind=’bam’, but if you have your own data in a different format (bigWig, bigBed, BED, GFF, GTF, VCF) then specify that format instead (see metaseq.genomic_signal()). import metaseq ip_file = 'TLX3_H3K4me3_repl1.sorted.bam' #ip_file = 'TLX3_H3K4me3_repl2.sorted.bam' input_file = 'INP-TLX3_1.sorted.bam' #input_file = 'INP-TLX3_2.sorted.bam' ip_signal = metaseq.genomic_signal(os.path.join(data_dir, ip_file), 'bam') input_signal = metaseq.genomic_signal(os.path.join(data_dir, input_file), 'bam') #~ Now we can create the arrays of signal over each window. Since this can be a time-consuming step, the first time this code is run it will cache the arrays on disk. The next time this code is run, it will be quickly loaded. Trigger a re-run by deleting the .npz file. #~ Here, with the BamSignal.array method, we bin each promoter region into 100 bins, and calculate the signal in parallel across as many CPUs as are available. We do this for the IP signal and input signals separately. Then, since these are BAM files of mapped reads, we scale the arrays to the library size. The scaled arrays are then saved to disk, along with the windows that were used to create them. import multiprocessing processes = multiprocessing.cpu_count() if not os.path.exists('example.npz'): # The signal is the IP ChIP-seq BAM file. ip_array = ip_signal.array(
if req not in allowed: raise ValueError("%s not in %s" % (req, allowed)) plt.rcParams['font.size'] = 10 intervals = pybedtools.BedTool().window_maker( genome={args.chrom: (args.start, args.stop)}, n=args.nfeatures)\ .shuffle(genome={args.chrom: (args.start, args.stop)}, seed=1) size = (args.stop - args.start) / args.nfeatures if args.type == 'all': requested = ['bigwig', 'bam', 'bigbed', 'bed'] signals = [] for req in requested: if req != 'bed': signals.append(metaseq.genomic_signal(args.prefix + '.' + req, req)) else: signals.append(metaseq.genomic_signal(args.prefix + '.bed.gz', 'bed')) files = '* ' + '\n * '.join([i.fn for i in signals]) plot_filename = (args.plot_prefix + '-%s_features-%s_bp_chunksize=%s.pdf' % (args.nfeatures, size, args.chunksize)) processes = range(1, multiprocessing.cpu_count() + 1) max_proc = processes[-1] print """ {usage} Parameters ---------- This script will generate {args.nfeatures} random features, each about {size} bp, from genomic coordinates {args.chrom}:{args.start}-{args.stop}.
if __name__ == "__main__": import sys choices = ['xcorr', 'chipseq'] try: examples = sys.argv[1:] except IndexError: print 'Choices are: ', choices examples = [] for ex in examples: if ex not in choices: raise ValueError('%s not in %s' % (ex, choices)) if 'xcorr' in examples: ip = metaseq.genomic_signal( metaseq.example_filename( 'wgEncodeUwTfbsK562CtcfStdAlnRep1.bam'), 'bam') NWINDOWS = 5000 FRAGMENT_SIZE = 1 WINDOWSIZE = 5000 THRESH = FRAGMENT_SIZE / float(WINDOWSIZE) * 10 lags, shift = estimate_shift( ip, nwindows=NWINDOWS, maxlag=500, thresh=THRESH, array_kwargs=dict( processes=8, chunksize=100, fragment_size=FRAGMENT_SIZE), verbose=True) plt.plot(lags, shift.mean(axis=0)) plt.axvline( lags[np.argmax(shift.mean(axis=0))],
def main(histoneFile, MPRApeakFile, nonrandomFile, opPrefix, pp, otherMarkFile=None, plotChar=True):
    """
    The main function of the program. Calculates the pattern of troughs
    between STARR-seq peaks.

    Args:
        histoneFile: path to the bigWig file of histone signal
        MPRApeakFile: path to the peaks from the massively parallel reporter
            assay for regulatory regions
        nonrandomFile: path to the regions on which negatives should not
            intersect
        opPrefix: output prefix for all output files
        pp: plotting handle forwarded unchanged to plotCharacteristics,
            calculateMetaProfile and calculateDependentProfile
            (NOTE(review): presumably a PdfPages object -- confirm)
        otherMarkFile: optional additional mark file; it is passed to
            getDoublePeakRegions and, when not None, a dependent profile is
            computed from it as well
        plotChar: when true, the shape characteristics are plotted

    Returns:
        None. Results are written to ``<opPrefix>_doublePeakStats.txt``,
        ``<opPrefix>_doublePeak.bed``, ``<opPrefix>_metaProfile.dat`` and
        ``<opPrefix>_asymProfile.dat``.

    Raises:
        SystemExit: with status 1 when an input cannot be opened, an output
            cannot be created, or no double peak region was found.
    """
    # Checking input files. Each failure is reported on stderr and ends the
    # program with a non-zero status so shell callers can detect it.
    try:
        MPRApeaks = pbt.BedTool(MPRApeakFile)
    except Exception:
        sys.stderr.write("ERROR: Cannot open MPRA peak file " + MPRApeakFile + "\n")
        sys.exit(1)
    try:
        histoneSignal = metaseq.genomic_signal(histoneFile, "bigWig")
    except Exception:
        sys.stderr.write("ERROR: Cannot open histone signal file " + histoneFile + "\n")
        sys.exit(1)
    try:
        nrFile = open(nonrandomFile, "r")
    except Exception:
        sys.stderr.write("ERROR: " + nonrandomFile + " does not open\n")
        sys.exit(1)

    # Checking output files
    try:
        op = open(opPrefix + "_doublePeakStats.txt", "w")
        op2 = open(opPrefix + "_doublePeak.bed", "w")
    except Exception:
        nrFile.close()
        sys.stderr.write("ERROR: Cannot create output files\n")
        sys.exit(1)

    # These lists are filled in place by getDoublePeakRegions.
    metaIntersectingIntervals = []
    metaMinima = []
    metaMaxima1 = []
    metaMaxima2 = []
    doublePeakRegions = []

    try:
        # Read nonrandom intervals
        # NOTE(review): `nonrandom` is built and sorted but not used again
        # in this function -- confirm whether it is still needed.
        try:
            nonrandom = bed.Bed()
            for line in nrFile:
                nonrandom.features.append(extractFeature(line.rstrip()))
            nonrandom.sortByChromosomeAndStartAndEnd()
        finally:
            nrFile.close()

        # Getting total number of 25 bp intervals that can be negative
        numNegativesTotal = calculateNumberNegatives(nonrandomFile, histoneFile, opPrefix)
        print("Number of negatives are %s" % (numNegativesTotal,))

        op.write("Maxima1\tMinima\tMaxima2\tdistanceMaxima1\tdistanceMaxima2\tratioMaxima\tratioMaximaMinima\tdistancePeakMinima\n")
        filteredPositives = getDoublePeakRegions(MPRApeaks, histoneSignal, metaMaxima1, metaMaxima2,
                                                 metaMinima, op, op2, metaIntersectingIntervals,
                                                 doublePeakRegions, otherMarkFile)
    finally:
        # Close the outputs even if one of the steps above fails.
        op.close()
        op2.close()

    # Getting shape characteristics
    allShapeCharacteristics = []
    readShapeCharacteristics(allShapeCharacteristics, opPrefix + "_doublePeakStats.txt")
    if plotChar:
        plotCharacteristics(allShapeCharacteristics, pp)

    # Distances maxima1 -> minima and minima -> maxima2 for every region
    length1 = []
    length2 = []
    for maxima1, minima, maxima2 in zip(metaMaxima1, metaMinima, metaMaxima2):
        length1.append(minima - maxima1)
        length2.append(maxima2 - minima)
        if length1[-1] <= 0 or length2[-1] <= 0:
            # Report regions whose extrema are not strictly ordered
            print("%s %s %s" % (maxima1, minima, maxima2))

    # Without any region the bin count below is undefined (max() of an empty
    # list), so stop with a clear message instead of a ValueError.
    if not length1:
        sys.stderr.write("ERROR: No double peak regions found; cannot calculate metaprofile\n")
        sys.exit(1)

    # Calculating metaprofile
    try:
        op = open(opPrefix + "_metaProfile.dat", "w")
        op2 = open(opPrefix + "_asymProfile.dat", "w")
    except Exception:
        sys.stderr.write("ERROR: Could not open " + opPrefix + "_metaProfile.dat\n")
        sys.exit(1)
    try:
        bins = 2 * max([max(length1), max(length2)])
        calculateMetaProfile(filteredPositives, metaIntersectingIntervals, bins,
                             metaMaxima1, metaMaxima2, metaMinima, pp, op, op2)
    finally:
        op.close()
        op2.close()

    if otherMarkFile is not None:
        calculateDependentProfile(otherMarkFile, filteredPositives, metaIntersectingIntervals, bins,
                                  metaMaxima1, metaMaxima2, metaMinima, pp, opPrefix)
    return
"""
module for testing the larger files (x.bam, x.bed.gz, etc)
"""
import multiprocessing
import metaseq
import pybedtools

CPUS = multiprocessing.cpu_count()

# One genomic_signal per kind; only the BED example carries a .gz suffix.
gs = {}
for kind in ['bam', 'bigwig', 'bed', 'bigbed']:
    ext = 'bed.gz' if kind == 'bed' else kind
    gs[kind] = metaseq.genomic_signal(
        metaseq.example_filename('x.%s' % ext), kind)

# generate the test features: 1000 bp windows made from the chr2L interval
# below, then shuffled with a fixed seed so every run sees the same features
features = pybedtools.BedTool().window_maker(
    b=pybedtools.BedTool('chr2L 0 500000', from_string=True).fn,
    w=1000
).shuffle(seed=1, genome={'chr2L': (0, 5000000)})

# Arguments shared by every array() call below
args = (features,)
kwargs = {'processes': CPUS, 'bins': 100}

bam_array = gs['bam'].array(*args, **kwargs)
bed_array = gs['bed'].array(*args, **kwargs)
bw_array = gs['bigwig'].array(*args, method='get_as_array', **kwargs)
def make_vplot(bam_file, tss, prefix, genome, read_len, bins=400, bp_edge=2000, processes=8, greenleaf_norm=True):
    '''
    Plot read coverage around TSSs and report the peak of the mean profile.

    Produces 2 plots. One is the aggregation plot alone, while the other
    also shows the signal at each TSS ordered by strength.

    :param bam_file: BAM file to read the signal from
    :param tss: TSS file, loaded with pybedtools.BedTool
    :param prefix: prefix for the two output PNG files
    :param genome: passed to BedTool.slop as ``g`` when extending the TSSs
    :param read_len: read length; reads are shifted by half of it
    :param bins: number of bins per extended TSS window
    :param bp_edge: number of bp added on each side of a TSS
    :param processes: number of processes used to build the array
    :param greenleaf_norm: if true, divide by the mean signal of the edge
        bins; otherwise scale by mapped reads in millions
    :returns: (vplot_file, vplot_large_file, tss_point_val), where
        tss_point_val is the maximum of the mean profile
    '''
    vplot_file = '{0}_vplot.png'.format(prefix)
    vplot_large_file = '{0}_large_vplot.png'.format(prefix)

    # Load the TSS file
    tss = pybedtools.BedTool(tss)
    tss_ext = tss.slop(b=bp_edge, g=genome)

    # Load the bam file
    bam = metaseq.genomic_signal(bam_file, 'bam')
    # Shift to center the read on the cut site. Floor division keeps the
    # shift an integer regardless of the interpreter's division semantics.
    bam_array = bam.array(tss_ext, bins=bins,
                          shift_width=-read_len // 2,
                          processes=processes, stranded=True)

    # Normalization (Greenleaf style): Find the avg height
    # at the end bins and take fold change over that
    if greenleaf_norm:
        # Use enough bins to cover 100 bp on either end. The bin width is
        # computed in floating point and the count is clamped to at least
        # one bin, so wide bins cannot cause a division by zero below.
        bin_width = 2.0 * bp_edge / bins
        num_edge_bins = int(100 / bin_width) if bin_width > 0 else 1
        num_edge_bins = max(1, min(num_edge_bins, bins // 2))
        bin_means = bam_array.mean(axis=0)
        avg_noise = (sum(bin_means[:num_edge_bins]) +
                     sum(bin_means[-num_edge_bins:])) / (2.0 * num_edge_bins)
        bam_array /= avg_noise
        ylabel = 'Average read coverage (fold change over edge bins)'
    else:
        bam_array /= bam.mapped_read_count() / 1e6
        ylabel = 'Average read coverage (per million mapped reads)'

    mean_profile = bam_array.mean(axis=0)

    # Generate a line plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.linspace(-bp_edge, bp_edge, bins)

    ax.plot(x, mean_profile, color='r', label='Mean')
    ax.axvline(0, linestyle=':', color='k')

    # Note the middle high point (TSS)
    tss_point_val = max(mean_profile)

    ax.set_xlabel('Distance from TSS (bp)')
    ax.set_ylabel(ylabel)
    ax.legend(loc='best')

    fig.savefig(vplot_file)
    # Release the figure; only the file name is returned to the caller.
    plt.close(fig)

    # Print a more complicated plot with lots of info

    # Find a safe upper percentile - we can't use X if the Xth percentile is 0
    # (np.percentile replaces mlab.prctile, which newer Matplotlib removed).
    upper_prct = 99
    if np.percentile(bam_array.ravel(), upper_prct) == 0.0:
        upper_prct = 100.0

    plt.rcParams['font.size'] = 8
    fig = metaseq.plotutils.imshow(bam_array,
                                   x=x,
                                   figsize=(5, 10),
                                   vmin=5, vmax=upper_prct, percentile=True,
                                   line_kwargs=dict(color='k', label='All'),
                                   fill_kwargs=dict(color='k', alpha=0.3),
                                   sort_by=bam_array.mean(axis=1))

    # And save the file
    fig.savefig(vplot_large_file)
    plt.close(fig)

    return vplot_file, vplot_large_file, tss_point_val
def filterPossibleMatches(bigWigFiles, bigWigList, metaprofile, opPrefix, currSignal, minWidth, maxWidth):
    """
    Filter matched-filter positives of one signal by their p-values.

    Reads ``<opPrefix>_<currSignal>_MFpositives.bed``, pairs up maxima of the
    smoothed signal of every region, looks each pairing up in the p-value
    file of the matching width (``<opPrefix>_pValue_<width>.bed``) and keeps
    the records whose p-value for ``currSignal`` is at most 0.001. The kept
    records are sorted with ``sortBed``, nested intervals are collapsed onto
    the enclosing one, and the result is written to
    ``<opPrefix>_<currSignal>_finalPositives.bed``.

    Args:
        bigWigFiles: mapping from signal name to bigWig file name
        bigWigList: mapping from signal name to opened genomic signal; the
            entry for ``currSignal`` is replaced by a freshly opened one
        metaprofile: metaprofile; only its length is used here
        opPrefix: prefix of all input and output files
        currSignal: name of the signal to process
        minWidth: smallest pairing width to consider (bp)
        maxWidth: largest pairing width to consider (bp)

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: if ``sortBed`` exits with an error.
    """
    import subprocess

    prefix = opPrefix + "_" + currSignal
    tentPath = prefix + "_tentPositives.bed"
    sortedPath = prefix + "_tentPositives2.bed"
    finalPath = prefix + "_finalPositives.bed"

    currRegions = pbt.BedTool(prefix + "_MFpositives.bed")
    signalList = getSignals(bigWigFiles, bigWigList, currRegions, currSignal)
    del bigWigList[currSignal]
    bigWigList[currSignal] = metaseq.genomic_signal(bigWigFiles[currSignal], "bigWig")
    print(len(signalList))

    binSize = 25
    op = open(tentPath, "w")
    ipFiles = OrderedDict()
    try:
        # One p-value file per candidate width
        newWidth = minWidth
        while newWidth <= maxWidth:
            ipFiles[newWidth] = open(opPrefix + "_pValue_" + str(newWidth) + ".bed", "r")
            newWidth += binSize

        for idx in range(len(signalList)):
            if idx % 1000 == 0:
                print(idx)
            region = currRegions[idx]
            smoothedSignal = smoothSignals(signalList[idx], win=10)
            maximaIndices = getMaxima(smoothedSignal)
            if len(maximaIndices) < 2:
                continue
            pairings = findPossiblePairings(maximaIndices, binSize, minWidth, maxWidth)
            for currPairing in pairings:
                newWidth = binSize * (currPairing[1] - currPairing[0])
                if newWidth < minWidth or newWidth > maxWidth:
                    continue
                flank = 5.0 / (len(metaprofile) - 10) * newWidth
                currStart = np.floor((region.start - 500) + (currPairing[0] * binSize) - flank) + 25
                currEnd = np.floor(region.start - 500 + currPairing[1] * binSize + flank)
                # Each p-value file is read forward only: iteration resumes
                # where the previous pairing of the same width stopped.
                # NOTE(review): the chromosome of the p-value record is not
                # compared with region.chrom -- confirm this is intended.
                for line in ipFiles[newWidth]:
                    if "#" in line:
                        headerFields = line.strip().split("\t")
                        currSignalIdx = headerFields.index(currSignal)
                        continue
                    fields = line.strip().split("\t")
                    start = int(fields[1])
                    chrom = fields[0]
                    end = int(fields[2])
                    if start > currStart + binSize:
                        break
                    if start <= currStart and end >= currEnd:
                        # accepting this enhancer if it passes cutoff
                        if float(fields[currSignalIdx]) <= 0.001:
                            op.write(chrom + "\t" + str(start) + "\t" + str(end) + "\t" +
                                     str(fields[currSignalIdx]) + "\t" + str(newWidth) + "\n")
                        # NOTE(review): placement of this break (after the
                        # cutoff test, not inside it) was reconstructed from
                        # unindented source -- verify against the original.
                        break
    finally:
        for handle in ipFiles.values():
            handle.close()
        op.close()

    # Sort the tentative positives. An argument list with an explicit stdout
    # avoids building a shell command from opPrefix / currSignal.
    with open(sortedPath, "w") as sortedFile:
        subprocess.check_call(["sortBed", "-i", tentPath], stdout=sortedFile)

    # Collapse nested intervals: while consecutive records on a chromosome
    # contain one another, only the enclosing record is kept.
    currStart = currEnd = currPvalue = currWidth = 0
    currChrom = ""
    with open(sortedPath, "r") as ip, open(finalPath, "w") as op:
        for line in ip:
            if not line.strip():
                continue
            fields = line.strip().split("\t")
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            pValue = float(fields[3])
            width = int(fields[-1])
            if chrom == currChrom and start <= currStart and end >= currEnd:
                # The new record encloses the current one: replace it.
                pass
            elif chrom == currChrom and currStart <= start and currEnd >= end:
                # The new record lies inside the current one: drop it.
                continue
            elif currChrom != "":
                op.write(currChrom + "\t" + str(currStart) + "\t" + str(currEnd) + "\t" +
                         str(currPvalue) + "\t" + str(currWidth) + "\n")
            currStart = start
            currEnd = end
            currPvalue = pValue
            currWidth = width
            currChrom = chrom
        # Flush the record still held when the input ends; without this the
        # last interval never reaches the output file.
        if currChrom != "":
            op.write(currChrom + "\t" + str(currStart) + "\t" + str(currEnd) + "\t" +
                     str(currPvalue) + "\t" + str(currWidth) + "\n")

    # Remove the intermediate files
    os.remove(sortedPath)
    os.remove(tentPath)
    return
import multiprocessing
from matplotlib import pyplot as plt

# Command-line arguments: TSS annotation (BED), BAM file, slop size, bin count
tss_annotation = str(sys.argv[1])
bam_file = str(sys.argv[2])
# NOTE(review): slop_region is read but the window below is hard-coded to
# 1000 bp -- confirm whether it should drive slop() and linspace().
slop_region = str(sys.argv[3])
# The bin count is used as a number below (np.linspace, array binning), so
# it is parsed as an integer rather than kept as a string.
bins = int(sys.argv[4])

# read in tss annotation file in bed format
tss_bed = pybedtools.BedTool(tss_annotation)

# extend by 1000 bp up/downstream
tss_slop = tss_bed.slop(b=1000, genome=annotationDir + 'chromLength')

bam_gsignal = metaseq.genomic_signal(bam_file, 'bam')

# the region +/-1000bp around each TSS will be split into a total of `bins`
# bins, change as needed
x = np.linspace(-1000, 1000, bins)

# most of the work happens here
test1_tss = test1_bam.array(tss_slop, bins=bins, processes=cpus)
test2_tss = test2_bam.array(tss_slop, bins=bins, processes=cpus)
bc1_tss = bc1_bam.array(tss_slop, bins=bins, processes=cpus)
bc2_tss = bc2_bam.array(tss_slop, bins=bins, processes=cpus)
mg_tss = mg_bam.array(tss_slop, bins=bins, processes=cpus)
tp_tss = tp_bam.array(tss_slop, bins=bins, processes=cpus)
def setup(self):
    """Load the ``gdc.bigbed`` example file as a bigBed signal into ``self.m``."""
    bigbed_path = metaseq.example_filename('gdc.bigbed')
    self.m = metaseq.genomic_signal(bigbed_path, kind='bigbed')
from gffutils.helpers import asinterval


def tss_generator():
    """Yield a 1 bp TSS interval for every mRNA feature in ``db``."""
    # The feature type can be changed here: CDS/gene/mRNA...
    for mrna in db.features_of_type('mRNA'):
        interval = asinterval(mrna)
        yield TSS(interval, upstream=1, downstream=0)


tsses = pybedtools.BedTool(tss_generator()).saveas('tsses.gtf')
# Open question: which genome file should be used here
tsses_1kb = tsses.slop(b=1000, genome='hg19', output='tsses-1kb.gtf')

import metaseq
# Input: BAM file produced by aligning the ChIP-seq reads
ip_signal = metaseq.genomic_signal('WT_H2A_Z.sort.bam', 'bam')

import multiprocessing
processes = multiprocessing.cpu_count()

# Signal over the 1 kb TSS windows, binned into 100 bins per window.
# Multiple CPUs are used; this dramatically speeds up run time.
ip_array = ip_signal.array(tsses_1kb, bins=100, processes=processes)
bedtool = pybedtools.BedTool(self.bed) features = bedtool.intersect([feature], u=True) track = Track(features) ax.add_collection(track) # ax.axis('tight') return feature if __name__ == "__main__": import metaseq import gffutils import pybedtools G = gffutils.FeatureDB(metaseq.example_filename("Homo_sapiens.GRCh37.66.cleaned.gtf.db")) ip = metaseq.genomic_signal(metaseq.example_filename("wgEncodeUwTfbsK562CtcfStdAlnRep1.bam"), "bam") inp = metaseq.genomic_signal(metaseq.example_filename("wgEncodeUwTfbsK562InputStdAlnRep1.bam"), "bam") peaks = pybedtools.BedTool(metaseq.example_filename("wgEncodeUwTfbsK562CtcfStdPkRep1.narrowPeak.gz")) plotting_kwargs = [dict(color="r", label="IP"), dict(color="k", linestyle=":", label="input")] local_coverage_kwargs = dict(fragment_size=200) b = SignalMiniBrowser([ip, inp], plotting_kwargs=plotting_kwargs, local_coverage_kwargs=local_coverage_kwargs) g = GeneModelMiniBrowser([ip, inp], G, plotting_kwargs=plotting_kwargs, local_coverage_kwargs=local_coverage_kwargs) p = PeakMiniBrowser([ip, inp], peaks, plotting_kwargs=plotting_kwargs, local_coverage_kwargs=local_coverage_kwargs) feature = peaks[3]
def get_bigwig(name):
    """Return the bigWig signal for ``name``, opening and caching it on first use."""
    cached = bigwigs.get(name, None)
    if cached is not None:
        return cached
    # Not cached yet: open the file and remember it in the module-level dict
    opened = metaseq.genomic_signal(name, 'bigwig')
    bigwigs[name] = opened
    return opened
def make_tss_plot(bam_file, tss, prefix, chromsizes, read_len, bins=400, bp_edge=2000, processes=8, greenleaf_norm=True):
    '''
    Plot read coverage around TSSs and report the peak of the mean profile.

    Produces 2 plots. One is the aggregation plot alone, while the other
    also shows the signal at each TSS ordered by strength. The mean profile
    is additionally written to a text file.

    :param bam_file: BAM file to read the signal from
    :param tss: TSS file, loaded with pybedtools.BedTool
    :param prefix: prefix for the output files
    :param chromsizes: passed to BedTool.slop as ``g`` when extending the TSSs
    :param read_len: read length; reads are shifted by half of it
    :param bins: number of bins per extended TSS window
    :param bp_edge: number of bp added on each side of a TSS
    :param processes: number of processes used to build the array
    :param greenleaf_norm: if true, divide by the mean signal of the edge
        bins; otherwise scale by mapped reads in millions
    :returns: (tss_plot_file, tss_plot_large_file, tss_point_val), where
        tss_point_val is the maximum of the mean profile
    '''
    logging.info('Generating tss plot...')
    tss_plot_file = '{0}_tss-enrich.png'.format(prefix)
    tss_plot_data_file = '{0}_tss-enrich.txt'.format(prefix)
    tss_plot_large_file = '{0}_large_tss-enrich.png'.format(prefix)

    # Load the TSS file
    tss = pybedtools.BedTool(tss)
    tss_ext = tss.slop(b=bp_edge, g=chromsizes)

    # Load the bam file
    bam = metaseq.genomic_signal(bam_file, 'bam')
    # Shift to center the read on the cut site. Floor division keeps the
    # shift an integer regardless of the interpreter's division semantics.
    bam_array = bam.array(tss_ext, bins=bins,
                          shift_width=-read_len // 2,
                          processes=processes, stranded=True)

    # Normalization (Greenleaf style): Find the avg height
    # at the end bins and take fold change over that
    if greenleaf_norm:
        # Use enough bins to cover 100 bp on either end. The bin width is
        # computed in floating point and the count is clamped to at least
        # one bin, so wide bins cannot cause a division by zero below.
        bin_width = 2.0 * bp_edge / bins
        num_edge_bins = int(100 / bin_width) if bin_width > 0 else 1
        num_edge_bins = max(1, min(num_edge_bins, bins // 2))
        bin_means = bam_array.mean(axis=0)
        avg_noise = (sum(bin_means[:num_edge_bins]) +
                     sum(bin_means[-num_edge_bins:])) / (2.0 * num_edge_bins)
        bam_array /= avg_noise
        ylabel = 'Average read coverage (fold change over edge bins)'
    else:
        bam_array /= bam.mapped_read_count() / 1e6
        ylabel = 'Average read coverage (per million mapped reads)'

    mean_profile = bam_array.mean(axis=0)

    # Generate a line plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.linspace(-bp_edge, bp_edge, bins)

    ax.plot(x, mean_profile, color='r', label='Mean')
    ax.axvline(0, linestyle=':', color='k')

    # Note the middle high point (TSS)
    tss_point_val = max(mean_profile)

    ax.set_xlabel('Distance from TSS (bp)')
    ax.set_ylabel(ylabel)
    ax.legend(loc='best')

    fig.savefig(tss_plot_file)
    # Release the figure; only the file name is returned to the caller.
    plt.close(fig)

    # Print a more complicated plot with lots of info

    # write the plot data; numpy object
    np.savetxt(tss_plot_data_file, mean_profile, delimiter=",")

    # Find a safe upper percentile - we can't use X if the Xth percentile is 0
    # (np.percentile replaces mlab.prctile, which newer Matplotlib removed).
    upper_prct = 99
    if np.percentile(bam_array.ravel(), upper_prct) == 0.0:
        upper_prct = 100.0

    plt.rcParams['font.size'] = 8
    fig = metaseq.plotutils.imshow(bam_array,
                                   x=x,
                                   figsize=(5, 10),
                                   vmin=5, vmax=upper_prct, percentile=True,
                                   line_kwargs=dict(color='k', label='All'),
                                   fill_kwargs=dict(color='k', alpha=0.3),
                                   sort_by=bam_array.mean(axis=1))

    # And save the file
    fig.savefig(tss_plot_large_file)
    plt.close(fig)

    return tss_plot_file, tss_plot_large_file, tss_point_val
#~ tsses_2kb = tsses.slop(b=2000, genome='mm9', output=join(ref_dir,'mm9_tsses-2kb.gtf')) #~ tsses_3kb = tsses.slop(b=3000, genome='mm9', output=join(ref_dir,'mm9_tsses-3kb.gtf')) #~ tsses_5kb = tsses.slop(b=5000, genome='mm9', output=join(ref_dir,'mm9_tsses-5kb.gtf')) tsses_1kb = pb.BedTool(join(ref_dir,'mm9_tsses-1kb.gtf')) # --------------------------------------------------------- #~ Metaseq # # --------------------------------------------------------- #~ metaseq works with the concepts of signal and windows. In this example, the signal is ChIP data, and the windows are TSS +/- 1kb. #~ The first step is to create "genomic signal" objects out of the data. Since our example files are BAM files, we specify the kind='bam', but if you have your own data in a different format (bigWig, bigBed, BED, GFF, GTF, VCF) then specify that format instead (see metaseq.genomic_signal()). #~ tlx_bdg = rel_path+tlx_lst["tracks"][0] #~ rag_bdg = rel_path+rag_lst["tracks"][0] tlx_bw = rel_path+tlx_lst["tracks"][2] rag_bw = rel_path+rag_lst["tracks"][2] print tlx_bw, rag_bw tlx_signal = metaseq.genomic_signal(tlx_bw,'bigwig') rag_signal = metaseq.genomic_signal(rag_bw,'bigwig')
""" Many of these tests use the minimal test/data/gdc.bed file which has just enough complexity to be useful in testing corner cases. When reading through the tests, it's useful to have that file open to understand what's happening. """ import os import metaseq import multiprocessing from metaseq.array_helpers import ArgumentError import numpy as np from nose.tools import assert_raises from nose.plugins.skip import SkipTest gs = {} for kind in ['bed', 'bam', 'bigbed', 'bigwig']: gs[kind] = metaseq.genomic_signal(metaseq.example_filename('gdc.%s' % kind), kind) PROCESSES = int(os.environ.get("METASEQ_PROCESSES", multiprocessing.cpu_count())) def test_tointerval(): assert metaseq.helpers.tointerval("chr2L:1-10[-]").strand == '-' assert metaseq.helpers.tointerval("chr2L:1-10[+]").strand == '+' assert metaseq.helpers.tointerval("chr2L:1-10").strand == '.' def test_local_count(): def check(kind, coord, expected, stranded): try: result = gs[kind].local_count(coord, stranded=stranded) except NotImplementedError: raise SkipTest("Incompatible bx-python version for bigBed")
from pybedtools.featurefuncs import TSS
from gffutils.helpers import asinterval


def tss_generator():
    """Yield a 1 bp TSS interval for every mRNA feature in ``db``."""
    # The feature type can be changed here: CDS/gene/mRNA...
    for mrna in db.features_of_type('mRNA'):
        interval = asinterval(mrna)
        yield TSS(interval, upstream=1, downstream=0)


tsses = pybedtools.BedTool(tss_generator()).saveas('tsses.gtf')
# Open question: which genome file should be used here
tsses_1kb = tsses.slop(b=1000, genome='hg19', output='tsses-1kb.gtf')

import metaseq
# Input: BAM file of the aligned ChIP-seq reads from the mutant
ip_signal = metaseq.genomic_signal('arp6_H2A_Z.sort.bam', 'bam')
# Input: BAM file of the wild type
input_signal = metaseq.genomic_signal('WT_H2A_Z.sort.bam', 'bam')

import multiprocessing
processes = multiprocessing.cpu_count()

# Signal over the 1 kb TSS windows, binned into 100 bins per window.
# Multiple CPUs are used; this dramatically speeds up run time.
ip_array = ip_signal.array(tsses_1kb, bins=100, processes=processes)
def main(histoneFile, MPRApeakFile, nonrandomFile, opPrefix, pp, otherMarkFile=None, plotChar=True):
    """
    The main function of the program. Calculates the pattern of troughs
    between STARR-seq peaks.

    Args:
        histoneFile: path to the bigWig file of histone signal
        MPRApeakFile: path to the peaks from the massively parallel reporter
            assay for regulatory regions
        nonrandomFile: path to the regions on which negatives should not
            intersect
        opPrefix: output prefix for all output files
        pp: plotting handle forwarded unchanged to plotCharacteristics,
            calculateMetaProfile and calculateDependentProfile
            (NOTE(review): presumably a PdfPages object -- confirm)
        otherMarkFile: optional additional mark file; it is passed to
            getDoublePeakRegions and, when not None, a dependent profile is
            computed from it as well
        plotChar: when true, the shape characteristics are plotted

    Returns:
        None. Results are written to ``<opPrefix>_doublePeakStats.txt``,
        ``<opPrefix>_doublePeak.bed``, ``<opPrefix>_metaProfile.dat`` and
        ``<opPrefix>_asymProfile.dat``.

    Raises:
        SystemExit: with status 1 when an input cannot be opened, an output
            cannot be created, or no double peak region was found.
    """
    # Checking input files. Each failure is reported on stderr and ends the
    # program with a non-zero status so shell callers can detect it.
    try:
        MPRApeaks = pbt.BedTool(MPRApeakFile)
    except Exception:
        sys.stderr.write("ERROR: Cannot open MPRA peak file " + MPRApeakFile + "\n")
        sys.exit(1)
    try:
        histoneSignal = metaseq.genomic_signal(histoneFile, "bigWig")
    except Exception:
        sys.stderr.write("ERROR: Cannot open histone signal file " + histoneFile + "\n")
        sys.exit(1)
    try:
        nrFile = open(nonrandomFile, "r")
    except Exception:
        sys.stderr.write("ERROR: " + nonrandomFile + " does not open\n")
        sys.exit(1)

    # Checking output files
    try:
        op = open(opPrefix + "_doublePeakStats.txt", "w")
        op2 = open(opPrefix + "_doublePeak.bed", "w")
    except Exception:
        nrFile.close()
        sys.stderr.write("ERROR: Cannot create output files\n")
        sys.exit(1)

    # These lists are filled in place by getDoublePeakRegions.
    metaIntersectingIntervals = []
    metaMinima = []
    metaMaxima1 = []
    metaMaxima2 = []
    doublePeakRegions = []

    try:
        # Read nonrandom intervals
        # NOTE(review): `nonrandom` is built and sorted but not used again
        # in this function -- confirm whether it is still needed.
        try:
            nonrandom = bed.Bed()
            for line in nrFile:
                nonrandom.features.append(extractFeature(line.rstrip()))
            nonrandom.sortByChromosomeAndStartAndEnd()
        finally:
            nrFile.close()

        # Getting total number of 25 bp intervals that can be negative
        numNegativesTotal = calculateNumberNegatives(nonrandomFile, histoneFile, opPrefix)
        print("Number of negatives are %s" % (numNegativesTotal,))

        op.write(
            "Maxima1\tMinima\tMaxima2\tdistanceMaxima1\tdistanceMaxima2\tratioMaxima\tratioMaximaMinima\tdistancePeakMinima\n"
        )
        filteredPositives = getDoublePeakRegions(MPRApeaks, histoneSignal, metaMaxima1, metaMaxima2,
                                                 metaMinima, op, op2, metaIntersectingIntervals,
                                                 doublePeakRegions, otherMarkFile)
    finally:
        # Close the outputs even if one of the steps above fails.
        op.close()
        op2.close()

    # Getting shape characteristics
    allShapeCharacteristics = []
    readShapeCharacteristics(allShapeCharacteristics, opPrefix + "_doublePeakStats.txt")
    if plotChar:
        plotCharacteristics(allShapeCharacteristics, pp)

    # Distances maxima1 -> minima and minima -> maxima2 for every region
    length1 = []
    length2 = []
    for maxima1, minima, maxima2 in zip(metaMaxima1, metaMinima, metaMaxima2):
        length1.append(minima - maxima1)
        length2.append(maxima2 - minima)
        if length1[-1] <= 0 or length2[-1] <= 0:
            # Report regions whose extrema are not strictly ordered
            print("%s %s %s" % (maxima1, minima, maxima2))

    # Without any region the bin count below is undefined (max() of an empty
    # list), so stop with a clear message instead of a ValueError.
    if not length1:
        sys.stderr.write("ERROR: No double peak regions found; cannot calculate metaprofile\n")
        sys.exit(1)

    # Calculating metaprofile
    try:
        op = open(opPrefix + "_metaProfile.dat", "w")
        op2 = open(opPrefix + "_asymProfile.dat", "w")
    except Exception:
        sys.stderr.write("ERROR: Could not open " + opPrefix + "_metaProfile.dat\n")
        sys.exit(1)
    try:
        bins = 2 * max([max(length1), max(length2)])
        calculateMetaProfile(filteredPositives, metaIntersectingIntervals, bins,
                             metaMaxima1, metaMaxima2, metaMinima, pp, op, op2)
    finally:
        op.close()
        op2.close()

    if otherMarkFile is not None:
        calculateDependentProfile(otherMarkFile, filteredPositives, metaIntersectingIntervals, bins,
                                  metaMaxima1, metaMaxima2, metaMinima, pp, opPrefix)
    return
if score != 0: fout.write('\t'.join([ feature.chrom, str(start), str(stop), str(score)]) + '\n') start = start + binsize this_batch = [] i = 0 fout.close() if __name__ == "__main__": import metaseq ip_bam = metaseq.genomic_signal( metaseq.example_filename( 'wgEncodeUwTfbsK562CtcfStdAlnRep1.bam'), 'bam') control_bam = metaseq.genomic_signal( metaseq.example_filename( 'wgEncodeUwTfbsK562InputStdAlnRep1.bam'), 'bam') BINSIZE = 10 WINDOWSIZE = 10000 BINS = WINDOWSIZE / BINSIZE features = pybedtools.BedTool()\ .window_maker(genome='hg19', w=WINDOWSIZE)\ .filter(lambda x: x.chrom == 'chr19') result = compare( signal1=ip_bam, signal2=control_bam,