Exemplo n.º 1
0
    def __init__(self, ip_bam, control_bam, dbfn=None):
        """
        Build a :class:`Chipseq` object from an IP/control pair of BAM files.

        :param ip_bam: filename of BAM file for ChIP data
        :param control_bam: filename of BAM file for control data
        :param dbfn: filename of gffutils database; when falsy no database
            is opened and ``self.db`` stays ``None``
        """
        # Wrap both BAM files as metaseq signal objects (IP first, then control).
        self.ip = metaseq.genomic_signal(ip_bam, kind='bam')
        self.control = metaseq.genomic_signal(control_bam, kind='bam')

        # The annotation database is only opened when a filename was supplied.
        self.dbfn = dbfn
        self.db = gffutils.FeatureDB(dbfn) if self.dbfn else None

        # Nothing in the constructor computes the arrays; they start as None.
        self.ip_array = None
        self.control_array = None

        # Stored keyword arguments; presumably consumed by plotting code
        # elsewhere in the class.
        self._strip_kwargs = {
            'color': '.5',
            'markeredgewidth': 0,
            'marker': 'o',
            'linestyle': 'None',
            'picker': 5,
        }
        self.browser_plotting_kwargs = [
            {'color': 'r', 'label': 'IP'},
            {'color': 'k', 'linestyle': ':', 'label': 'control'},
        ]
Exemplo n.º 2
0
    def __init__(self, ip_bam, control_bam, dbfn=None):
        """
        Initialise a :class:`Chipseq` object.

        :param ip_bam: filename of BAM file for ChIP data
        :param control_bam: filename of BAM file for control data
        :param dbfn: filename of gffutils database (optional; the database
            is opened only when this is truthy)
        """
        # Signal wrappers for the two BAM inputs.
        self.ip = metaseq.genomic_signal(ip_bam, kind='bam')
        self.control = metaseq.genomic_signal(control_bam, kind='bam')

        # Remember the database filename and open it only if one was given.
        self.dbfn = dbfn
        if self.dbfn:
            self.db = gffutils.FeatureDB(dbfn)
        else:
            self.db = None

        # Array attributes are placeholders at construction time.
        self.ip_array = self.control_array = None

        # Keyword-argument presets kept on the instance for later use.
        strip_kwargs = {}
        strip_kwargs['color'] = '.5'
        strip_kwargs['markeredgewidth'] = 0
        strip_kwargs['marker'] = 'o'
        strip_kwargs['linestyle'] = 'None'
        strip_kwargs['picker'] = 5
        self._strip_kwargs = strip_kwargs

        ip_style = dict(color='r', label='IP')
        control_style = dict(color='k', linestyle=':', label='control')
        self.browser_plotting_kwargs = [ip_style, control_style]
Exemplo n.º 3
0
def metaseq_heatmap(conditions, bed, counts, window, controls, threads, name):
    """Plot a per-sample heatmap and a combined average profile.

    For every BAM file in ``conditions`` the signal over the intervals in
    ``bed`` is binned into 100 bins and normalised.  One heatmap PNG is
    written per sample (``<label>_heatmap_<name>.png``) and a single average
    profile of all samples is written to ``Average_profile_<name>.png``.

    :param conditions: mapping of BAM filename -> label used in legends and
        output filenames
    :param bed: intervals passed to ``genomic_signal.array``
    :param counts: optional mapping of BAM filename -> argument for
        ``read_counts``; when truthy, arrays are scaled by
        ``1000 / read_counts(...)`` instead of by mapped reads per million
    :param window: total window width; only used to label the x axis
        (``-window/2 .. +window/2``)
    :param controls: optional mapping of BAM filename -> control BAM
        filename; when truthy the normalised control array is subtracted
        before the heatmap is drawn
    :param threads: number of processes for array creation (cast to int)
    :param name: suffix used in the output filenames
    """
    # TODO: `window` only scales the x axis; the bin count is fixed at 100.
    threads = int(threads)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    w_size = float(window) / 2
    x = np.linspace(-w_size, w_size, 100)

    for key in sorted(conditions):
        ip_signal = metaseq.genomic_signal(key, 'bam')
        # Create arrays in parallel
        ip_array = ip_signal.array(bed, bins=100, processes=threads)

        # Normalise either to the supplied count or to library size.
        if counts:
            scale = 1000 / float(read_counts(counts[key]))
            ip_array *= scale
        else:
            ip_array /= ip_signal.mapped_read_count() / 1e6

        if controls:
            input_signal = metaseq.genomic_signal(controls[key], 'bam')
            input_array = input_signal.array(bed, bins=100, processes=threads)
            if counts:
                # The control is scaled by the same factor as the IP sample.
                input_array *= scale
            else:
                # The mapped read count lives on the signal object, not on
                # the NumPy array it produced.
                input_array /= input_signal.mapped_read_count() / 1e6

        # The average profile uses the IP signal before control subtraction.
        ax.plot(x, ip_array.mean(axis=0), label=conditions[key])

        if controls:
            # NOTE(review): marked as untested by the original author.
            ip_array = ip_array - input_array

        fig2 = metaseq.plotutils.imshow(
            ip_array, x=x, figsize=(7, 10),
            vmin=5, vmax=99, percentile=True,
            line_kwargs=dict(color='k', label='All'),
            fill_kwargs=dict(color='k', alpha=0.3),
            sort_by=ip_array.mean(axis=1))

        fig2.line_axes.set_ylabel('Average enrichment')
        fig2.line_axes.set_xlabel('Distance from Center (bp)')
        fig2.array_axes.set_ylabel('Peaks')
        # Mark the window centre on both panels.
        fig2.array_axes.axvline(0, linestyle=':', color='k')
        fig2.line_axes.axvline(0, linestyle=':', color='k')
        fig2.savefig('{}_heatmap_{}.png'.format(conditions[key], name))
        plt.close(fig2)

    ax.axvline(0, linestyle=':', color='k')
    ax.set_xlabel('Distance from Center (bp)')
    ax.set_ylabel('Average read coverage (per million mapped reads)')
    ax.legend(loc=1, fancybox=True, framealpha=0.5, prop={'size': 7})
    fig.savefig('Average_profile_{}.png'.format(name))
    plt.close(fig)
Exemplo n.º 4
0
 def setup(self):
     """Load the example ``gdc.bam`` signal and print a timestamped banner."""
     bam_path = metaseq.example_filename('gdc.bam')
     self.m = metaseq.genomic_signal(bam_path, kind='bam')

     # Banner is "[<current time>] <class name>" followed by a newline;
     # print adds a second newline of its own.
     stamp = datetime.datetime.now()
     line = '[%s] %s\n' % (stamp, self.__class__.__name__)
     print(line)
     sys.stdout.flush()
Exemplo n.º 5
0
def calculateDependentProfile(otherMarkFile, filteredPositives,
                              metaIntersectingIntervals, bins, metaMaxima1,
                              metaMaxima2, metaMinima, pp, opPrefix):
    """Write meta and asymmetry profiles for every mark in a list file.

    ``otherMarkFile`` is a tab-separated file whose first column is a mark
    name and whose second column is a bigWig filename.  For each mark the
    corresponding intervals are computed with ``getCorrespondingIntervals``
    and handed to ``calculateMetaProfile`` as ``dependent``; output goes to
    ``<opPrefix>_<mark>_metaProfile.dat`` and
    ``<opPrefix>_<mark>_asymProfile.dat``.

    Blank lines in the list file are skipped.  All files are closed even if
    a helper raises.  Returns ``None``.
    """
    markFiles = OrderedDict()
    corrIntervals = OrderedDict()
    with open(otherMarkFile, "r") as ip:
        for line in ip:
            stripped = line.strip()
            if not stripped:
                # A trailing empty line would otherwise fail on fields[1].
                continue
            fields = stripped.split("\t")
            markFiles[fields[0]] = metaseq.genomic_signal(fields[1], "bigWig")
            corrIntervals[fields[0]] = getCorrespondingIntervals(
                markFiles[fields[0]], filteredPositives)

    for currMark in markFiles:
        metaPath = opPrefix + "_" + currMark + "_metaProfile.dat"
        asymPath = opPrefix + "_" + currMark + "_asymProfile.dat"
        with open(metaPath, "w") as op, open(asymPath, "w") as op2:
            calculateMetaProfile(filteredPositives,
                                 metaIntersectingIntervals,
                                 bins,
                                 metaMaxima1,
                                 metaMaxima2,
                                 metaMinima,
                                 pp,
                                 op,
                                 op2,
                                 dependent=corrIntervals[currMark])

    return
Exemplo n.º 6
0
def getSignals(bigWigFiles, bigWigList, currRegions, currSignal):
    """Return the binned bigWig signal for each region padded by 500 bp.

    Each region's start is clamped to at least 500 (the feature is modified
    in place), the region is widened by 500 bp on both sides, and the signal
    ``bigWigList[currSignal]`` is sampled in 25-bp bins.

    :param bigWigFiles: mapping of signal name -> bigWig filename, used to
        re-open the handle
    :param bigWigList: mapping of signal name -> genomic signal object; the
        entry for ``currSignal`` is replaced every 1000 regions
    :param currRegions: iterable of features with ``chrom``/``start``/``end``
    :param currSignal: key selecting which signal to read
    :return: list with one signal row per region
    """
    binSize = 25
    flank = 500
    reopenEvery = 1000
    signalList = []
    for idx, currFeature in enumerate(currRegions, 1):
        if currFeature.start < flank:
            currFeature.start = flank
        newFeature = pbt.BedTool(currFeature.chrom + " " +
                                 str(currFeature.start - flank) + " " +
                                 str(currFeature.end + flank),
                                 from_string=True)[0]
        numBP = newFeature.end - newFeature.start
        # Floor division keeps the bin count an integer under true division.
        numBins = numBP // binSize
        if numBP % binSize != 0:
            numBins += 1
            # NOTE(review): this rounds the end *coordinate* up to a multiple
            # of binSize rather than the region length - confirm intended.
            newFeature.end += binSize - (newFeature.end % binSize)
        signal = bigWigList[currSignal].array([newFeature], bins=numBins)
        signalList.append(signal[0])
        if idx % reopenEvery == 0:
            # Periodically replace the signal object with a fresh one
            # (presumably to release resources held by it - TODO confirm).
            del bigWigList[currSignal]
            bigWigList[currSignal] = metaseq.genomic_signal(
                bigWigFiles[currSignal], "bigWig")

    return signalList
Exemplo n.º 7
0
 def setup(self):
     """Create the example BAM signal, then announce the running class."""
     example_bam = metaseq.example_filename('gdc.bam')
     self.m = metaseq.genomic_signal(example_bam, kind='bam')

     # "[timestamp] ClassName" plus an explicit newline before print's own.
     now = datetime.datetime.now()
     cls_name = self.__class__.__name__
     line = '[%s] %s\n' % (now, cls_name)
     print(line)
     sys.stdout.flush()
Exemplo n.º 8
0
def draw_snapshot(sites, bamfiles, color="black", min_y=30, Nsite=5):
    """Draw a grid of coverage bar plots: one row per BAM, one column per site.

    :param sites: indexable collection of intervals; only the first
        ``min(Nsite, len(sites))`` are drawn.  ``str(site)`` must be a
        tab-separated ``chrom<TAB>start<TAB>end...`` line.
    :param bamfiles: list of BAM filenames, one grid row each
    :param color: fill and edge colour of the bars
    :param min_y: accepted for backward compatibility; not used by the body
    :param Nsite: maximum number of sites (columns) to draw
    :return: the matplotlib figure
    """
    import metaseq
    import matplotlib.pyplot as plt
    from mpl_toolkits.axes_grid1 import Grid

    Nsites_use = min(Nsite, len(sites))

    # Take read counts from samples: ip_arrays[row][column].
    ip_arrays = []
    for bamfile in bamfiles:
        ip_signal = metaseq.genomic_signal(bamfile, 'bam')
        ip_arrays.append(
            [ip_signal.local_coverage(sites[k]) for k in range(Nsites_use)])

    # Draw figure
    fig = plt.figure(figsize=(100, 20))

    grid = Grid(fig, 142, nrows_ncols=(len(bamfiles), Nsites_use),
                axes_pad=0.05, direction="row",
                add_all=True, share_all=False, label_mode="all")

    # Largest y limit seen in each row, so a row can share one scale.
    ymaxs = [None] * len(bamfiles)

    for i in range(len(bamfiles)):
        for k in range(Nsites_use):
            ax = grid[i * Nsites_use + k]
            ax.bar(ip_arrays[i][k][0], ip_arrays[i][k][1],
                   color=color, edgecolor=color)
            ymax = ax.axis()[3]
            # Comparing None with a number raises TypeError on Python 3,
            # so the first value is taken explicitly.
            if ymaxs[i] is None or ymax > ymaxs[i]:
                ymaxs[i] = ymax

    for i in range(len(bamfiles)):
        for k in range(Nsites_use):
            ax = grid[i * Nsites_use + k]
            xmin, xmax, ymin, ymax = ax.axis()
            ax.axis([xmin, xmax, ymin, ymaxs[i]])
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)
            # Label: BAM basename up to the first dot, plus the y range.
            ax.annotate(
                bamfiles[i].split('/')[-1].split('.')[0]
                + " [0-" + str(ymaxs[i]) + "]",
                xy=(0, 1), xytext=(10, -10),
                va='top', xycoords='axes fraction',
                textcoords='offset points',
                fontsize=25)
            if i == 0:
                # Only the top row carries the genomic location as a title.
                fields = str(sites[k]).split('\t')
                chrom = fields[0]
                start = fields[1]
                end = fields[2].split('\n')[0]
                ax.set_title(
                    "Location: " + chrom + " " + start + "-" + end)

    return fig
Exemplo n.º 9
0
def readBigWigList(ip):
    """Parse tab-separated ``name<TAB>filename`` lines into two ordered maps.

    :param ip: iterable of text lines (e.g. an open file)
    :return: ``(bigWigList, bigWigFiles)`` where the first maps each name to
        a metaseq bigWig signal and the second maps it to the filename
    """
    bigWigList, bigWigFiles = OrderedDict(), OrderedDict()
    for row in ip:
        # The name comes from the raw line, the path from the stripped one.
        label = row.split("\t")[0]
        path = row.strip().split("\t")[1]
        bigWigList[label] = metaseq.genomic_signal(path, "bigWig")
        bigWigFiles[label] = path
    return bigWigList, bigWigFiles
Exemplo n.º 10
0
def plot_np(bigwig_file, bigbed_file, sizes_file, output_file):
    """Plot read coverage and positive-window calls of one NP per chromosome.

    :param bigwig_file: Path to bigwig file for NP read coverage
    :param bigbed_file: Path to bigbed file for NP positive windows
    :param sizes_file: Path to chromosome sizes file
    :param output_file: Path to save image to
    """
    row_names, row_sizes = parse_sizes_file(sizes_file)

    # Scale each row's fractions by 30 and round to whole axis units.
    axis_sizes = []
    for pct_row in get_row_pct(row_sizes):
        axis_sizes.append([int(round(pct * 30)) for pct in pct_row])

    # Coverage is opened first, then the window calls.
    coverage_signal = genomic_signal(bigwig_file, 'bigwig')
    window_signal = genomic_signal(bigbed_file, 'bed')

    plot_genome(axis_sizes, row_names, row_sizes, coverage_signal,
                window_signal, output_file)
Exemplo n.º 11
0
def readBigWigList(ip):
    """Build name->signal and name->filename maps from a two-column listing.

    Every line of ``ip`` is ``signalType<TAB>filename``.  Returns the tuple
    ``(bigWigList, bigWigFiles)``, both ordered as the lines were read.
    """
    bigWigList = OrderedDict()
    bigWigFiles = OrderedDict()
    for entry in ip:
        # First column is read from the unstripped line; second from the
        # stripped line so the trailing newline is not part of the path.
        signalType = entry.split("\t")[0]
        filename = entry.strip().split("\t")[1]
        bigWigList[signalType] = metaseq.genomic_signal(filename, "bigWig")
        bigWigFiles[signalType] = filename
    return bigWigList, bigWigFiles
Exemplo n.º 12
0
def coverage(Bedfile, Bamfiles, Nproc, bins=None, fragSize=None):
    """Stack the signal arrays of several BAM files over the same intervals.

    :param Bedfile: intervals passed to each signal's ``array`` method
    :param Bamfiles: iterable of BAM filenames
    :param Nproc: number of processes for array creation
    :param bins: forwarded as ``bins``
    :param fragSize: forwarded as ``fragment_size``
    :return: ``numpy.asarray`` of the per-BAM arrays, in input order
    """
    import metaseq
    import numpy as np

    array_kwargs = dict(bins=bins, fragment_size=fragSize, processes=Nproc)

    per_bam = []
    for bam_path in Bamfiles:
        print("Calculating coverages from : " + bam_path)
        signal = metaseq.genomic_signal(bam_path, 'bam')
        per_bam.append(signal.array(Bedfile, **array_kwargs))

    return np.asarray(per_bam)
Exemplo n.º 13
0
def getRelevantSignals(bigWigFiles, bigWigList, currRegions, currSignal):
    """Return the 25-bp binned signal of every region, plus the regions.

    After all regions are read, ``bigWigList[currSignal]`` is replaced with a
    freshly opened signal for ``bigWigFiles[currSignal]``.

    :return: ``(signalList, currRegions)`` with one signal row per region
    """
    binSize = 25
    signalList = []

    for currFeature in currRegions:
        numBP = currFeature.end - currFeature.start
        # Round the bin count up; floor division keeps it an integer even
        # under true division.
        numBins = numBP // binSize
        if numBP % binSize != 0:
            numBins += 1
        signal = bigWigList[currSignal].array([currFeature], bins=numBins)
        signalList.append(signal[0])

    del bigWigList[currSignal]
    bigWigList[currSignal] = metaseq.genomic_signal(
        bigWigFiles[currSignal], "bigWig")
    return signalList, currRegions
Exemplo n.º 14
0
def getRelevantSignals(bigWigFiles, bigWigList, currRegions, currSignal):
    """Collect the binned signal (25 bp per bin) for each region.

    :param bigWigFiles: mapping of signal name -> bigWig filename
    :param bigWigList: mapping of signal name -> signal object; the entry
        for ``currSignal`` is re-created before returning
    :param currRegions: iterable of features with ``start`` and ``end``
    :param currSignal: key of the signal to read
    :return: ``(signalList, currRegions)``
    """
    binSize = 25
    signalList = []

    for currFeature in currRegions:
        numBP = currFeature.end - currFeature.start
        # Explicit floor division: `bins` must stay an int regardless of
        # whether `/` means integer or true division.
        numBins = numBP // binSize
        if numBP % binSize != 0:
            # A partial trailing bin still counts as a bin.
            numBins += 1
        signal = bigWigList[currSignal].array([currFeature], bins=numBins)
        signalList.append(signal[0])

    # Swap in a freshly opened signal object for this bigWig.
    del bigWigList[currSignal]
    bigWigList[currSignal] = metaseq.genomic_signal(bigWigFiles[currSignal],
                                                    "bigWig")
    return signalList, currRegions
Exemplo n.º 15
0
def calculateDependentProfile(otherMarkFile, filteredPositives, metaIntersectingIntervals, bins, metaMaxima1, metaMaxima2, metaMinima, pp, opPrefix):
    """Compute dependent meta/asymmetry profiles for each listed mark.

    ``otherMarkFile`` holds one ``markName<TAB>bigWigFilename`` pair per
    line.  For each mark, ``calculateMetaProfile`` is called with the
    intervals returned by ``getCorrespondingIntervals`` as ``dependent`` and
    writes into ``<opPrefix>_<mark>_metaProfile.dat`` and
    ``<opPrefix>_<mark>_asymProfile.dat``.

    Blank lines are ignored, and every file is closed even when a helper
    raises.  Returns ``None``.
    """
    markFiles = OrderedDict()
    corrIntervals = OrderedDict()
    with open(otherMarkFile, "r") as ip:
        for line in ip:
            fields = line.strip().split("\t")
            if fields == [""]:
                # Empty line: nothing to register.
                continue
            mark, bigWigPath = fields[0], fields[1]
            markFiles[mark] = metaseq.genomic_signal(bigWigPath, "bigWig")
            corrIntervals[mark] = getCorrespondingIntervals(markFiles[mark], filteredPositives)

    for currMark in markFiles:
        outBase = opPrefix + "_" + currMark
        with open(outBase + "_metaProfile.dat", "w") as op, \
                open(outBase + "_asymProfile.dat", "w") as op2:
            calculateMetaProfile(filteredPositives, metaIntersectingIntervals,
                                 bins, metaMaxima1, metaMaxima2, metaMinima,
                                 pp, op, op2,
                                 dependent=corrIntervals[currMark])

    return
Exemplo n.º 16
0
def run_metaseq():
    # set up a BamSignal object
    m = metaseq.genomic_signal(metaseq.example_filename("wgEncodeUwTfbsK562CtcfStdAlnRep1.bam"), kind="bam")

    print "metaseq starting...",
    sys.stdout.flush()
    t0 = time.time()

    # Tweak processes and chunksize as needed to balance CPUs and I/O.
    PROCESSES = 6
    CHUNKSIZE = 100

    # the trick is to use a single bin...
    ms_array = m.array(windows, processes=PROCESSES, chunksize=CHUNKSIZE, bins=1)

    t1 = time.time()
    print "completed in %.2fs" % (t1 - t0)
    sys.stdout.flush()
    return ms_array.ravel()
Exemplo n.º 17
0
def getSignals(bigWigFiles, bigWigList, currRegions, currSignal):
    """Read the 25-bp binned signal around each region (+/- 500 bp).

    The start of each input feature is raised to 500 if it is smaller (the
    feature itself is modified), then a padded copy is sampled from
    ``bigWigList[currSignal]``.  After every 1000 regions the signal object
    is re-created from ``bigWigFiles[currSignal]``.

    :return: list holding one signal row per region, in input order
    """
    binSize = 25
    padding = 500
    signalList = []
    for count, currFeature in enumerate(currRegions, 1):
        if currFeature.start < padding:
            currFeature.start = padding
        padded = " ".join([currFeature.chrom,
                           str(currFeature.start - padding),
                           str(currFeature.end + padding)])
        newFeature = pbt.BedTool(padded, from_string=True)[0]
        numBP = newFeature.end - newFeature.start
        # `//` keeps the bin count integral under true division as well.
        numBins = numBP // binSize
        if numBP % binSize != 0:
            numBins += 1
            # NOTE(review): aligns the end coordinate (not the length) to a
            # multiple of binSize - verify this is what is wanted.
            newFeature.end += binSize - (newFeature.end % binSize)
        signal = bigWigList[currSignal].array([newFeature], bins=numBins)
        signalList.append(signal[0])
        if count % 1000 == 0:
            del bigWigList[currSignal]
            bigWigList[currSignal] = metaseq.genomic_signal(bigWigFiles[currSignal], "bigWig")

    return signalList
Exemplo n.º 18
0
def run_metaseq():
    # set up a BamSignal object
    m = metaseq.genomic_signal(bam_fn, kind='bam')

    print 'metaseq starting...',
    sys.stdout.flush()
    t0 = time.time()

    # Tweak processes and chunksize as needed to balance CPUs and I/O.
    PROCESSES = 6
    CHUNKSIZE = 100

    # the trick is to use a single bin...
    ms_array = m.array(windows,
                       processes=PROCESSES,
                       chunksize=CHUNKSIZE,
                       bins=1)

    t1 = time.time()
    print 'completed in %.2fs' % (t1 - t0)
    sys.stdout.flush()
    return ms_array.ravel()
Exemplo n.º 19
0
from pybedtools.featurefuncs import TSS

from gffutils.helpers import asinterval


def tss_generator():
    """Yield a TSS interval for every 'mRNA' feature of the global ``db``."""
    # Other feature types (CDS, gene, ...) could be selected here instead.
    transcripts = db.features_of_type('mRNA')
    for feature in transcripts:
        interval = asinterval(feature)
        yield TSS(interval, upstream=1, downstream=0)


# Materialise the generated TSS features on disk as 'tsses.gtf'.
tsses = pybedtools.BedTool(tss_generator()).saveas('tsses.gtf')
# Extend every TSS by 1000 bp on both sides (hg19 chromosome sizes) and
# write the result to 'tsses-1kb.gtf'.
tsses_1kb = tsses.slop(b=1000, genome='hg19', output='tsses-1kb.gtf')

import metaseq
# Signal object for the BAM file whose coverage is measured.
ip_signal = metaseq.genomic_signal('WT_H2A_Z.sort.bam', 'bam')

import multiprocessing
# Use as many worker processes as this machine reports CPUs.
processes = multiprocessing.cpu_count()

# One row per window in `tsses_1kb`, 100 bins per row.
ip_array = ip_signal.array(

    # Look at signal over these windows
    tsses_1kb,

    # Bin signal into this many bins per window
    bins=100,

    # Use multiple CPUs. Dramatically speeds up run time.
    processes=processes)
Exemplo n.º 20
0
import multiprocessing
from matplotlib import pyplot as plt
import matplotlib
import numpy as np
import metaseq
import pybedtools

# Use example data and generate some random features
gs = metaseq.genomic_signal(metaseq.example_filename('x.bam'), 'bam')

# Tile chr2L:0-500000 into 1000-bp windows, then shuffle them with a fixed
# seed so the "random" features are reproducible.
features = pybedtools.BedTool()\
    .window_maker(
        b=pybedtools.BedTool('chr2L 0 500000',
                             from_string=True).fn,
        w=1000)\
    .shuffle(seed=1, genome={'chr2L': (0, 5000000)})
# Give every feature a synthetic name, 'gene_<index>', in feature order.
genes = []
for i, f in enumerate(features):
    genes.append('gene_%s' % i)
genes = np.array(genes)
# Signal matrix: one row per feature, 100 bins per row.
arr = gs.array(features, processes=multiprocessing.cpu_count(), bins=100)

# At this point, each item in `genes` corresponds to the same row in `arr`

# Cluster the rows with k=5; `ind` is the resulting row ordering.
ind, breaks = metaseq.plotutils.clustered_sortind(arr, k=5)

# Boundaries of clusters are provided in `breaks`.
# So the first cluster's original indices into `arr` are:
cluster_1_inds = ind[0:breaks[0]]

# Which means the genes in the first cluster are:
Exemplo n.º 21
0
"""
module for testing the larger files (x.bam, x.bed.gz, etc)
"""
import multiprocessing
import metaseq
import pybedtools

CPUS = multiprocessing.cpu_count()

# One genomic signal per supported kind, all backed by the example 'x.*'
# files; the BED example is the only one with a compound extension.
gs = {}
for kind in ['bam', 'bigwig', 'bed', 'bigbed']:
    ext = 'bed.gz' if kind == 'bed' else kind
    gs[kind] = metaseq.genomic_signal(
        metaseq.example_filename('x.%s' % ext), kind)

# generate the test features: 1000-bp windows over chr2L:0-500000,
# shuffled with a fixed seed
features = (
    pybedtools.BedTool()
    .window_maker(
        b=pybedtools.BedTool('chr2L 0 500000', from_string=True).fn,
        w=1000)
    .shuffle(seed=1, genome={'chr2L': (0, 5000000)})
)

# Shared call arguments so every kind is sampled identically.
args = (features, )
kwargs = {'processes': CPUS, 'bins': 100}
bam_array = gs['bam'].array(*args, **kwargs)
bed_array = gs['bed'].array(*args, **kwargs)
bw_array = gs['bigwig'].array(*args, method='get_as_array', **kwargs)
Exemplo n.º 22
0
    'tracks/ChiP-seq_tracks/TLX3_H3K36me3_FE.bw'
]

# One wide figure holding a single row of image axes, one per track.
fig = plt.figure(1, (26., 12.))
grid = ImageGrid(
    fig,
    111,
    nrows_ncols=(1, len(tracks)),  # nrows_ncols=(1, 5),
    axes_pad=0.1,
    add_all=True,
    label_mode="R",
    aspect=False)

for i, tr in enumerate(tracks):
    # Title: first two '_'-separated tokens of the track's basename.
    tit = '_'.join(tr.split('/')[-1].split('_')[:2])
    tr_sig = metaseq.genomic_signal(tr, 'bigwig')
    # `sm_Nkb`, `bn` and `processes` are defined outside this fragment.
    arr = tr_sig.array(sm_Nkb, bins=bn, processes=processes)
    if i == 0:
        # The first track fixes the colour range (vmin/vmax) and the row
        # order (`i_a`, ascending by each row's maximum) reused below.
        # NOTE(review): `k` is only referenced by the commented-out
        # rescaling line in the else branch.
        k = abs(np.max(arr) - np.min(arr))
        vmin = np.min(arr)
        vmax = np.max(arr)
        print tit, vmin, vmax
        i_a = arr.max(axis=1).argsort()
        #i_a = arr.mean(axis=1).argsort()
        arr = arr[i_a, :]
    else:
        # Later tracks reuse the first track's row order; values are made
        # non-negative with abs().
        arr = abs(arr[i_a, :])
        #arr = k*arr[i_a,:]/(abs(np.max(arr)-np.min(arr)))
        vmin_e = np.min(arr)
        vmax_e = np.max(arr)
        print tit, vmin_e, vmax_e
Exemplo n.º 23
0
Arquivo: files.py Projeto: xjyx/nature
def signalfy(d, kind):
    """Return a new dict mapping each key of ``d`` to a metaseq signal.

    Every value of ``d`` is passed to ``metaseq.genomic_signal`` together
    with ``kind``; ``d`` itself is left untouched.
    """
    return {key: metaseq.genomic_signal(path, kind)
            for key, path in d.items()}
Exemplo n.º 24
0
#
#    pip install numpy
#    pip install cython
#
# 4. Install metaseq using pip.  If you don't have a lot of the scientific 
#    Python packages or genomic Python packages, this may take a while.
#  
#    pip install .
 
 
import metaseq
import pybedtools
import numpy as np
from matplotlib import pyplot as plt
 
# Signal source: a sorted BAM file.
bam = metaseq.genomic_signal('Mcf7Max.sorted.bam', 'bam')
# CpG intervals used to split the sites into two groups below.
cpg = pybedtools.BedTool('cpg.bed')
# NOTE(review): despite the variable name, the file name suggests these are
# HIF sites rather than TSSs - confirm.
tss = pybedtools.BedTool('HIF_sites_invovled_in_looping_not_at_promoter.bed')

# extend by 5 kb up/downstream
tss = tss.slop(b=5000, g=pybedtools.chromsizes('hg19'))

# u=True keeps windows overlapping at least one CpG interval;
# v=True keeps the windows overlapping none.
tss_with_cpg = tss.intersect(cpg, u=True)
tss_without_cpg = tss.intersect(cpg, v=True)

# change this to as many CPUs as you have in order to run in parallel
processes = 1

# each read will be extended 3' to a total size of this many bp
fragment_size = 200
 
Exemplo n.º 25
0
import numpy as np
import os
import metaseq

# Paths of the example IP and input (control) BAM files shipped with metaseq.
ip_filename = metaseq.helpers.example_filename("wgEncodeHaibTfbsK562Atf3V0416101AlnRep1_chr17.bam")
input_filename = metaseq.helpers.example_filename("wgEncodeHaibTfbsK562RxlchV0416101AlnRep1_chr17.bam")

# Wrap both BAM files as genomic signal objects.
ip_signal = metaseq.genomic_signal(ip_filename, "bam")
input_signal = metaseq.genomic_signal(input_filename, "bam")

# If you already have TSSs, skip this part.
import gffutils

# Annotation database used by tss_generator() to enumerate transcripts.
db = gffutils.FeatureDB(metaseq.example_filename("Homo_sapiens.GRCh37.66_chr17.gtf.db"))

import pybedtools
from pybedtools.featurefuncs import TSS
from gffutils.helpers import asinterval


def tss_generator():
    """Yield a TSS +/- 1000 bp window for each 'transcript' in ``db``."""
    flank = dict(upstream=1000, downstream=1000)
    for feature in db.features_of_type("transcript"):
        interval = asinterval(feature)
        yield TSS(interval, **flank)


# Build 'tsses.gtf' only when it is not already on disk, then always load
# the windows from that file so later steps see a file-backed BedTool.
if not os.path.exists("tsses.gtf"):
    tsses = pybedtools.BedTool(tss_generator()).saveas("tsses.gtf")
tsses = pybedtools.BedTool("tsses.gtf")

from metaseq import persistence
Exemplo n.º 26
0
 def setup(self):
     """Open the example ``gdc.bigbed`` file as a bigBed signal."""
     bigbed_path = metaseq.example_filename('gdc.bigbed')
     self.m = metaseq.genomic_signal(bigbed_path, kind='bigbed')
Exemplo n.º 27
0
if __name__ == "__main__":
    # Command-line entry point: every argument names an example to run.
    import sys
    choices = ['xcorr', 'chipseq']

    # Slicing never raises IndexError, so "no arguments" has to be detected
    # by checking for an empty list; otherwise the hint is never shown.
    examples = sys.argv[1:]
    if not examples:
        print('Choices are:  %s' % (choices,))

    # Reject unknown example names before doing any work.
    for ex in examples:
        if ex not in choices:
            raise ValueError('%s not in %s' % (ex, choices))

    if 'xcorr' in examples:
        ip = metaseq.genomic_signal(
            metaseq.example_filename('wgEncodeUwTfbsK562CtcfStdAlnRep1.bam'),
            'bam')

        NWINDOWS = 5000
        FRAGMENT_SIZE = 1
        WINDOWSIZE = 5000
        # Threshold passed to estimate_shift: ten times the ratio of
        # fragment size to window size.
        THRESH = FRAGMENT_SIZE / float(WINDOWSIZE) * 10
        lags, shift = estimate_shift(ip,
                                     nwindows=NWINDOWS,
                                     maxlag=500,
                                     thresh=THRESH,
                                     array_kwargs=dict(
                                         processes=8,
                                         chunksize=100,
                                         fragment_size=FRAGMENT_SIZE),
                                     verbose=True)
def main(data_dir, peak_file, results, sortBy, list_name):
    """Draw paired coverage heatmaps and mean profiles for two bigWig tracks.

    The two track names are the first two ``_``-separated tokens of the peak
    file's basename, each with ``.bw`` appended and looked up in
    ``data_dir``.  Signal over every peak is binned into 800 bins, saved and
    reloaded through ``metaseq.persistence`` under the prefix
    ``results + "_".join(list_name)``, and the rows of both heatmaps are
    ordered by the summed signal in bins 300-499 of the selected track.
    The figure is saved next to ``results``.

    Relies on the module-level ``processes`` for the number of workers.

    :param data_dir: directory containing the ``.bw`` files
    :param peak_file: whitespace-separated file with the four columns
        chr, start, stop, file
    :param results: prefix for the cached arrays and the output PNG
    :param sortBy: 1 to order rows by the first track, 2 by the second
    :param list_name: two display names, used in titles and filenames
    :raises ValueError: if ``sortBy`` is neither 1 nor 2
    """
    # Fail fast: previously an unsupported value only surfaced as a
    # NameError after all arrays had been computed and saved.
    if sortBy not in (1, 2):
        raise ValueError("sortBy must be 1 or 2, got %r" % (sortBy,))

    regions = np.loadtxt(
        peak_file,
        dtype={'names': ('chr', 'start', 'stop', 'file'),
               'formats': ('S15', 'int', 'int', 'S15')})['file']
    peaks = BedTool(peak_file)

    print("generating")
    name_parts = peak_file.split("/")[-1].split("_")
    ip_name1 = name_parts[0] + ".bw"
    ip_name2 = name_parts[1].split("_processed")[0] + ".bw"
    print(ip_name1,ip_name2)

    ip_signal_1 = metaseq.genomic_signal(os.path.join(data_dir, ip_name1), 'bigwig')
    ip_signal_2 = metaseq.genomic_signal(os.path.join(data_dir, ip_name2), 'bigwig')
    ip_array_1 = ip_signal_1.array(peaks, bins=800, processes=processes, method="get_as_array")
    ip_array_2 = ip_signal_2.array(peaks, bins=800, processes=processes, method="get_as_array")

    # Round-trip through the on-disk cache so the arrays can be reused.
    prefix = results + "_".join(list_name)
    metaseq.persistence.save_features_and_arrays(
        features=peaks,
        arrays={'ip1': ip_array_1, 'ip2': ip_array_2},
        prefix=prefix,
        link_features=True,
        overwrite=True)
    features, arrays = metaseq.persistence.load_features_and_arrays(prefix=prefix)

    # Row score: summed signal over bins 300-499 of the 800 bins.
    ip1_mean = np.apply_along_axis(np.sum, 1, arrays['ip1'][::, 300:500])
    ip2_mean = np.apply_along_axis(np.sum, 1, arrays['ip2'][::, 300:500])

    # A third mode (ordering by a ratio of the two scores) was sketched in
    # earlier revisions but is not implemented.
    if sortBy == 1:
        ip1_mean_order = np.argsort(ip1_mean)
        sorted_on = list_name[0]
    else:
        ip1_mean_order = np.argsort(ip2_mean)
        sorted_on = list_name[1]
    path_fig = (results + "heatmaps_" + list_name[0] + "_" + list_name[1] +
                "_sorted_on_" + sorted_on + ".png")

    ip1 = arrays['ip1'][ip1_mean_order, :]
    ip2 = arrays['ip2'][ip1_mean_order, :]

    x = np.linspace(-400, 400, 800)
    # Peak labels in plotting order; computed but not drawn at present.
    regions_ordered = regions[ip1_mean_order]

    # Shared colour scale: twice the larger mean as the upper bound.
    Vmax = max([ip1.mean(), ip2.mean()]) * 2
    Vmin = min([ip1.min(), ip2.min()])
    gs = gridspec.GridSpec(2, 2, width_ratios=[8, 8], height_ratios=([8, 2]))
    gs.update(wspace=0.02, hspace=0.02)

    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['font.size'] = 10
    fig = plt.figure(figsize=(9, 5))

    # Top row: the two heatmaps.
    ax1 = fig.add_subplot(gs[0, 0])
    ax1.pcolormesh(ip1, vmin=Vmin, vmax=Vmax, cmap=cm.Reds)
    ax1.set_title(list_name[0] + ' coverage')
    ax1.xaxis.set_ticklabels([])
    ax1.xaxis.set_ticks_position('none')

    ax2 = fig.add_subplot(gs[0, 1])
    ax2.pcolormesh(ip2, vmin=Vmin, vmax=Vmax, cmap=cm.Reds)
    ax2.set_title(list_name[1] + ' coverage')
    ax2.yaxis.tick_right()
    ax2.xaxis.set_ticklabels([])
    ax2.xaxis.set_ticks_position('none')

    # Bottom row: column means of each heatmap.
    ax3 = fig.add_subplot(gs[1, 0])
    ax3.plot(x, ip1.mean(axis=0))

    ax4 = fig.add_subplot(gs[1, 1])
    ax4.plot(x, ip2.mean(axis=0))
    ax4.yaxis.tick_right()

    plt.savefig(path_fig)
Exemplo n.º 29
0
import multiprocessing
from matplotlib import pyplot as plt
import matplotlib
import numpy as np
import metaseq
import pybedtools

# Use example data and generate some random features
gs = metaseq.genomic_signal(metaseq.example_filename('x.bam'), 'bam')

# 1000-bp windows across chr2L:0-500000, shuffled with seed=1 so the
# feature set is the same on every run.
features = pybedtools.BedTool()\
    .window_maker(
        b=pybedtools.BedTool('chr2L 0 500000',
                             from_string=True).fn,
        w=1000)\
    .shuffle(seed=1, genome={'chr2L': (0, 5000000)})
# Synthetic labels: the n-th feature is called 'gene_<n>'.
genes = []
for i, f in enumerate(features):
    genes.append('gene_%s' % i)
genes = np.array(genes)
# 100 bins per feature, computed on all available CPUs.
arr = gs.array(features, processes=multiprocessing.cpu_count(), bins=100)

# At this point, each item in `genes` corresponds to the same row in `arr`

# Cluster rows with k=5 and get a row ordering plus cluster boundaries.
ind, breaks = metaseq.plotutils.clustered_sortind(arr, k=5)

# Boundaries of clusters are provided in `breaks`.
# So the first cluster's original indices into `arr` are:
cluster_1_inds = ind[0:breaks[0]]

# Which means the genes in the first cluster are:
Exemplo n.º 30
0
# RAG
# Full path of each RAG track: `rel_path` plus one entry of the per-mark
# list in rag_lst["tracks"].
# NOTE(review): TLX3, H3K4me1 and H3K27ac take element [1] while the other
# marks take [0]; confirm this selection is intentional.
rag_TLX3 = rel_path + rag_lst["tracks"]["TLX3"][1]
rag_H3K4me1 = rel_path + rag_lst["tracks"]["H3K4me1"][1]
rag_H3K4me2 = rel_path + rag_lst["tracks"]["H3K4me2"][0]
rag_H3K4me3 = rel_path + rag_lst["tracks"]["H3K4me3"][0]
rag_H3K9ac = rel_path + rag_lst["tracks"]["H3K9ac"][0]
rag_H3K9me3 = rel_path + rag_lst["tracks"]["H3K9me3"][0]
rag_H3K27ac = rel_path + rag_lst["tracks"]["H3K27ac"][1]
rag_H3K36me3 = rel_path + rag_lst["tracks"]["H3K36me3"][0]
rag_H3K27me3 = rel_path + rag_lst["tracks"]["H3K27me3"][0]
rag_POLII = rel_path + rag_lst["tracks"]["POLII"][0]

# In[8]:

# bigWig signal objects for the tlx_* tracks; the tlx_* path variables are
# defined before this fragment.
tlx_TLX3_sig = metaseq.genomic_signal(tlx_TLX3, 'bigwig')
tlx_H3K4me1_sig = metaseq.genomic_signal(tlx_H3K4me1, 'bigwig')
tlx_H3K4me2_sig = metaseq.genomic_signal(tlx_H3K4me2, 'bigwig')
tlx_H3K4me3_sig = metaseq.genomic_signal(tlx_H3K4me3, 'bigwig')
tlx_H3K9ac_sig = metaseq.genomic_signal(tlx_H3K9ac, 'bigwig')
tlx_H3K9me3_sig = metaseq.genomic_signal(tlx_H3K9me3, 'bigwig')
tlx_H3K27ac_sig = metaseq.genomic_signal(tlx_H3K27ac, 'bigwig')
tlx_H3K36me3_sig = metaseq.genomic_signal(tlx_H3K36me3, 'bigwig')
tlx_H3K27me3_sig = metaseq.genomic_signal(tlx_H3K27me3, 'bigwig')
tlx_POLII_sig = metaseq.genomic_signal(tlx_POLII, 'bigwig')

# In[9]:

# bigWig signal objects for the RAG tracks assigned above.
rag_TLX3_sig = metaseq.genomic_signal(rag_TLX3, 'bigwig')
rag_H3K4me1_sig = metaseq.genomic_signal(rag_H3K4me1, 'bigwig')
rag_H3K4me2_sig = metaseq.genomic_signal(rag_H3K4me2, 'bigwig')
Exemplo n.º 31
0
import numpy as np
import os
import metaseq

# Example IP and input (control) BAM files bundled with metaseq.
ip_filename = metaseq.helpers.example_filename(
    'wgEncodeHaibTfbsK562Atf3V0416101AlnRep1_chr17.bam')
input_filename = metaseq.helpers.example_filename(
    'wgEncodeHaibTfbsK562RxlchV0416101AlnRep1_chr17.bam')

# Genomic signal objects for both BAM files.
ip_signal = metaseq.genomic_signal(ip_filename, 'bam')
input_signal = metaseq.genomic_signal(input_filename, 'bam')

# If you already have TSSs, skip this part.
import gffutils
# Annotation database that tss_generator() reads transcripts from.
db = gffutils.FeatureDB(
    metaseq.example_filename('Homo_sapiens.GRCh37.66_chr17.gtf.db'))

import pybedtools
from pybedtools.featurefuncs import TSS
from gffutils.helpers import asinterval


def tss_generator():
    """Generate TSS windows (1000 bp each side) for all transcripts in ``db``."""
    transcripts = db.features_of_type('transcript')
    for record in transcripts:
        as_interval = asinterval(record)
        yield TSS(as_interval, upstream=1000, downstream=1000)

# Generate 'tsses.gtf' once; later runs reuse the file already on disk.
if not os.path.exists('tsses.gtf'):
    tsses = pybedtools.BedTool(tss_generator()).saveas('tsses.gtf')
# Always work from the saved file, whether it was just written or not.
tsses = pybedtools.BedTool('tsses.gtf')

from metaseq import persistence
Exemplo n.º 32
0
def filterPossibleMatches(bigWigFiles, bigWigList, metaprofile, opPrefix,
                          currSignal, minWidth, maxWidth):
    """Filter candidate regions by paired maxima and per-width p-values.

    Reads ``<opPrefix>_<currSignal>_MFpositives.bed``, smooths each region's
    signal, pairs up its maxima and, for each pairing whose width lies in
    ``[minWidth, maxWidth]``, scans ``<opPrefix>_pValue_<width>.bed`` for a
    covering interval whose value in the ``currSignal`` column is at most
    0.001.  Accepted intervals are sorted with ``sortBed``; intervals nested
    inside another one on the same chromosome are collapsed, and the result
    is written to ``<opPrefix>_<currSignal>_finalPositives.bed``.  The two
    intermediate ``tentPositives`` files are removed.

    :param bigWigFiles: mapping of signal name -> bigWig filename
    :param bigWigList: mapping of signal name -> signal object; the entry
        for ``currSignal`` is re-created after the signals are read
    :param metaprofile: sequence whose length enters the start/end margin
    :param opPrefix: prefix of all input and output files
    :param currSignal: name of the signal (and p-value column) to use
    :param minWidth: smallest accepted width
    :param maxWidth: largest accepted width
    :return: ``None``
    """
    currRegions = pbt.BedTool(opPrefix + "_" + currSignal + "_MFpositives.bed")

    signalList = getSignals(bigWigFiles, bigWigList, currRegions, currSignal)
    del bigWigList[currSignal]
    bigWigList[currSignal] = metaseq.genomic_signal(bigWigFiles[currSignal],
                                                    "bigWig")
    print(len(signalList))
    binSize = 25

    op = open(opPrefix + "_" + currSignal + "_tentPositives.bed", "w")

    # One open p-value file per width, stepping by binSize.  Each file is
    # consumed progressively across regions, never rewound.
    newWidth = minWidth
    ipFiles = OrderedDict()
    while newWidth <= maxWidth:
        ipFiles[newWidth] = open(
            opPrefix + "_pValue_" + str(newWidth) + ".bed", "r")
        newWidth += binSize

    for idx in range(0, len(signalList)):
        if idx % 1000 == 0:
            print(idx)
        region = currRegions[idx]
        smoothedSignal = smoothSignals(signalList[idx], win=10)
        maximaIndices = getMaxima(smoothedSignal)
        if len(maximaIndices) < 2:
            # A pairing needs at least two maxima.
            continue
        pairings = findPossiblePairings(maximaIndices, binSize, minWidth,
                                        maxWidth)
        for currPairing in pairings:
            newWidth = binSize * (currPairing[1] - currPairing[0])
            if newWidth < minWidth or newWidth > maxWidth:
                continue
            # The -500 undoes the padding getSignals added around the region.
            currStart = np.floor((region.start - 500) +
                                 (currPairing[0] * binSize) - 5.0 /
                                 (len(metaprofile) - 10) * newWidth) + 25
            currEnd = np.floor(region.start - 500 + currPairing[1] * binSize +
                               5.0 / (len(metaprofile) - 10) * newWidth)
            currChr = region.chrom

            for line in ipFiles[newWidth]:
                if "#" in line:
                    # Header line: locate this signal's column.
                    headerFields = line.strip().split("\t")
                    currSignalIdx = headerFields.index(currSignal)
                    continue
                fields = line.strip().split("\t")
                start = int(fields[1])
                chrom = fields[0]
                end = int(fields[2])
                if start > currStart + binSize:
                    break
                if start <= currStart and end >= currEnd:
                    #accepting this enhancer if it passes cutoff
                    if float(fields[currSignalIdx]) <= 0.001:
                        op.write(chrom + "\t" + str(start) + "\t" + str(end) +
                                 "\t" + str(fields[currSignalIdx]) + "\t" +
                                 str(newWidth) + "\n")
                        break
    newWidth = minWidth
    while newWidth <= maxWidth:
        ipFiles[newWidth].close()
        newWidth += binSize
    op.close()

    # NOTE(review): the shell commands are built by string concatenation;
    # opPrefix/currSignal must not contain shell metacharacters.
    os.system("sortBed -i " + opPrefix + "_" + currSignal +
              "_tentPositives.bed > " + opPrefix + "_" + currSignal +
              "_tentPositives2.bed")
    ip = open(opPrefix + "_" + currSignal + "_tentPositives2.bed", "r")
    op = open(opPrefix + "_" + currSignal + "_finalPositives.bed", "w")

    # Collapse nested intervals: keep the pending interval until a record
    # arrives that neither contains it nor is contained in it.
    currStart = currEnd = currPvalue = currWidth = 0
    currChrom = ""
    for line in ip:
        fields = line.strip().split("\t")
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        pValue = float(fields[3])
        width = int(fields[-1])
        if chrom == currChrom and start <= currStart and end >= currEnd:
            # New record contains the pending one: it replaces it.
            currStart = start
            currEnd = end
            currPvalue = pValue
            currWidth = width
            currChrom = chrom
        elif chrom == currChrom and currStart <= start and currEnd >= end:
            # New record is nested inside the pending one: drop it.
            continue
        else:
            if currChrom != "":
                op.write(currChrom + "\t" + str(currStart) + "\t" +
                         str(currEnd) + "\t" + str(currPvalue) + "\t" +
                         str(currWidth) + "\n")
            currStart = start
            currEnd = end
            currPvalue = pValue
            currWidth = width
            currChrom = chrom
    # Flush the last pending interval; without this the final record of the
    # sorted file was never written.
    if currChrom != "":
        op.write(currChrom + "\t" + str(currStart) + "\t" +
                 str(currEnd) + "\t" + str(currPvalue) + "\t" +
                 str(currWidth) + "\n")
    op.close()
    ip.close()
    os.system("rm " + opPrefix + "_" + currSignal + "_tentPositives2.bed " +
              opPrefix + "_" + currSignal + "_tentPositives.bed")
    return
Exemplo n.º 33
0
#~ Metaseq #
# ---------------------------------------------------------

#~ metaseq works with the concepts of signal and windows. In this example, the signal is ChIP data, and the windows are TSS +/- 1kb.

#~ The first step is to create "genomic signal" objects out of the data. Since our example files are BAM files, we specify kind='bam', but if you have your own data in a different format (bigWig, bigBed, BED, GFF, GTF, VCF) then specify that format instead (see metaseq.genomic_signal()).

import metaseq

ip_file = 'TLX3_H3K4me3_repl1.sorted.bam'
#ip_file = 'TLX3_H3K4me3_repl2.sorted.bam'

input_file = 'INP-TLX3_1.sorted.bam'
#input_file = 'INP-TLX3_2.sorted.bam'

ip_signal = metaseq.genomic_signal(os.path.join(data_dir, ip_file), 'bam')

input_signal = metaseq.genomic_signal(os.path.join(data_dir, input_file),
                                      'bam')

#~ Now we can create the arrays of signal over each window. Since this can be a time-consuming step, the first time this code is run it will cache the arrays on disk. The next time this code is run, it will be quickly loaded. Trigger a re-run by deleting the .npz file.

#~ Here, with the BamSignal.array method, we bin each promoter region into 100 bins, and calculate the signal in parallel across as many CPUs as are available. We do this for the IP signal and input signals separately. Then, since these are BAM files of mapped reads, we scale the arrays to the library size. The scaled arrays are then saved to disk, along with the windows that were used to create them.

import multiprocessing
processes = multiprocessing.cpu_count()

if not os.path.exists('example.npz'):

    # The signal is the IP ChIP-seq BAM file.
    ip_array = ip_signal.array(
Exemplo n.º 34
0
    if req not in allowed:
        raise ValueError("%s not in %s" % (req, allowed))

plt.rcParams['font.size'] = 10

intervals = pybedtools.BedTool().window_maker(
    genome={args.chrom: (args.start, args.stop)}, n=args.nfeatures)\
    .shuffle(genome={args.chrom: (args.start, args.stop)}, seed=1)

size = (args.stop - args.start) / args.nfeatures
if args.type == 'all':
    requested = ['bigwig', 'bam', 'bigbed', 'bed']
signals = []
for req in requested:
    if req != 'bed':
        signals.append(metaseq.genomic_signal(args.prefix + '.' + req, req))
    else:
        signals.append(metaseq.genomic_signal(args.prefix + '.bed.gz', 'bed'))

files = '* ' + '\n  * '.join([i.fn for i in signals])
plot_filename = (args.plot_prefix + '-%s_features-%s_bp_chunksize=%s.pdf'
                 % (args.nfeatures, size, args.chunksize))
processes = range(1, multiprocessing.cpu_count() + 1)
max_proc = processes[-1]
print """
{usage}
Parameters
----------
This script will generate {args.nfeatures} random features, each about {size}
bp, from genomic coordinates {args.chrom}:{args.start}-{args.stop}.
Exemplo n.º 35
0
if __name__ == "__main__":
    import sys
    choices = ['xcorr', 'chipseq']
    try:
        examples = sys.argv[1:]
    except IndexError:
        print 'Choices are: ', choices
        examples = []

    for ex in examples:
        if ex not in choices:
            raise ValueError('%s not in %s' % (ex, choices))

    if 'xcorr' in examples:
        ip = metaseq.genomic_signal(
            metaseq.example_filename(
                'wgEncodeUwTfbsK562CtcfStdAlnRep1.bam'), 'bam')

        NWINDOWS = 5000
        FRAGMENT_SIZE = 1
        WINDOWSIZE = 5000
        THRESH = FRAGMENT_SIZE / float(WINDOWSIZE) * 10
        lags, shift = estimate_shift(
            ip, nwindows=NWINDOWS, maxlag=500, thresh=THRESH,
            array_kwargs=dict(
                processes=8, chunksize=100,
                fragment_size=FRAGMENT_SIZE),
            verbose=True)
        plt.plot(lags, shift.mean(axis=0))
        plt.axvline(
            lags[np.argmax(shift.mean(axis=0))],
Exemplo n.º 36
0
def main(histoneFile, MPRApeakFile, nonrandomFile, opPrefix, pp, otherMarkFile=None, plotChar=True):
	"""
	The main function of the program.
	Calculates the pattern of troughs between STARR-seq peaks.
	Args:
		histoneFile: bigWig file of histone signal
		MPRApeakFile: file of peaks from a massively parallel reporter assay for regulatory regions
		nonrandomFile: file of regions which negatives should not intersect
		opPrefix: output prefix for all output files
		pp: object forwarded unchanged to plotCharacteristics, calculateMetaProfile and calculateDependentProfile
		otherMarkFile: optional file forwarded to getDoublePeakRegions; when it is not None a dependent profile is calculated as well
		plotChar: when true, plotCharacteristics is called on the shape characteristics
	Returns:
		None. Results are written to opPrefix + "_doublePeakStats.txt", "_doublePeak.bed",
		"_metaProfile.dat" and "_asymProfile.dat".
	Exits through sys.exit() after writing a message to stderr when an input
	cannot be opened or an output file cannot be created.
	"""

	#Checking input file
	# "except Exception" keeps the original catch-all behaviour for ordinary
	# errors without swallowing KeyboardInterrupt / SystemExit.
	try:
		MPRApeaks = pbt.BedTool(MPRApeakFile)
	except Exception:
		sys.stderr.write("ERROR: Cannot open MPRA peak file " + MPRApeakFile + "\n")
		sys.exit()
	try:
		histoneSignal = metaseq.genomic_signal(histoneFile, "bigWig")
	except Exception:
		sys.stderr.write("ERROR: Cannot open histone signal file " + histoneFile + "\n")
		sys.exit()
	try:
		nrFile = open(nonrandomFile, "r")
	except Exception:
		sys.stderr.write("ERROR: " + nonrandomFile + " does not open\n")
		sys.exit()

	#Checking output files
	try:
		op = open(opPrefix + "_doublePeakStats.txt", "w")
		op2 = open(opPrefix + "_doublePeak.bed", "w")
	except Exception:
		sys.stderr.write("ERROR: Cannot create output files\n")
		sys.exit()

	#Read nonrandom intervals
	# NOTE(review): `nonrandom` is built and sorted but not used again in this
	# function; kept in case extractFeature/Bed are relied on for validation.
	nonrandom = bed.Bed()
	try:
		for line in nrFile:
			nonrandom.features.append(extractFeature(line.rstrip()))
	finally:
		# The handle was previously left open for the rest of the run.
		nrFile.close()
	nonrandom.sortByChromosomeAndStartAndEnd()

	#Getting total number of 25 bp intervals that can be negative
	numNegativesTotal = calculateNumberNegatives(nonrandomFile, histoneFile, opPrefix)
	# Single-argument print call: same output on Python 2 and Python 3.
	print("Number of negatives are %s" % (numNegativesTotal,))

	# These lists are filled in place by getDoublePeakRegions.
	metaIntersectingIntervals = []
	metaMinima = []
	metaMaxima1 = []
	metaMaxima2 = []
	op.write("Maxima1\tMinima\tMaxima2\tdistanceMaxima1\tdistanceMaxima2\tratioMaxima\tratioMaximaMinima\tdistancePeakMinima\n")
	doublePeakRegions = []
	filteredPositives = getDoublePeakRegions(MPRApeaks, histoneSignal, metaMaxima1, metaMaxima2, metaMinima, op, op2, metaIntersectingIntervals, doublePeakRegions, otherMarkFile)
	op.close()
	op2.close()

	#Getting shape characteristics
	allShapeCharacteristics = []
	readShapeCharacteristics(allShapeCharacteristics, opPrefix + "_doublePeakStats.txt")
	if plotChar:
		plotCharacteristics(allShapeCharacteristics, pp)
	# Distances maximum1 -> minimum and minimum -> maximum2 for every region;
	# non-positive distances are reported on stdout.
	length1 = []
	length2 = []
	for idx in range(0, len(metaMaxima1)):
		length1.append(metaMinima[idx] - metaMaxima1[idx])
		length2.append(metaMaxima2[idx] - metaMinima[idx])
		if length1[-1] <= 0 or length2[-1] <= 0:
			print("%s %s %s" % (metaMaxima1[idx], metaMinima[idx], metaMaxima2[idx]))

	#Calculating metaprofile
	try:
		op = open(opPrefix + "_metaProfile.dat", "w")
		op2 = open(opPrefix + "_asymProfile.dat", "w")
	except Exception:
		sys.stderr.write("ERROR: Could not open " + opPrefix + "_metaProfile.dat\n")
		sys.exit()
	# Twice the longest flank; max() raises ValueError when no region was found.
	bins = 2 * max([max(length1), max(length2)])
	calculateMetaProfile(filteredPositives, metaIntersectingIntervals, bins, metaMaxima1, metaMaxima2, metaMinima, pp, op, op2)
	op.close()
	op2.close()

	if otherMarkFile is not None:
		calculateDependentProfile(otherMarkFile, filteredPositives, metaIntersectingIntervals, bins, metaMaxima1, metaMaxima2, metaMinima, pp, opPrefix)

	return
Exemplo n.º 37
0
"""
module for testing the larger files (x.bam, x.bed.gz, etc)
"""
import multiprocessing
import metaseq
import pybedtools

CPUS = multiprocessing.cpu_count()

# One genomic_signal per kind; the BED example file is the gzipped 'x.bed.gz',
# every other kind uses its own name as the extension.
gs = {}
for kind in ('bam', 'bigwig', 'bed', 'bigbed'):
    ext = 'bed.gz' if kind == 'bed' else kind
    gs[kind] = metaseq.genomic_signal(
        metaseq.example_filename('x.' + ext), kind)

# Test features: 1000-bp windows made from chr2L:0-500000, then shuffled
# with a fixed seed.
features = pybedtools.BedTool().window_maker(
    b=pybedtools.BedTool('chr2L 0 500000', from_string=True).fn,
    w=1000
).shuffle(seed=1, genome={'chr2L': (0, 5000000)})

# Arguments shared by every array() call below.
args = (features,)
kwargs = {'processes': CPUS, 'bins': 100}
bam_array = gs['bam'].array(*args, **kwargs)
bed_array = gs['bed'].array(*args, **kwargs)
bw_array = gs['bigwig'].array(*args, method='get_as_array', **kwargs)
Exemplo n.º 38
0
def make_vplot(bam_file, tss, prefix, genome, read_len, bins=400, bp_edge=2000,
               processes=8, greenleaf_norm=True):
    '''
    Plot normalized read coverage around TSSs and save two PNG figures.

    One figure is the aggregate (mean) line plot alone; the other is drawn
    by metaseq.plotutils.imshow with rows sorted by their mean signal.

    :param bam_file: BAM file loaded through metaseq.genomic_signal
    :param tss: TSS intervals, anything accepted by pybedtools.BedTool
    :param prefix: prefix of the two output file names
    :param genome: passed as ``g`` to BedTool.slop
    :param read_len: read length; reads are shifted by -read_len/2
    :param bins: number of bins per window
    :param bp_edge: number of bp added on each side of every TSS
    :param processes: number of processes passed to the array computation
    :param greenleaf_norm: if true, divide by the average signal of the edge
        bins; otherwise divide by millions of mapped reads
    :returns: (vplot_file, vplot_large_file, tss_point_val), where
        tss_point_val is the maximum of the mean profile
    '''
    vplot_file = '{0}_vplot.png'.format(prefix)
    vplot_large_file = '{0}_large_vplot.png'.format(prefix)

    # Load the TSS file
    tss = pybedtools.BedTool(tss)
    tss_ext = tss.slop(b=bp_edge, g=genome)

    # Load the bam file
    bam = metaseq.genomic_signal(bam_file, 'bam')
    # Shift to center the read on the cut site
    bam_array = bam.array(tss_ext, bins=bins, shift_width=-read_len/2,
                          processes=processes, stranded=True)

    # NOTE: a disabled alternative built a BED file of read ends (zcat | awk)
    # and computed the array from that instead of from the BAM.

    # Normalization (Greenleaf style): Find the avg height
    # at the end bins and take fold change over that
    if greenleaf_norm:
        # Use enough bins to cover 100 bp on either end. At least one bin is
        # required: with 0 bins the slice [-0:] would select the WHOLE
        # profile and the division below would be by zero.
        num_edge_bins = max(1, int(100/(2*bp_edge/bins)))
        bin_means = bam_array.mean(axis=0)
        avg_noise = (sum(bin_means[:num_edge_bins]) +
                     sum(bin_means[-num_edge_bins:]))/(2*num_edge_bins)
        bam_array /= avg_noise
    else:
        bam_array /= bam.mapped_read_count() / 1e6

    # Mean profile of the normalized array, computed once and reused.
    mean_profile = bam_array.mean(axis=0)

    # Generate a line plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.linspace(-bp_edge, bp_edge, bins)

    ax.plot(x, mean_profile, color='r', label='Mean')
    ax.axvline(0, linestyle=':', color='k')

    # Note the middle high point (TSS)
    tss_point_val = max(mean_profile)

    ax.set_xlabel('Distance from TSS (bp)')
    ax.set_ylabel('Average read coverage (per million mapped reads)')
    ax.legend(loc='best')

    fig.savefig(vplot_file)

    # Print a more complicated plot with lots of info

    # Find a safe upper percentile - we can't use X if the Xth percentile is 0.
    # np.percentile replaces matplotlib.mlab.prctile, which no longer exists
    # in current matplotlib releases.
    upper_prct = 99
    if np.percentile(bam_array.ravel(), upper_prct) == 0.0:
        upper_prct = 100.0

    plt.rcParams['font.size'] = 8
    fig = metaseq.plotutils.imshow(bam_array,
                                   x=x,
                                   figsize=(5, 10),
                                   vmin=5, vmax=upper_prct, percentile=True,
                                   line_kwargs=dict(color='k', label='All'),
                                   fill_kwargs=dict(color='k', alpha=0.3),
                                   sort_by=bam_array.mean(axis=1))

    # And save the file
    fig.savefig(vplot_large_file)

    return vplot_file, vplot_large_file, tss_point_val
Exemplo n.º 39
0
def filterPossibleMatches(bigWigFiles, bigWigList, metaprofile, opPrefix, currSignal, minWidth, maxWidth):
	"""
	Keep candidate regions whose p-value passes the cutoff and merge nested hits.

	Regions are read from opPrefix + "_" + currSignal + "_MFpositives.bed". For
	each region the smoothed signal is searched for pairs of maxima; for every
	pairing the matching interval is looked up in the p-value file of the
	corresponding width (opPrefix + "_pValue_<width>.bed") and written to a
	temporary BED file when its p-value is <= 0.001. The temporary file is then
	sorted with the external `sortBed` command and intervals contained in one
	another are collapsed into opPrefix + "_" + currSignal + "_finalPositives.bed".

	Args:
		bigWigFiles: mapping from signal name to bigWig file name
		bigWigList: mapping from signal name to signal object; the entry for currSignal is re-created here
		metaprofile: sequence whose length is used to scale the interval borders
		opPrefix: prefix of all input and output file names
		currSignal: name of the signal being processed; must be a column of the p-value file header
		minWidth: smallest candidate width, also the first p-value file opened
		maxWidth: largest candidate width
	Returns:
		None. The two temporary "_tentPositives" BED files are removed at the end.
	"""
	filePrefix = opPrefix + "_" + currSignal
	currRegions = pbt.BedTool(filePrefix + "_MFpositives.bed")

	signalList = getSignals(bigWigFiles, bigWigList, currRegions, currSignal)
	# NOTE(review): the signal object is re-created after getSignals,
	# presumably to reset it -- confirm against getSignals.
	del bigWigList[currSignal]
	bigWigList[currSignal] = metaseq.genomic_signal(bigWigFiles[currSignal], "bigWig")
	print(len(signalList))
	binSize = 25

	op = open(filePrefix + "_tentPositives.bed", "w")
	# One open p-value file per width: minWidth, minWidth + binSize, ... <= maxWidth
	ipFiles = OrderedDict()
	newWidth = minWidth
	while newWidth <= maxWidth:
		ipFiles[newWidth] = open(opPrefix + "_pValue_" + str(newWidth) + ".bed", "r")
		newWidth += binSize

	for idx in range(0, len(signalList)):
		if idx % 1000 == 0:
			print(idx)
		region = currRegions[idx]
		smoothedSignal = smoothSignals(signalList[idx], win=10)
		maximaIndices = getMaxima(smoothedSignal)
		if len(maximaIndices) < 2:
			continue
		pairings = findPossiblePairings(maximaIndices, binSize, minWidth, maxWidth)
		for currPairing in pairings:
			newWidth = binSize * (currPairing[1] - currPairing[0])
			if newWidth < minWidth or newWidth > maxWidth:
				continue
			currStart = np.floor((region.start - 500) + (currPairing[0] * binSize) - 5.0/(len(metaprofile) - 10) * newWidth) + 25
			currEnd = np.floor(region.start - 500 + currPairing[1] * binSize + 5.0/(len(metaprofile) - 10) * newWidth)
			currChr = region.chrom

			# The p-value files are read forward only: each file keeps its
			# position between pairings, so lines consumed here are not seen again.
			for line in ipFiles[newWidth]:
				if "#" in line:
					headerFields = line.strip().split("\t")
					currSignalIdx = headerFields.index(currSignal)
					continue
				fields = line.strip().split("\t")
				start = int(fields[1])
				chrom = fields[0]
				end = int(fields[2])
				if start > currStart + binSize:
					break
				if start <= currStart and end >= currEnd:
					#accepting this enhancer if it passes cutoff
					if float(fields[currSignalIdx]) <= 0.001:
						op.write(chrom + "\t" + str(start) + "\t" + str(end) + "\t" + str(fields[currSignalIdx]) + "\t" + str(newWidth) + "\n")
						break
	for handle in ipFiles.values():
		handle.close()
	op.close()
	os.system("sortBed -i " + opPrefix + "_" + currSignal + "_tentPositives.bed > " + opPrefix + "_" + currSignal + "_tentPositives2.bed")
	ip = open(filePrefix + "_tentPositives2.bed", "r")
	op = open(filePrefix + "_finalPositives.bed", "w")
	# Pending interval; it is written only once a non-nested interval follows it.
	currStart = currEnd = currPvalue = currWidth = 0
	currChrom = ""
	for line in ip:
		fields = line.strip().split("\t")
		chrom = fields[0]
		start = int(fields[1])
		end = int(fields[2])
		pValue = float(fields[3])
		width = int(fields[-1])
		if chrom == currChrom and start <= currStart and end >= currEnd:
			# New interval contains the pending one: it replaces it.
			currStart = start
			currEnd = end
			currPvalue = pValue
			currWidth = width
		elif chrom == currChrom and currStart <= start and currEnd >= end:
			# New interval lies inside the pending one: drop it.
			continue
		else:
			if currChrom != "":
				op.write(currChrom + "\t" + str(currStart) + "\t" + str(currEnd) + "\t" + str(currPvalue) + "\t" + str(currWidth) + "\n")
			currStart = start
			currEnd = end
			currPvalue = pValue
			currWidth = width
			currChrom = chrom
	# Flush the last pending interval. Without this the final record of the
	# sorted file never reached the output.
	if currChrom != "":
		op.write(currChrom + "\t" + str(currStart) + "\t" + str(currEnd) + "\t" + str(currPvalue) + "\t" + str(currWidth) + "\n")
	op.close()
	ip.close()
	os.system("rm " + opPrefix + "_" + currSignal + "_tentPositives2.bed " + opPrefix + "_" + currSignal + "_tentPositives.bed")
	return
import multiprocessing

from matplotlib import pyplot as plt

tss_annotation = str(sys.argv[1])
bam_file = str(sys.argv[2])
# NOTE(review): slop_region is read but not used below; the slop distance
# is hard-coded to 1000 bp -- confirm which one is intended.
slop_region = str(sys.argv[3])

# The bin count must be an integer: it is the number of samples for
# np.linspace and the number of bins for every array() call below.
bins = int(sys.argv[4])

#read in tss annotation file in bed format
tss_bed = pybedtools.BedTool(tss_annotation)

# extend by 1000 bp up/downstream
tss_slop = tss_bed.slop(b=1000, genome=annotationDir + 'chromLength')

bam_gsignal = metaseq.genomic_signal(bam_file, 'bam')

# the region +/-1000bp around each TSS will be split into a total of `bins`
# bins, change as needed

x = np.linspace(-1000, 1000, bins)

# most of the work happens here
test1_tss = test1_bam.array(tss_slop, bins=bins, processes=cpus)
test2_tss = test2_bam.array(tss_slop, bins=bins, processes=cpus)
bc1_tss = bc1_bam.array(tss_slop, bins=bins, processes=cpus)
bc2_tss = bc2_bam.array(tss_slop, bins=bins, processes=cpus)
mg_tss = mg_bam.array(tss_slop, bins=bins, processes=cpus)
tp_tss = tp_bam.array(tss_slop, bins=bins, processes=cpus)
Exemplo n.º 41
0
 def setup(self):
     # Fixture: store a bigBed genomic_signal for 'gdc.bigbed' on self.m.
     bigbed_path = metaseq.example_filename('gdc.bigbed')
     self.m = metaseq.genomic_signal(bigbed_path, kind='bigbed')
Exemplo n.º 42
0
from gffutils.helpers import asinterval


def tss_generator():
    """Yield TSS(..., upstream=1, downstream=0) for each 'mRNA' feature in db."""
    # Other feature types (CDS, gene, ...) can be selected here instead.
    mrna_features = db.features_of_type('mRNA')
    for feature in mrna_features:
        interval = asinterval(feature)
        yield TSS(interval, upstream=1, downstream=0)


# Save the generated TSS features, then extend each by 1 kb on both sides.
tsses = pybedtools.BedTool(tss_generator()).saveas('tsses.gtf')
# Open question from the author: which genome file should be used here.
tsses_1kb = tsses.slop(b=1000, genome='hg19', output='tsses-1kb.gtf')

import metaseq

# Input: BAM file of aligned ChIP-seq reads.
ip_signal = metaseq.genomic_signal('WT_H2A_Z.sort.bam', 'bam')

import multiprocessing

processes = multiprocessing.cpu_count()

# Signal over the 1-kb windows, binned into 100 bins per window.
# Multiple CPUs dramatically speed up the run time.
ip_array = ip_signal.array(tsses_1kb, bins=100, processes=processes)
Exemplo n.º 43
0
        bedtool = pybedtools.BedTool(self.bed)
        features = bedtool.intersect([feature], u=True)
        track = Track(features)
        ax.add_collection(track)
        # ax.axis('tight')
        return feature


if __name__ == "__main__":
    # Demo: build the three mini-browser variants on example files located
    # through metaseq.example_filename.
    import metaseq
    import gffutils
    import pybedtools

    # Gene-model database used by GeneModelMiniBrowser
    G = gffutils.FeatureDB(metaseq.example_filename("Homo_sapiens.GRCh37.66.cleaned.gtf.db"))

    # IP and input BAM signals, plus the narrowPeak file for the IP
    ip = metaseq.genomic_signal(metaseq.example_filename("wgEncodeUwTfbsK562CtcfStdAlnRep1.bam"), "bam")
    inp = metaseq.genomic_signal(metaseq.example_filename("wgEncodeUwTfbsK562InputStdAlnRep1.bam"), "bam")
    peaks = pybedtools.BedTool(metaseq.example_filename("wgEncodeUwTfbsK562CtcfStdPkRep1.narrowPeak.gz"))

    # One plotting-kwargs dict per signal, in the same order as [ip, inp]
    plotting_kwargs = [dict(color="r", label="IP"), dict(color="k", linestyle=":", label="input")]

    # Passed to every browser as local_coverage_kwargs
    local_coverage_kwargs = dict(fragment_size=200)

    # Signal-only browser
    b = SignalMiniBrowser([ip, inp], plotting_kwargs=plotting_kwargs, local_coverage_kwargs=local_coverage_kwargs)

    # Browser that additionally receives the gene-model database
    g = GeneModelMiniBrowser([ip, inp], G, plotting_kwargs=plotting_kwargs, local_coverage_kwargs=local_coverage_kwargs)

    # Browser that additionally receives the peaks
    p = PeakMiniBrowser([ip, inp], peaks, plotting_kwargs=plotting_kwargs, local_coverage_kwargs=local_coverage_kwargs)

    # Fourth peak of the file (index 3)
    feature = peaks[3]
Exemplo n.º 44
0
def get_bigwig(name):
    """Return the bigwig signal for *name*, creating and caching it on first use.

    Signals are cached in the module-level ``bigwigs`` mapping, keyed by
    *name*. An entry whose cached value is None is treated as missing and is
    created again.
    """
    cached = bigwigs.get(name)
    if cached is not None:
        return cached
    signal = metaseq.genomic_signal(name, 'bigwig')
    bigwigs[name] = signal
    return signal
Exemplo n.º 45
0
def make_tss_plot(bam_file, tss, prefix, chromsizes, read_len, bins=400, bp_edge=2000,
                  processes=8, greenleaf_norm=True):
    '''
    Plot normalized read coverage around TSSs; save two PNGs and the profile.

    One figure is the aggregate (mean) line plot alone; the other is drawn
    by metaseq.plotutils.imshow with rows sorted by their mean signal. The
    mean profile is also written as text to '<prefix>_tss-enrich.txt'.

    :param bam_file: BAM file loaded through metaseq.genomic_signal
    :param tss: TSS intervals, anything accepted by pybedtools.BedTool
    :param prefix: prefix of the output file names
    :param chromsizes: passed as ``g`` to BedTool.slop
    :param read_len: read length; reads are shifted by -read_len / 2
    :param bins: number of bins per window
    :param bp_edge: number of bp added on each side of every TSS
    :param processes: number of processes passed to the array computation
    :param greenleaf_norm: if true, divide by the average signal of the edge
        bins; otherwise divide by millions of mapped reads
    :returns: (tss_plot_file, tss_plot_large_file, tss_point_val), where
        tss_point_val is the maximum of the mean profile
    '''
    logging.info('Generating tss plot...')
    tss_plot_file = '{0}_tss-enrich.png'.format(prefix)
    tss_plot_data_file = '{0}_tss-enrich.txt'.format(prefix)
    tss_plot_large_file = '{0}_large_tss-enrich.png'.format(prefix)

    # Load the TSS file
    tss = pybedtools.BedTool(tss)
    tss_ext = tss.slop(b=bp_edge, g=chromsizes)

    # Load the bam file
    bam = metaseq.genomic_signal(bam_file, 'bam')
    # Shift to center the read on the cut site
    bam_array = bam.array(tss_ext, bins=bins, shift_width=-read_len / 2,
                          processes=processes, stranded=True)

    # NOTE: a disabled alternative built a BED file of read ends (zcat | awk)
    # and computed the array from that instead of from the BAM.

    # Normalization (Greenleaf style): Find the avg height
    # at the end bins and take fold change over that
    if greenleaf_norm:
        # Use enough bins to cover 100 bp on either end. At least one bin is
        # required: with 0 bins the slice [-0:] would select the WHOLE
        # profile and the division below would be by zero.
        num_edge_bins = max(1, int(100 / (2 * bp_edge / bins)))
        bin_means = bam_array.mean(axis=0)
        avg_noise = (sum(bin_means[:num_edge_bins]) +
                     sum(bin_means[-num_edge_bins:]))/(2*num_edge_bins)
        bam_array /= avg_noise
    else:
        bam_array /= bam.mapped_read_count() / 1e6

    # Mean profile of the normalized array, computed once and reused.
    mean_profile = bam_array.mean(axis=0)

    # Generate a line plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.linspace(-bp_edge, bp_edge, bins)

    ax.plot(x, mean_profile, color='r', label='Mean')
    ax.axvline(0, linestyle=':', color='k')

    # Note the middle high point (TSS)
    tss_point_val = max(mean_profile)

    ax.set_xlabel('Distance from TSS (bp)')
    ax.set_ylabel('Average read coverage (per million mapped reads)')
    ax.legend(loc='best')

    fig.savefig(tss_plot_file)

    # Print a more complicated plot with lots of info

    # write the plot data; numpy object
    np.savetxt(tss_plot_data_file, mean_profile, delimiter=",")

    # Find a safe upper percentile - we can't use X if the Xth percentile is 0.
    # np.percentile replaces matplotlib.mlab.prctile, which no longer exists
    # in current matplotlib releases.
    upper_prct = 99
    if np.percentile(bam_array.ravel(), upper_prct) == 0.0:
        upper_prct = 100.0

    plt.rcParams['font.size'] = 8
    fig = metaseq.plotutils.imshow(bam_array,
                                   x=x,
                                   figsize=(5, 10),
                                   vmin=5, vmax=upper_prct, percentile=True,
                                   line_kwargs=dict(color='k', label='All'),
                                   fill_kwargs=dict(color='k', alpha=0.3),
                                   sort_by=bam_array.mean(axis=1))

    # And save the file
    fig.savefig(tss_plot_large_file)

    return tss_plot_file, tss_plot_large_file, tss_point_val
Exemplo n.º 46
0
#~ tsses_2kb = tsses.slop(b=2000, genome='mm9', output=join(ref_dir,'mm9_tsses-2kb.gtf'))
#~ tsses_3kb = tsses.slop(b=3000, genome='mm9', output=join(ref_dir,'mm9_tsses-3kb.gtf'))
#~ tsses_5kb = tsses.slop(b=5000, genome='mm9', output=join(ref_dir,'mm9_tsses-5kb.gtf'))

tsses_1kb = pb.BedTool(join(ref_dir,'mm9_tsses-1kb.gtf'))


# ---------------------------------------------------------
#~ Metaseq #
# ---------------------------------------------------------

#~ metaseq works with the concepts of signal and windows. In this example, the signal is ChIP data, and the windows are TSS +/- 1kb.

#~ The first step is to create "genomic signal" objects out of the data. The tracks used here are bigWig files, so we specify kind 'bigwig'; for data in a different format (BAM, bigBed, BED, GFF, GTF, VCF) specify that format instead (see metaseq.genomic_signal()).

#~ tlx_bdg = rel_path+tlx_lst["tracks"][0]
#~ rag_bdg = rel_path+rag_lst["tracks"][0]

tlx_bw = rel_path + tlx_lst["tracks"][2]
rag_bw = rel_path + rag_lst["tracks"][2]

# Single-argument print call: identical output on Python 2 and Python 3
# (the former print statement is a SyntaxError on Python 3).
print("%s %s" % (tlx_bw, rag_bw))

tlx_signal = metaseq.genomic_signal(tlx_bw, 'bigwig')
rag_signal = metaseq.genomic_signal(rag_bw, 'bigwig')



Exemplo n.º 47
0
"""
Many of these tests use the minimal test/data/gdc.bed file which has just
enough complexity to be useful in testing corner cases.  When reading through
the tests, it's useful to have that file open to understand what's happening.
"""
import os
import metaseq
import multiprocessing
from metaseq.array_helpers import ArgumentError
import numpy as np
from nose.tools import assert_raises
from nose.plugins.skip import SkipTest
# One genomic_signal per supported kind, each backed by the example file
# 'gdc.<kind>'.
gs = {}
for kind in ('bed', 'bam', 'bigbed', 'bigwig'):
    gs[kind] = metaseq.genomic_signal(
        metaseq.example_filename('gdc.' + kind), kind)

# Number of processes for the tests: the METASEQ_PROCESSES environment
# variable when set, otherwise the CPU count.
PROCESSES = int(
    os.environ.get("METASEQ_PROCESSES", multiprocessing.cpu_count()))

def test_tointerval():
    # A "[-]" / "[+]" suffix sets the strand; without a suffix it is '.'.
    expected_strands = [
        ("chr2L:1-10[-]", '-'),
        ("chr2L:1-10[+]", '+'),
        ("chr2L:1-10", '.'),
    ]
    for coord, strand in expected_strands:
        assert metaseq.helpers.tointerval(coord).strand == strand


def test_local_count():

    def check(kind, coord, expected, stranded):
        try:
            result = gs[kind].local_count(coord, stranded=stranded)
        except NotImplementedError:
            raise SkipTest("Incompatible bx-python version for bigBed")
Exemplo n.º 48
0
from pybedtools.featurefuncs import TSS

from gffutils.helpers import asinterval


def tss_generator():
    """Yield TSS(..., upstream=1, downstream=0) for each 'mRNA' feature in db."""
    # Other feature types (CDS, gene, ...) can be selected here instead.
    for mrna in db.features_of_type('mRNA'):
        as_interval = asinterval(mrna)
        yield TSS(as_interval, upstream=1, downstream=0)


# Save the generated TSS features, then extend each by 1 kb on both sides.
tsses = pybedtools.BedTool(tss_generator()).saveas('tsses.gtf')
# Open question from the author: which genome file should be used here.
tsses_1kb = tsses.slop(b=1000, genome='hg19', output='tsses-1kb.gtf')

import metaseq
# Input: BAM file of the mutant's aligned ChIP-seq reads.
ip_signal = metaseq.genomic_signal('arp6_H2A_Z.sort.bam', 'bam')
# Input: BAM file of the wild type.
input_signal = metaseq.genomic_signal('WT_H2A_Z.sort.bam', 'bam')

import multiprocessing
processes = multiprocessing.cpu_count()

# Signal over the 1-kb windows, binned into 100 bins per window.
# Multiple CPUs dramatically speed up the run time.
ip_array = ip_signal.array(tsses_1kb, bins=100, processes=processes)
Exemplo n.º 49
0
def main(histoneFile,
         MPRApeakFile,
         nonrandomFile,
         opPrefix,
         pp,
         otherMarkFile=None,
         plotChar=True):
    """
    The main function of the program.
    Calculates the pattern of troughs between STARR-seq peaks.
    Args:
        histoneFile: bigWig file of histone signal
        MPRApeakFile: file of peaks from a massively parallel reporter
            assay for regulatory regions
        nonrandomFile: file of regions which negatives should not intersect
        opPrefix: output prefix for all output files
        pp: object forwarded unchanged to plotCharacteristics,
            calculateMetaProfile and calculateDependentProfile
        otherMarkFile: optional file forwarded to getDoublePeakRegions; when
            it is not None a dependent profile is calculated as well
        plotChar: when true, plotCharacteristics is called on the shape
            characteristics
    Returns:
        None. Results are written to opPrefix + "_doublePeakStats.txt",
        "_doublePeak.bed", "_metaProfile.dat" and "_asymProfile.dat".
    Exits through sys.exit() after writing a message to stderr when an input
    cannot be opened or an output file cannot be created.
    """

    #Checking input file
    # "except Exception" keeps the original catch-all behaviour for ordinary
    # errors without swallowing KeyboardInterrupt / SystemExit.
    try:
        MPRApeaks = pbt.BedTool(MPRApeakFile)
    except Exception:
        sys.stderr.write("ERROR: Cannot open MPRA peak file " + MPRApeakFile +
                         "\n")
        sys.exit()
    try:
        histoneSignal = metaseq.genomic_signal(histoneFile, "bigWig")
    except Exception:
        sys.stderr.write("ERROR: Cannot open histone signal file " +
                         histoneFile + "\n")
        sys.exit()
    try:
        nrFile = open(nonrandomFile, "r")
    except Exception:
        sys.stderr.write("ERROR: " + nonrandomFile + " does not open\n")
        sys.exit()

    #Checking output files
    try:
        op = open(opPrefix + "_doublePeakStats.txt", "w")
        op2 = open(opPrefix + "_doublePeak.bed", "w")
    except Exception:
        sys.stderr.write("ERROR: Cannot create output files\n")
        sys.exit()

    #Read nonrandom intervals
    # NOTE(review): `nonrandom` is built and sorted but not used again in this
    # function; kept in case extractFeature/Bed are relied on for validation.
    nonrandom = bed.Bed()
    try:
        for line in nrFile:
            nonrandom.features.append(extractFeature(line.rstrip()))
    finally:
        # The handle was previously left open for the rest of the run.
        nrFile.close()
    nonrandom.sortByChromosomeAndStartAndEnd()

    #Getting total number of 25 bp intervals that can be negative
    numNegativesTotal = calculateNumberNegatives(nonrandomFile, histoneFile,
                                                 opPrefix)
    # Single-argument print call: same output on Python 2 and Python 3.
    print("Number of negatives are %s" % (numNegativesTotal,))

    # These lists are filled in place by getDoublePeakRegions.
    metaIntersectingIntervals = []
    metaMinima = []
    metaMaxima1 = []
    metaMaxima2 = []
    op.write(
        "Maxima1\tMinima\tMaxima2\tdistanceMaxima1\tdistanceMaxima2\tratioMaxima\tratioMaximaMinima\tdistancePeakMinima\n"
    )
    doublePeakRegions = []
    filteredPositives = getDoublePeakRegions(MPRApeaks, histoneSignal,
                                             metaMaxima1, metaMaxima2,
                                             metaMinima, op, op2,
                                             metaIntersectingIntervals,
                                             doublePeakRegions, otherMarkFile)
    op.close()
    op2.close()

    #Getting shape characteristics
    allShapeCharacteristics = []
    readShapeCharacteristics(allShapeCharacteristics,
                             opPrefix + "_doublePeakStats.txt")
    if plotChar:
        plotCharacteristics(allShapeCharacteristics, pp)
    # Distances maximum1 -> minimum and minimum -> maximum2 for every region;
    # non-positive distances are reported on stdout.
    length1 = []
    length2 = []
    for idx in range(0, len(metaMaxima1)):
        length1.append(metaMinima[idx] - metaMaxima1[idx])
        length2.append(metaMaxima2[idx] - metaMinima[idx])
        if length1[-1] <= 0 or length2[-1] <= 0:
            print("%s %s %s" % (metaMaxima1[idx], metaMinima[idx],
                                metaMaxima2[idx]))

    #Calculating metaprofile
    try:
        op = open(opPrefix + "_metaProfile.dat", "w")
        op2 = open(opPrefix + "_asymProfile.dat", "w")
    except Exception:
        sys.stderr.write("ERROR: Could not open " + opPrefix +
                         "_metaProfile.dat\n")
        sys.exit()
    # Twice the longest flank; max() raises ValueError when no region was found.
    bins = 2 * max([max(length1), max(length2)])
    calculateMetaProfile(filteredPositives, metaIntersectingIntervals, bins,
                         metaMaxima1, metaMaxima2, metaMinima, pp, op, op2)
    op.close()
    op2.close()

    if otherMarkFile is not None:
        calculateDependentProfile(otherMarkFile, filteredPositives,
                                  metaIntersectingIntervals, bins, metaMaxima1,
                                  metaMaxima2, metaMinima, pp, opPrefix)

    return
Exemplo n.º 50
0
                if score != 0:
                    fout.write('\t'.join([
                        feature.chrom,
                        str(start),
                        str(stop),
                        str(score)]) + '\n')
                start = start + binsize
        this_batch = []
        i = 0
    fout.close()


if __name__ == "__main__":
    import metaseq
    ip_bam = metaseq.genomic_signal(
            metaseq.example_filename(
                'wgEncodeUwTfbsK562CtcfStdAlnRep1.bam'), 'bam')
    control_bam = metaseq.genomic_signal(
            metaseq.example_filename(
                'wgEncodeUwTfbsK562InputStdAlnRep1.bam'), 'bam')

    BINSIZE = 10
    WINDOWSIZE = 10000
    BINS = WINDOWSIZE / BINSIZE
    features = pybedtools.BedTool()\
            .window_maker(genome='hg19', w=WINDOWSIZE)\
            .filter(lambda x: x.chrom == 'chr19')

    result = compare(
            signal1=ip_bam,
            signal2=control_bam,