def getMergedBigWigPeakShift(infiles, outfile):
    '''Merge multiple BAM files per replicate to produce a single
    peak-shifted bigwig file.'''
    expt = P.snip(os.path.basename(outfile), ".merge.bw").replace("-agg", "")
    in_list = " --bamfile=".join(infiles)

    # Collect the MACS peak-shift estimate for each input track
    offsets = []
    for t in infiles:
        track = P.snip(os.path.basename(t), ".norm.bam")
        fn = "macs/with_input/%s.macs" % track
        if os.path.exists(fn):
            offsets.append(str(PIntervals.getPeakShiftFromMacs(fn)))
    shifts = " --shift=".join(offsets)

    statement = '''python %(scriptsdir)s/bam2wiggle.py
                          --output-format=bigwig
                          %(in_list)s
                          %(shifts)s
                   > %(outfile)s'''
    P.run()
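# Illustrative sketch only: the two joins above prefix every element except
# the first with its flag. For two hypothetical replicates (file names and
# shift values below are made up, not part of the pipeline) the statement
# expands roughly to:
#
#   python <scriptsdir>/bam2wiggle.py --output-format=bigwig
#       bam/liver-R1.norm.bam --bamfile=bam/liver-R2.norm.bam
#       120 --shift=115
#   > liver-agg.merge.bw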
def loadMergedIntervals(infile, outfile):
    '''Load combined intervals.

    Also, re-evaluate the intervals by counting reads within each
    interval. In contrast to the initial pipeline, the genome is not
    binned. In particular, the meaning of the columns in the table
    changes to:

       nProbes: number of reads in interval
       PeakCenter: position with maximum number of reads in interval
       AvgVal: average coverage within interval

    If *replicates* is true, only replicates will be considered for the
    counting. Otherwise the counts aggregate both replicates and
    conditions.
    '''
    # Write header to output file
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    headers = ("contig", "start", "end", "interval_id", "nPeaks",
               "PeakCenter", "Length", "AvgVal", "PeakVal", "nProbes",
               "Fold")
    tmpfile.write("\t".join(headers) + "\n")
    contig, start, end, interval_id, npeaks, peakcenter, length, avgval, peakval, nprobes = \
        "", 0, 0, 0, 0, 0, 0, 0, 0, 0

    # Get SAM file and MACS offset
    samfiles, offsets = [], []
    track = P.snip(os.path.basename(infile), ".merged.cleaned.bed")
    base_track = track.replace(".solo", "")
    fn = "bam/%s.norm.bam" % track
    assert os.path.exists(fn), \
        "could not find bamfile %s for track %s" % (fn, track)
    samfiles.append(pysam.Samfile(fn, "rb"))
    if track.find("solo") > -1:
        fn = "macs/no_input/%s.macs" % track
    else:
        fn = "macs/with_input/%s.macs" % track
    if os.path.exists(fn):
        offsets.append(PIntervals.getPeakShiftFromMacs(fn))

    # Loop over input bed file and calculate stats for merged intervals
    c = E.Counter()
    for line in open(infile, "r"):
        c.input += 1
        contig, start, end, int_id, fc = line[:-1].split()[:5]
        start, end = int(start), int(end)
        interval_id = c.input
        npeaks, peakcenter, length, avgval, peakval, nprobes = \
            PIntervals.countPeaks(contig, start, end, samfiles, offsets)

        # nreads can be 0 if the intervals overlap only slightly and, due
        # to the binning, no reads are actually in the overlap region.
        # However, most of these intervals should be small and have
        # already been deleted via the merge_min_interval_length cutoff.
        # Do not output intervals without reads.
        if nprobes == 0:
            c.skipped_reads += 1

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (contig, start, end, int_id, npeaks, peakcenter, length,
             avgval, peakval, nprobes, fc))) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name
    tablename = "%s_macs_merged_intervals" % track

    statement = '''python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                       --index=interval_id
                       --index=contig,start
                       --table=%(tablename)s
                   < %(tmpfilename)s > %(outfile)s '''
    P.run()
    os.unlink(tmpfile.name)
    L.info("%s\n" % str(c))
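# The docstring of loadMergedIntervals redefines nProbes, PeakCenter and
# AvgVal in terms of reads counted per interval rather than per genome bin.
# The helper below is a rough, illustrative sketch of how those values could
# be derived with pysam; it is an assumption for documentation purposes, not
# the actual PIntervals.countPeaks implementation used above.
def _countPeaksSketch(contig, start, end, samfile, shift=0):
    '''Illustrative only: approximate nProbes, PeakCenter and AvgVal
    for a single interval.'''
    length = end - start
    coverage = [0] * length
    nprobes = 0
    for read in samfile.fetch(contig, start, end):
        # nProbes: number of reads overlapping the interval
        nprobes += 1
        # shift each read towards its fragment midpoint before adding coverage
        if read.is_reverse:
            lo = read.reference_start - shift
        else:
            lo = read.reference_start + shift
        hi = lo + read.query_length
        for pos in range(max(lo, start), min(hi, end)):
            coverage[pos - start] += 1

    if nprobes == 0:
        return 0, 0, 0.0

    # PeakCenter: position with the maximum number of reads in the interval
    peakcenter = start + coverage.index(max(coverage))
    # AvgVal: average coverage within the interval
    avgval = float(sum(coverage)) / length
    return nprobes, peakcenter, avgval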