def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
    '''Module to generate rMATS statement.

    Offers the option to permute group name labels and calculates the
    read length, which must be identical in all reads.

    Arguments
    ---------
    gtffile: string
        path to :term:`gtf` file
    designfile: string
        path to design file
    pvalue: string
        threshold for FDR testing
    strand: string
        strandedness option: can be 'fr-unstranded', 'fr-firststrand'
        or 'fr-secondstrand'
    outdir: string
        directory path for rMATS results
    permute : 1 or 0
        option to activate random shuffling of sample groups
    '''

    design = Expression.ExperimentalDesign(designfile)
    if permute == 1:
        design.table.group = random.choice(
            list(itertools.permutations(design.table.group)))

    group1 = ",".join(
        ["%s.bam" % x for x in design.getSamplesInGroup(design.groups[0])])
    with open(outdir + "/b1.txt", "w") as f:
        f.write(group1)

    group2 = ",".join(
        ["%s.bam" % x for x in design.getSamplesInGroup(design.groups[1])])
    with open(outdir + "/b2.txt", "w") as f:
        f.write(group2)

    readlength = BamTools.estimateTagSize(design.samples[0] + ".bam")

    statement = '''rMATS
    --b1 %(outdir)s/b1.txt
    --b2 %(outdir)s/b2.txt
    --gtf <(gunzip -c %(gtffile)s)
    --od %(outdir)s
    --readLength %(readlength)s
    --cstat %(pvalue)s
    --libType %(strand)s
    ''' % locals()

    # add the paired-end flag if the bam files contain paired reads
    if BamTools.isPaired(design.samples[0] + ".bam"):
        statement += '''-t paired'''

    statement += ''' > %(outdir)s/%(designfile)s.log '''

    P.run()
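# A minimal usage sketch for runRMATS. The design file, geneset and
# output directory names below are hypothetical; pvalue is handed to
# rMATS --cstat and strand to --libType, as in the function above.
def exampleRunRMATS():
    runRMATS(gtffile="geneset.gtf.gz",
             designfile="design.tsv",
             pvalue="0.0001",
             strand="fr-unstranded",
             outdir="rmats.dir")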
def buildPicardGCStats(infile, outfile, genome_file):
    """run picard:CollectGcBiasMetrics

    Collect GC bias metrics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    """
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectGcBiasMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    CHART_OUTPUT=%(outfile)s.pdf
    SUMMARY_OUTPUT=%(outfile)s.summary
    >& %(outfile)s'''

    P.run()
def isPaired(filename):
    '''return "T" if the bam file contains paired end reads, "F" otherwise.'''
    if BamTools.isPaired(filename):
        return "T"
    else:
        return "F"
def bamToBed(infile, outfile, min_insert_size=0, max_insert_size=1000):
    """convert bam to bed with bedtools.

    For paired-end data, pairs are merged and filtered by insert size.
    """
    scriptsdir = "/ifs/devel/andreas/cgat/scripts"

    if BamTools.isPaired(infile):
        # merge pairs and output the strand as well
        # (pass a single string, not a list, as the statement is
        # executed with shell=True)
        statement = (
            "cat %(infile)s "
            "| python %(scriptsdir)s/bam2bed.py "
            "--merge-pairs "
            "--min-insert-size=%(min_insert_size)i "
            "--max-insert-size=%(max_insert_size)i "
            "--log=%(outfile)s.log "
            "--bed-format=6 "
            "> %(outfile)s" % locals())
    else:
        statement = "bamToBed -i %(infile)s > %(outfile)s" % locals()

    E.debug("executing statement '%s'" % statement)

    retcode = subprocess.call(statement,
                              cwd=os.getcwd(),
                              shell=True)
    if retcode < 0:
        raise OSError("Child was terminated by signal %i: \n%s\n" %
                      (-retcode, statement))

    return outfile
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectMultipleMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run()
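# A sketch of how a picard metrics task is typically wired into a
# CGAT/ruffus pipeline. The glob, output suffix and the PARAMS key for
# the genome fasta are illustrative assumptions, not part of this module:
# @transform("*.bam", suffix(".bam"), ".picard_stats")
# def runAlignmentStats(infile, outfile):
#     buildPicardAlignmentStats(infile, outfile, PARAMS["genome_fasta"])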
def buildGeneLevelReadCounts(infiles, outfile):
    '''compute read counts and coverage of exons with reads.'''

    bamfile, exons = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    # ignore multi-mapping reads
    statement = '''
    zcat %(exons)s
    | python %(scriptsdir)s/gtf2table.py
    --reporter=genes
    --bam-file=%(bamfile)s
    --counter=length
    --prefix="exons_"
    --counter=%(counter)s
    --prefix=""
    --counter=read-coverage
    --prefix=coverage_
    --min-mapping-quality=%(counting_min_mapping_quality)i
    --multi-mapping=ignore
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()
def buildTranscriptLevelReadCounts(infiles, outfile):
    '''count reads falling into transcripts of protein coding gene models.

    .. note::
       In paired-end data sets, each mate will be counted. Thus the
       actual read counts are approximately twice the fragment counts.
    '''
    bamfile, geneset = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    statement = '''
    zcat %(geneset)s
    | python %(scriptsdir)s/gtf2table.py
    --reporter=transcripts
    --bam-file=%(bamfile)s
    --counter=length
    --prefix="exons_"
    --counter=%(counter)s
    --prefix=""
    --counter=read-coverage
    --prefix=coverage_
    --min-mapping-quality=%(counting_min_mapping_quality)i
    --multi-mapping=ignore
    --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''
    P.run()
def runFeatureCounts(annotations_file,
                     bamfile,
                     outfile,
                     nthreads=4,
                     strand=2,
                     options=""):
    '''run featureCounts on *annotations_file* with *bamfile*.

    If the bam file is paired, paired-end counting is enabled and the
    bam file automatically sorted.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s;
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    job_threads = nthreads

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
    zcat %(annotations_file)s > %(annotations_tmp)s;
    checkpoint;
    %(paired_processing)s
    featureCounts %(options)s
    -T %(nthreads)i
    -s %(strand)s
    -b
    -a %(annotations_tmp)s
    %(paired_options)s
    -o %(outfile)s
    %(bamfile)s
    >& %(outfile)s.log;
    checkpoint;
    gzip -f %(outfile)s;
    checkpoint;
    rm -rf %(tmpdir)s
    '''
    P.run()
def buildPicardCoverageStats(infile, outfile, baits, regions):
    '''run picard:CollectHsMetrics

    Generate coverage statistics for regions of interest from a bed
    file using Picard.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    baits : :term:`bed` formatted file of bait regions
    regions : :term:`bed` formatted file of target regions
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectHsMetrics
    BAIT_INTERVALS=%(baits)s
    TARGET_INTERVALS=%(regions)s
    INPUT=%(infile)s
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=LENIENT''' % locals()
    P.run()
def buildPicardRnaSeqMetrics(infiles, strand, outfile):
    '''run picard:CollectRnaSeqMetrics

    Arguments
    ---------
    infiles : list
        Input filename in :term:`BAM` format and genome file in
        refflat format
        (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat).
    strand : string
        Strand specificity, passed through to the picard ``STRAND``
        option.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3
    infile, genome = infiles

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s CollectRnaSeqMetrics
    REF_FLAT=%(genome)s
    INPUT=%(infile)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    STRAND=%(strand)s
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()
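# Usage sketch: strand is passed through to the picard STRAND option,
# which accepts NONE, FIRST_READ_TRANSCRIPTION_STRAND and
# SECOND_READ_TRANSCRIPTION_STRAND. The file names here are
# hypothetical.
def exampleBuildPicardRnaSeqMetrics():
    buildPicardRnaSeqMetrics(("sample.bam", "refflat.txt"),
                             strand="NONE",
                             outfile="sample.rnaseqmetrics")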
def buildPicardInsertSizeStats(infile, outfile, genome_file):
    '''run picard:CollectInsertSizeMetrics

    Collect insert size statistics.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''CollectInsertSizeMetrics
    INPUT=%(infile)s
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run()
def buildPicardDuplicateStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard and keep the
    duplicate-marked :term:`bam` file. Pair duplication is properly
    handled, including inter-chromosomal cases. SE data is also
    handled. The metrics contain a histogram that estimates the
    return from additional sequencing.

    Note that Picard counts reads, but these are in fact alignments.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename of the marked :term:`bam` file; metrics are
        written to ``outfile.duplicate_metrics``.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    statement = '''picard %(picard_opts)s MarkDuplicates
    INPUT=%(infile)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s.duplicate_metrics
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT;
    '''
    statement += '''samtools index %(outfile)s;'''
    P.run()
def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard; the marked bam file itself
    is discarded (written to /dev/null).

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''
    job_memory = PICARD_MEMORY
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from
    # gsnap. These can be identified by the custom XT tag and are
    # removed before marking duplicates.
    if ".gsnap.bam" in infile:
        tmpf = P.getTempFile(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    os.environ["CGAT_JAVA_OPTS"] = \
        "-Xmx%s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC" % PICARD_MEMORY

    statement += '''MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run()

    os.unsetenv("CGAT_JAVA_OPTS")

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)
def SPMRWithMACS2(infile, outfile):
    '''Calculate signal per million reads with MACS2, output bedGraph.'''

    # --SPMR asks MACS2 to generate a pileup signal file of
    # 'fragment pileup per million reads'
    sample = infile
    WCE = sample.replace("-sample", "-WCE")
    name = P.snip(outfile, ".Macs2SPMR.log").split("/")[-1]
    fragment_size = PARAMS["macs2_fragment_size"]
    job_memory = "10G"

    if BamTools.isPaired(sample):
        statement = '''macs2 callpeak --format=BAMPE
        --treatment %(sample)s
        --verbose=10
        --name=%(name)s
        --outdir=macs2.dir
        --qvalue=0.1
        --bdg
        --SPMR
        --control %(WCE)s
        --mfold 5 50
        --gsize 1.87e9 >& %(outfile)s''' % locals()
    else:
        statement = '''macs2 callpeak --format=BAM
        --treatment %(sample)s
        --verbose=10
        --name=%(name)s
        --outdir=macs2.dir
        --qvalue=0.1
        --bdg
        --SPMR
        --control %(WCE)s
        --tsize %(fragment_size)s
        --mfold 5 50
        --gsize 1.87e9 >& %(outfile)s''' % locals()

    print statement
    P.run()
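# Note: with --bdg --SPMR, macs2 callpeak writes the normalised pileup
# as <name>_treat_pileup.bdg and the local bias as
# <name>_control_lambda.bdg into --outdir (here macs2.dir). A fold
# enrichment track would require a separate follow-up step, e.g. a
# hypothetical:
# macs2 bdgcmp -t <name>_treat_pileup.bdg -c <name>_control_lambda.bdg -m FE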
def countDEXSeq(infiles, outfile):
    '''create counts for DEXSeq

    Counts bam reads against exon features in the flattened gtf. The
    required python script is provided by DEXSeq and builds on HTSeq.

    Parameters
    ----------
    infiles[0]: string
        :term:`bam` file input
    infiles[1]: string
        :term:`gff` output from buildGff function
    outfile : string
        A :term:`txt` file containing results
    DEXSeq_strandedness : string
       :term:`PARAMS`. Specifies strandedness, options are 'yes',
       'no' and 'reverse'
    '''

    infile, gfffile = infiles
    ps = PYTHONSCRIPTSDIR
    if BamTools.isPaired(infile):
        paired = "yes"
    else:
        paired = "no"
    strandedness = PARAMS["DEXSeq_strandedness"]

    statement = '''python %(ps)s/dexseq_count.py
    -p %(paired)s
    -s %(strandedness)s
    -r pos
    -f bam
    %(gfffile)s
    %(infile)s
    %(outfile)s'''
    P.run()
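# A minimal usage sketch, assuming a hypothetical flattened annotation
# from a buildGff-style task and a position-sorted bam file:
def exampleCountDEXSeq():
    countDEXSeq(("sample.bam", "geneset.flattened.gff"),
                "sample.dexseq_counts.txt")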
def buildPicardAlignmentStats(infile, outfile, genome_file):
    '''run picard:CollectMultipleMetrics

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    genome_file : string
        Filename with genomic sequence.
    '''
    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        P.touch(outfile)
        return

    # Picard seems to have a problem if quality information is missing
    # or there is no sequence/quality information within the bam file.
    # Thus, add it explicitly.
    statement = '''cat %(infile)s
    | cgat bam2bam -v 0 --method=set-sequence --output-sam
    | picard %(picard_opts)s CollectMultipleMetrics
    INPUT=/dev/stdin
    REFERENCE_SEQUENCE=%(genome_file)s
    ASSUME_SORTED=true
    OUTPUT=%(outfile)s
    VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s'''

    P.run()
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard'):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for counting-based
    methods. This method is not appropriate for RNA-Seq.

    Optional steps include:

    * deduplication - remove duplicate reads
    * quality score filtering - remove reads below a certain quality
      score
    * paired-end data - merge pairs
    * paired-end data - filter by insert size
    '''
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()

    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b
        %(current_file)s
        2>> %%(bedfile)s.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            statement.append('''samtools rmdup - - ''')
        elif filtering_dedup_method == 'picard':
            statement.append('''MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            2>> %%(bedfile)s.log ''' % locals())

        nfiles += 1
        current_file = next_file

    if is_paired:
        statement.append('''cat %(current_file)s
        | python %(scriptsdir)s/bam2bed.py
        --merge-pairs
        --min-insert-size=%(filtering_min_insert_size)i
        --max-insert-size=%(filtering_max_insert_size)i
        --log=%(bedfile)s.log
        -
        | python %(scriptsdir)s/bed2bed.py
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.log
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip
        > %(bedfile)s''')
    else:
        statement.append('''cat %(current_file)s
        | python %(scriptsdir)s/bam2bed.py
        --log=%(bedfile)s.log
        -
        | python %(scriptsdir)s/bed2bed.py
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.log
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip
        > %(bedfile)s''')

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)

    P.run()

    os.unlink(tmpdir)
def runFeatureCounts(annotations_file,
                     bamfile,
                     outfile,
                     job_threads=4,
                     strand=0,
                     options=""):
    '''run FeatureCounts to collect read counts.

    If `bamfile` is paired, paired-end counting is enabled and the
    bam file automatically sorted.

    Arguments
    ---------
    annotations_file : string
        Filename with gene set in :term:`gtf` format.
    bamfile : string
        Filename with short reads in :term:`bam` format.
    outfile : string
        Output filename in :term:`tsv` format.
    job_threads : int
        Number of threads to use.
    strand : int
        Strand option in FeatureCounts.
    options : string
        Options to pass on to FeatureCounts.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(job_threads)i -n
            -o %(bam_tmp)s %(bamfile)s;
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    statement = '''mkdir %(tmpdir)s;
    zcat %(annotations_file)s > %(annotations_tmp)s;
    checkpoint;
    %(paired_processing)s
    featureCounts %(options)s
    -T %(job_threads)i
    -s %(strand)s
    -a %(annotations_tmp)s
    %(paired_options)s
    -o %(outfile)s
    %(bamfile)s
    >& %(outfile)s.log;
    checkpoint;
    gzip -f %(outfile)s;
    checkpoint;
    rm -rf %(tmpdir)s
    '''
    P.run()
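# A minimal usage sketch for the variant defined directly above, with
# hypothetical file names. strand follows the featureCounts -s
# convention: 0 = unstranded, 1 = stranded, 2 = reversely stranded.
def exampleRunFeatureCounts():
    runFeatureCounts("geneset.gtf.gz",
                     "sample.bam",
                     "sample.counts.tsv.gz",
                     job_threads=4,
                     strand=0)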
def loadZinba(infile, outfile, bamfile,
              tablename=None,
              controlfile=None):
    '''load Zinba results in *tablename*.

    This method loads only positive peaks. It filters peaks by
    p-value, q-value and fold change, loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`,
    where track is derived from ``infile`` and assumed to end in
    :file:`.zinba`.

    If no peaks were predicted, an empty table is created.

    This method creates :file:`<outfile>.tsv.gz` with the results of
    the filtering and uses the refined peak locations.

    Zinba calls peaks in regions where there are many reads inside
    the control. Thus this method applies a filtering step, removing
    all intervals in which the control has a peak higher than
    read length / 2.

    .. note::
       Zinba peaks can be overlapping. This method does not merge
       overlapping intervals.
    '''

    track = P.snip(os.path.basename(infile), ".zinba")
    folder = os.path.dirname(infile)
    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    counter = E.Counter()

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
    elif IOTools.isEmpty(infilename):
        E.warn("no data in %s" % infilename)
    else:
        # filter peaks
        shift = getPeakShiftFromZinba(infile)
        assert shift is not None, \
            "could not determine peak shift from Zinba file %s" % infile

        E.info("%s: found peak shift of %i" % (track, shift))

        samfiles = [pysam.Samfile(bamfile, "rb")]
        offsets = [shift / 2]

        if controlfile:
            controlfiles = [pysam.Samfile(controlfile, "rb")]
            readlength = BamTools.estimateTagSize(controlfile)
            control_max_peakval = readlength // 2
            E.info("removing intervals in which control has peak "
                   "higher than %i reads" % control_max_peakval)
        else:
            controlfiles = None

        id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    npeaks, peakcenter, length, avgval, peakval, nreads = \
                        countPeaks(peak.contig,
                                   peak.refined_start,
                                   peak.refined_end,
                                   controlfiles,
                                   offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak
                npeaks, peakcenter, length, avgval, peakval, nreads = \
                    countPeaks(peak.contig,
                               peak.refined_start,
                               peak.refined_end,
                               samfiles,
                               offsets)

                outtemp.write("\t".join(map(str, (
                    id, peak.contig,
                    peak.refined_start, peak.refined_end,
                    npeaks, peakcenter, length, avgval, peakval, nreads,
                    1.0 - peak.posterior, 1.0, peak.fdr,
                    peak.refined_start + peak.summit - 1,
                    peak.height))) + "\n")
                id += 1
                counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_intervals" % track

    statement = '''
    cgat csv2db %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --add-index=contig,start
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard',
                            filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for counting-based
    methods. This method is not appropriate for RNA-Seq.

    Optional steps include quality filtering, removal of non-unique
    matches and deduplication. For paired end data, pairs are merged
    and optionally filtered by insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set, remove reads with a quality score below given
        threshold.
    filtering_dedup : bool
        If True, deduplicate data.
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.
    '''
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()

    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b
        %(current_file)s
        2>> %%(bedfile)s.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_nonunique:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''cat %(current_file)s
        | python %%(scriptsdir)s/bam2bam.py
        --method=filter
        --filter-method=unique,mapped
        --log=%%(bedfile)s.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            statement.append('''samtools rmdup - - ''')
        elif filtering_dedup_method == 'picard':
            statement.append('''MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            2>> %%(bedfile)s.log ''' % locals())

        nfiles += 1
        current_file = next_file

    if is_paired:
        statement.append('''cat %(current_file)s
        | python %(scriptsdir)s/bam2bed.py
        --merge-pairs
        --min-insert-size=%(filtering_min_insert_size)i
        --max-insert-size=%(filtering_max_insert_size)i
        --log=%(bedfile)s.log
        -
        | python %(scriptsdir)s/bed2bed.py
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.log
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip
        > %(bedfile)s''')
    else:
        statement.append('''cat %(current_file)s
        | python %(scriptsdir)s/bam2bed.py
        --log=%(bedfile)s.log
        -
        | python %(scriptsdir)s/bed2bed.py
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.log
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip
        > %(bedfile)s''')

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)

    P.run()
def runFeatureCounts(annotations_file,
                     bamfile,
                     outfile,
                     nthreads=4,
                     strand=2,
                     options=""):
    '''run featureCounts on *annotations_file* with *bamfile*.

    If the bam file is paired, paired-end counting is enabled and the
    bam file automatically sorted.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    # use the basename so that the temporary bam lives inside tmpdir
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s;
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    job_options = "-pe dedicated %i" % nthreads

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
    zcat %(annotations_file)s > %(annotations_tmp)s;
    checkpoint;
    %(paired_processing)s
    featureCounts %(options)s
    -T %(nthreads)i
    -s %(strand)s
    -b
    -a %(annotations_tmp)s
    %(paired_options)s
    -o %(outfile)s
    %(bamfile)s
    >& %(outfile)s.log;
    checkpoint;
    gzip -f %(outfile)s;
    checkpoint;
    rm -rf %(tmpdir)s
    '''
    P.run()
def runFeatureCounts(annotations_file,
                     bamfile,
                     outfile,
                     job_threads=4,
                     strand=0,
                     options=""):
    '''run FeatureCounts to collect read counts.

    If `bamfile` is paired, paired-end counting is enabled and the
    bam file automatically sorted.

    Arguments
    ---------
    annotations_file : string
        Filename with gene set in :term:`gtf` format.
    bamfile : string
        Filename with short reads in :term:`bam` format.
    outfile : string
        Output filename in :term:`tsv` format.
    job_threads : int
        Number of threads to use.
    strand : int
        Strand option in FeatureCounts.
    options : string
        Options to pass on to FeatureCounts.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(job_threads)i -n
            %(bamfile)s %(bam_prefix)s;
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    statement = '''mkdir %(tmpdir)s;
    zcat %(annotations_file)s > %(annotations_tmp)s;
    checkpoint;
    %(paired_processing)s
    featureCounts %(options)s
    -T %(job_threads)i
    -s %(strand)s
    -a %(annotations_tmp)s
    %(paired_options)s
    -o %(outfile)s
    %(bamfile)s
    >& %(outfile)s.log;
    checkpoint;
    gzip -f %(outfile)s;
    checkpoint;
    rm -rf %(tmpdir)s
    '''
    P.run()
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard',
                            filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for counting-based
    methods. This method is not appropriate for RNA-Seq.

    Optional steps include quality filtering, removal of non-unique
    matches and deduplication. For paired end data, pairs are merged
    and optionally filtered by insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set, remove reads with a quality score below given
        threshold.
    filtering_dedup : bool
        If True, deduplicate data.
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.
    '''
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()

    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    if filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b
        %(current_file)s
        2>> %%(bedfile)s.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_nonunique:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''cat %(current_file)s
        | python %%(scriptsdir)s/bam2bam.py
        --method=filter
        --filter-method=unique,mapped
        --log=%%(bedfile)s.log
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        if filtering_dedup_method == 'samtools':
            statement.append('''samtools rmdup - - ''')
        elif filtering_dedup_method == 'picard':
            statement.append('''MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            2>> %%(bedfile)s.log ''' % locals())

        nfiles += 1
        current_file = next_file

    if is_paired:
        statement.append('''cat %(current_file)s
        | python %(scriptsdir)s/bam2bed.py
        --merge-pairs
        --min-insert-size=%(filtering_min_insert_size)i
        --max-insert-size=%(filtering_max_insert_size)i
        --log=%(bedfile)s.log
        -
        | python %(scriptsdir)s/bed2bed.py
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.log
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip
        > %(bedfile)s''')
    else:
        statement.append('''cat %(current_file)s
        | python %(scriptsdir)s/bam2bed.py
        --log=%(bedfile)s.log
        -
        | python %(scriptsdir)s/bed2bed.py
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.log
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip
        > %(bedfile)s''')

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)

    P.run()

    os.unlink(tmpdir)
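# A minimal usage sketch with a hypothetical input bam. The insert size
# bounds, genome location and scriptsdir are interpolated from PARAMS
# when P.run() executes the assembled statement.
def exampleConvertReadsToIntervals():
    convertReadsToIntervals("sample.bam",
                            "sample.bed.gz",
                            filtering_quality=10,
                            filtering_dedup=True,
                            filtering_dedup_method="picard",
                            filtering_nonunique=True)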