def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within a file are
    merged) before intersection.

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:
        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:
        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            # the statement is %-formatted with infiles first; the escaped
            # %%(outfile)s is left for interpolation by P.run()
            statement = '''
            intersectBed -u -a %s -b %s
            | cut -f 1,2,3,4,5
            | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
            | bgzip > %%(outfile)s
            ''' % (infiles[0], infiles[1])
            P.run()

    else:
        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(fn):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            # check the current file *fn*, not infiles[0]
            if IOTools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s
            | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp;
            mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s'''
        P.run()

        os.unlink(tmpfile)

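# Illustrative sketch (hypothetical helper, pure Python, not used by the
# pipeline): the semantics of the incremental intersection above - keep
# intervals of the first set that overlap at least one interval in every
# other set. Real runs go through mergeBed/intersectBed.
def _sketch_intersect_all(beds):
    """beds: list of interval lists, each interval a (contig, start, end)."""
    def overlaps(iv, intervals):
        return any(contig == iv[0] and start < iv[2] and end > iv[1]
                   for (contig, start, end) in intervals)
    result = beds[0]
    for other in beds[1:]:
        result = [iv for iv in result if overlaps(iv, other)]
    return result
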
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against known motif databases using tomtom.'''

    # check the input before creating the temporary directory, so an
    # empty input does not leave an unused tmpdir behind
    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    tmpdir = P.getTempDir(".")
    to_cluster = True

    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
    > %(outfile)s.log
    '''
    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)

def subtractBedFiles(infile, subtractfile, outfile):
    '''subtract intervals in *subtractfile* from *infile*
    and store in *outfile*.
    '''

    if IOTools.isEmpty(subtractfile):
        shutil.copyfile(infile, outfile)
        return
    elif IOTools.isEmpty(infile):
        P.touch(outfile)
        return

    statement = '''
    intersectBed -v -a %(infile)s -b %(subtractfile)s
    | cut -f 1,2,3,4,5
    | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
    | bgzip > %(outfile)s ;
    tabix -p bed %(outfile)s
    '''
    P.run()

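# Illustrative sketch (hypothetical helper): what the awk fragment
# '$4=++a' in the statements above does - renumber the name column of a
# 5-column bed stream starting from 1.
def _sketch_renumber_bed(bed_lines):
    """bed_lines: iterable of tab-separated bed lines with >= 5 columns."""
    for i, line in enumerate(bed_lines, start=1):
        fields = line.rstrip("\n").split("\t")[:5]
        fields[3] = str(i)
        yield "\t".join(fields) + "\n"
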
def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("track\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("motif\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile, "--allow-empty")

    os.unlink(outf.name)

def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")

        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(
            map(str, (track, npeaks, width, masking, fn))) + "\n")

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.getTempFile(".")

    outf.write("method\ttrack\n")

    for infile in infiles:
        if IOTools.isEmpty(infile):
            continue
        method = re.match("(.+).dir/", infile).groups()[0]
        track = os.path.basename(".".join(infile.split(".")[:-1]))
        outf.write("%s\t%s\n" % (method, track))

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

def buildPseudogenes(infiles, outfile, dbhandle):
    '''build a set of pseudogenes.

    Transcripts are extracted from the GTF file and designated as
    pseudogenes if:

    * the gene_type or transcript_type contains the phrase
      "pseudo". This information is taken from the database.

    * the feature is 'processed_transcript' and has similarity to
      protein coding genes. Similarity is assessed by aligning the
      transcript and peptide set against each other with exonerate_.

    Pseudogenic transcripts can overlap with protein coding
    transcripts.

    Arguments
    ---------
    infiles : list
       Filenames of ENSEMBL geneset in :term:`gtf` format and
       associated peptide sequences in :term:`fasta` format.
    outfile : filename
       Output in :term:`gtf` format with inferred or annotated
       pseudogenes.
    dbhandle : object
       Database handle for extracting transcript biotypes.
    '''

    infile_gtf, infile_peptides_fasta = infiles

    # JJ - there are also 'nontranslated_CDS', but no explanation of these
    if PARAMS["genome"].startswith("dm"):
        E.warn("Ensembl dm genome annotations only contain source"
               " 'pseudogenes' - skipping exonerate step")
        statement = """zcat %(infile_gtf)s
        | awk '$2 ~ /pseudogene/'
        | gzip > %(outfile)s"""
        P.run()
        return

    tmpfile1 = P.getTempFilename(shared=True)

    # collect processed transcripts and save as fasta sequences
    statement = '''
    zcat %(infile_gtf)s
    | awk '$2 ~ /processed/'
    | python %(scriptsdir)s/gff2fasta.py
            --is-gtf
            --genome-file=%(genome_dir)s/%(genome)s
            --log=%(outfile)s.log
    > %(tmpfile1)s
    '''
    P.run()

    if IOTools.isEmpty(tmpfile1):
        E.warn("no pseudogenes found")
        os.unlink(tmpfile1)
        P.touch(outfile)
        return

    model = "protein2dna"

    # map processed transcripts against peptide sequences
    statement = '''
    cat %(tmpfile1)s
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=100
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
               --query %(infile_peptides_fasta)s
               --model %(model)s
               --bestn 1
               --score 200
               --ryo \\"%%qi\\\\t%%ti\\\\t%%s\\\\n\\"
               --showalignment no --showsugar no
               --showcigar no --showvulgar no
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''
    P.run()

    os.unlink(tmpfile1)

    # keep the highest-scoring peptide for each transcript
    inf = IOTools.openFile("%s.links.gz" % outfile)
    best_matches = {}
    for line in inf:
        peptide_id, transcript_id, score = line[:-1].split("\t")
        score = int(score)
        if transcript_id in best_matches and \
           best_matches[transcript_id][0] > score:
            continue
        best_matches[transcript_id] = (score, peptide_id)

    inf.close()

    E.info("found %i best links" % len(best_matches))
    new_pseudos = set(best_matches.keys())

    cc = dbhandle.cursor()
    known_pseudos = set([x[0] for x in cc.execute(
        """SELECT DISTINCT transcript_id
        FROM transcript_info
        WHERE transcript_biotype like '%pseudo%' OR
        gene_biotype like '%pseudo%' """)])

    E.info("pseudogenes from: processed_transcripts=%i, known_pseudos=%i, "
           "intersection=%i" % (
               len(new_pseudos),
               len(known_pseudos),
               len(new_pseudos.intersection(known_pseudos))))

    all_pseudos = new_pseudos.union(known_pseudos)

    c = E.Counter()

    outf = IOTools.openFile(outfile, "w")
    inf = GTF.iterator(IOTools.openFile(infile_gtf))
    for gtf in inf:
        c.input += 1
        if gtf.transcript_id not in all_pseudos:
            continue
        c.output += 1
        outf.write("%s\n" % gtf)

    outf.close()

    E.info("exons: %s" % str(c))

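# Illustrative sketch (hypothetical helper): the best-hit selection
# performed on the exonerate links table above - keep the
# highest-scoring peptide for each transcript.
def _sketch_best_matches(lines):
    """lines: iterable of 'peptide_id<TAB>transcript_id<TAB>score'."""
    best = {}
    for line in lines:
        peptide_id, transcript_id, score = line.rstrip("\n").split("\t")
        score = int(score)
        if transcript_id not in best or score > best[transcript_id][0]:
            best[transcript_id] = (score, peptide_id)
    return best
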
def buildNUMTs(infile, outfile):
    '''output set of potential nuclear mitochondrial genes (NUMTs).

    This function works by aligning the mitochondrial chromosome
    against the genome using exonerate_. This can take a while.

    Arguments
    ---------
    infile : string
       Ignored.
    outfile : filename
       Output in :term:`gtf` format with potential NUMTs.
    '''
    if not PARAMS["numts_mitochrom"]:
        E.info("skipping numts creation")
        P.touch(outfile)
        return

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    if PARAMS["numts_mitochrom"] not in fasta:
        E.warn("mitochondrial genome %s not found" %
               PARAMS["numts_mitochrom"])
        P.touch(outfile)
        return

    tmpfile_mito = P.getTempFilename(".")

    statement = '''
    python %(scriptsdir)s/index_fasta.py
           --extract=%(numts_mitochrom)s
           --log=%(outfile)s.log
           %(genome_dir)s/%(genome)s
    > %(tmpfile_mito)s
    '''
    P.run()

    if IOTools.isEmpty(tmpfile_mito):
        E.warn("mitochondrial genome empty.")
        os.unlink(tmpfile_mito)
        P.touch(outfile)
        return

    format = ("qi", "qS", "qab", "qae",
              "ti", "tS", "tab", "tae",
              "s",
              "pi",
              "C")
    format = "\\\\t".join(["%%%s" % x for x in format])

    # collect all results
    min_score = 100

    statement = '''
    cat %(genome_dir)s/%(genome)s.fasta
    | %(cmd-farm)s --split-at-regex=\"^>(\S+)\" --chunk-size=1
    --log=%(outfile)s.log
    "exonerate --target %%STDIN%%
               --query %(tmpfile_mito)s
               --model affine:local
               --score %(min_score)i
               --showalignment no --showsugar no
               --showcigar no --showvulgar no
               --ryo \\"%(format)s\\n\\"
    "
    | grep -v -e "exonerate" -e "Hostname"
    | gzip > %(outfile)s.links.gz
    '''
    P.run()

    # convert to gtf
    inf = IOTools.openFile("%s.links.gz" % outfile)
    outf = IOTools.openFile(outfile, "w")

    min_score = PARAMS["numts_score"]

    c = E.Counter()

    for line in inf:
        (query_contig, query_strand, query_start, query_end,
         target_contig, target_strand, target_start, target_end,
         score, pid, alignment) = line[:-1].split("\t")

        c.input += 1
        score = int(score)
        if score < min_score:
            c.skipped += 1
            continue

        # orient minus-strand hits so that start < end
        if target_strand == "-":
            target_start, target_end = target_end, target_start

        gff = GTF.Entry()
        gff.contig = target_contig
        gff.start, gff.end = int(target_start), int(target_end)
        assert gff.start < gff.end

        gff.strand = target_strand
        gff.score = int(score)
        gff.feature = "numts"
        gff.gene_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        gff.transcript_id = "%s:%s-%s" % (query_contig, query_start, query_end)
        outf.write("%s\n" % str(gff))
        c.output += 1

    inf.close()
    outf.close()

    E.info("filtering numts: %s" % str(c))

    os.unlink(tmpfile_mito)

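# Illustrative sketch (hypothetical helper, not used by the pipeline):
# the per-alignment rule applied when converting exonerate hits to GTF
# above - drop hits below the score threshold and flip minus-strand
# target coordinates so that start < end.
def _sketch_orient_hit(target_start, target_end, target_strand,
                       score, min_score):
    """Return (start, end) for a kept hit, or None if filtered out."""
    if score < min_score:
        return None
    if target_strand == "-":
        target_start, target_end = target_end, target_start
    assert target_start < target_end
    return target_start, target_end
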
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
          -ev %(mast_evalue)f %(mast_options)s
          >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
          -ev %(mast_evalue)f %(mast_options)s
          >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)

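# Illustrative sketch (hypothetical helper): the gzipped output written
# above interleaves ':: motif = ... ::' section markers with mast.txt
# blocks; this splits it back into (motif, foreground/background, lines)
# sections.
def _sketch_split_mast_output(lines):
    sections, header, block = [], None, []
    for line in lines:
        if line.startswith(":: motif = "):
            if header is not None:
                sections.append((header[0], header[1], block))
            fields = line.strip(": \n").split(" - ")
            header = (fields[0].replace("motif = ", ""), fields[1])
            block = []
        else:
            block.append(line)
    if header is not None:
        sections.append((header[0], header[1], block))
    return sections
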
def BedFileVenn(infiles, outfile):
    '''count shared and unique intervals between two replicated
    :term:`bed` files (liver and testes) and write the counts to
    *outfile*.

    The two files are merged into a single interval set first;
    merged intervals are then classified as shared or unique to
    either file.
    '''
    # the original unpacked into bed1, bed2 but used liver/testes below
    liver, testes = infiles
    liver_name = P.snip(os.path.basename(liver), ".replicated.bed")
    testes_name = P.snip(os.path.basename(testes), ".replicated.bed")

    to_cluster = True
    statement = '''cat %(liver)s %(testes)s
    | mergeBed -i stdin
    | awk 'OFS="\\t" {print $1,$2,$3,"CAPseq"NR}'
    > replicated_intervals/liver.testes.merge.bed;
    echo "Total merged intervals" > %(outfile)s;
    cat replicated_intervals/liver.testes.merge.bed | wc -l >> %(outfile)s;

    echo "Liver & testes" >> %(outfile)s;
    intersectBed -a replicated_intervals/liver.testes.merge.bed
                 -b %(liver)s -u
    | intersectBed -a stdin -b %(testes)s -u
    > replicated_intervals/liver.testes.shared.bed;
    cat replicated_intervals/liver.testes.shared.bed | wc -l >> %(outfile)s;

    echo "Testes only" >> %(outfile)s;
    intersectBed -a replicated_intervals/liver.testes.merge.bed
                 -b %(liver)s -v
    > replicated_intervals/%(testes_name)s.liver.testes.unique.bed;
    cat replicated_intervals/%(testes_name)s.liver.testes.unique.bed
    | wc -l >> %(outfile)s;

    echo "Liver only" >> %(outfile)s;
    intersectBed -a replicated_intervals/liver.testes.merge.bed
                 -b %(testes)s -v
    > replicated_intervals/%(liver_name)s.liver.testes.unique.bed;
    cat replicated_intervals/%(liver_name)s.liver.testes.unique.bed
    | wc -l >> %(outfile)s;

    sed -i '{N;s/\\n/\\t/g}' %(outfile)s; '''
    # the original built this statement but never ran it, then fell
    # through to a verbatim copy of intersectBedFiles(); the dead copy
    # has been removed and the statement is executed instead
    P.run()

def loadZinba(infile, outfile, bamfile,
              tablename=None,
              controlfile=None):
    '''load Zinba results into *tablename*.

    This method loads only positive peaks. It filters peaks by
    q-value and, if a control is supplied, by peak height in the
    control, and re-calculates peakcenter, peakval, ... using the
    supplied bamfile.

    If *tablename* is not given, it will be
    :file:`<track>_intervals` where track is derived from ``infile``
    and assumed to end in :file:`.zinba`.

    If no peaks were predicted, an empty table is created.

    This method creates :file:`<outfile>.tsv.gz` with the results of
    the filtering.

    This method uses the refined peak locations.

    Zinba peaks can be overlapping. This method does not merge
    overlapping intervals.

    Zinba calls peaks in regions where there are many reads inside
    the control. Thus this method applies a filtering step removing
    all intervals in which there is a peak of more than
    readlength / 2 height in the control.

    .. note::
       Zinba calls peaks that are overlapping.
    '''

    track = P.snip(os.path.basename(infile), ".zinba")
    folder = os.path.dirname(infile)

    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    counter = E.Counter()

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
    elif IOTools.isEmpty(infilename):
        E.warn("no data in %s" % infilename)
    else:
        # filter peaks
        shift = getPeakShiftFromZinba(infile)
        assert shift is not None, \
            "could not determine peak shift from Zinba file %s" % infile

        E.info("%s: found peak shift of %i" % (track, shift))

        samfiles = [pysam.Samfile(bamfile, "rb")]
        offsets = [shift / 2]

        if controlfile:
            controlfiles = [pysam.Samfile(controlfile, "rb")]
            readlength = BamTools.estimateTagSize(controlfile)
            control_max_peakval = readlength // 2
            E.info("removing intervals in which control has peak "
                   "higher than %i reads" % control_max_peakval)
        else:
            controlfiles = None

        id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    npeaks, peakcenter, length, avgval, peakval, nreads = \
                        countPeaks(peak.contig,
                                   peak.refined_start,
                                   peak.refined_end,
                                   controlfiles,
                                   offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak
                npeaks, peakcenter, length, avgval, peakval, nreads = \
                    countPeaks(peak.contig,
                               peak.refined_start,
                               peak.refined_end,
                               samfiles,
                               offsets)

                outtemp.write("\t".join(map(str, (
                    id, peak.contig,
                    peak.refined_start, peak.refined_end,
                    npeaks, peakcenter, length, avgval, peakval, nreads,
                    1.0 - peak.posterior, 1.0, peak.fdr,
                    peak.refined_start + peak.summit - 1,
                    peak.height))) + "\n")
                id += 1
                counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_intervals" % track

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --add-index=contig,start
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)

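# Illustrative sketch (hypothetical helper): the two filters applied to
# each Zinba peak above, as a single predicate. control_peakval is None
# when no control file was supplied.
def _sketch_keep_zinba_peak(fdr, max_qvalue, control_peakval, readlength):
    if fdr > max_qvalue:
        return False
    if control_peakval is not None and control_peakval > readlength // 2:
        return False
    return True
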
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--set-nh", dest="set_nh", action="store_true",
                      help="sets the NH flag. The file needs to be "
                      "sorted by readname [%default]")

    parser.add_option("--unset-unmapped-mapq", dest="unset_unmapped_mapq",
                      action="store_true",
                      help="sets the mapping quality of unmapped "
                      "reads to 0 [%default]")

    parser.add_option("--set-sequence", dest="set_sequence",
                      action="store_true",
                      help="sets the sequence to 'A's (a valid base) and "
                      "the quality to 'F's, which is defined in all "
                      "fastq scoring schemes [%default]")

    parser.add_option("--strip", dest="strip", type="choice",
                      choices=("sequence", "quality", "match"),
                      help="remove parts of the bam-file. Note that "
                      "stripping the sequence will "
                      "also strip the quality values [%default]")

    parser.add_option("--unstrip", dest="unstrip", action="store_true",
                      help="add sequence and quality into bam file "
                      "[%default]")

    parser.add_option("--filter", dest="filter", action="append",
                      type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter bam file. The option denotes "
                      "the property that is "
                      "used to determine better match [%default]")

    parser.add_option("--reference-bam", dest="reference_bam", type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--fastq1", "-1", dest="fastq_pair1", type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired [%default]")

    parser.add_option("--fastq2", "-2", dest="fastq_pair2", type="string",
                      help="fastq file with read information for second "
                      "in pair [%default]")

    parser.add_option("--keep-first-base", dest="keep_first_base",
                      action="store_true",
                      help="keep first base of reads such that "
                      "gtf2table.py will only consider the "
                      "first base in its counts.")

    parser.set_defaults(
        filter=[],
        set_nh=False,
        unset_unmapped_mapq=False,
        output_sam=False,
        reference_bam=None,
        strip=None,
        unstrip=None,
        force=False,
        set_sequence=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
        keep_first_base=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    bamfiles = []

    if options.stdin != sys.stdin:
        bamfiles.append(options.stdin.name)

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please supply one or more bam-files as command line "
                "arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            if options.output_sam:
                pysam_out = pysam.Samfile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.Samfile("-", "wb", template=pysam_in)
        else:
            if IOTools.isEmpty(bamfile):
                E.warn('skipping empty file %s' % bamfile)
                continue
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if options.filter:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter:
                remove_mismatches = True

            elif "CM" in options.filter:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter,
                remove_unique="non-unique" in options.filter,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start
            pre_check_f = lambda x: None

            if options.unset_unmapped_mapq:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            # note: deliberately disabled ("and False") in the original;
            # NH setting is done via _bam2bam.SetNH below
            if options.set_nh and False:
                def set_nh(i):
                    for key, reads in itertools.groupby(
                            i, lambda x: x.qname):
                        l = list(reads)
                        nh = len(l)
                        for read in l:
                            if not read.is_unmapped:
                                t = dict(read.tags)
                                t['NH'] = nh
                                read.tags = list(t.iteritems())
                            yield read
                it = set_nh(it)

            if options.set_sequence:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length
                        yield read
                it = set_sequence(it)

            if options.strip is not None:
                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    if reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip == "sequence":
                    it = strip_sequence(it)
                    pre_check_f = check_sequence
                elif options.strip == "quality":
                    it = strip_quality(it)
                    pre_check_f = check_quality
                elif options.strip == "match":
                    it = strip_match(it)

            if options.unstrip:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.Fastqfile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" %
                                x.name)
                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")

                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = \
                                fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = \
                                fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if options.set_nh:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            if options.keep_first_base:
                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('processing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        continue

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)

            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()

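# Illustrative sketch (hypothetical helper): the composition pattern used
# in main() above. Each enabled option wraps the read iterator in another
# generator, so all transformations run lazily in a single pass over the
# BAM stream, e.g.
# _sketch_compose(reads, [unset_unmapped_mapq, keep_first_base]).
def _sketch_compose(reads, transforms):
    it = iter(reads)
    for transform in transforms:
        it = transform(it)
    return it
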
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--methods", dest="methods", type="choice",
                      action="append",
                      choices=("filter",
                               "keep-first-base",
                               "set-nh",
                               "set-sequence",
                               "strip-sequence",
                               "strip-quality",
                               "unstrip",
                               "unset-unmapped-mapq",
                               "downsample-single",
                               "downsample-paired"),
                      help="methods to apply [%default]")

    parser.add_option("--strip-method", dest="strip_method", type="choice",
                      choices=("all", "match"),
                      help="define which sequences/qualities to strip. "
                      "match means that stripping only applies to entries "
                      "without mismatches (requires NM tag to be present). "
                      "[%default]")

    parser.add_option("--filter-method", dest="filter_methods",
                      action="append", type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help="filter method to apply to remove alignments "
                      "from a bam file. Multiple methods can be supplied "
                      "[%default]")

    parser.add_option("--reference-bam-file", dest="reference_bam",
                      type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--first-fastq-file", "-1", dest="fastq_pair1",
                      type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--second-fastq-file", "-2", dest="fastq_pair2",
                      type="string",
                      help="fastq file with read information for second "
                      "in pair. Used for unstripping sequence "
                      "and quality scores [%default]")

    parser.add_option("--downsample", dest="downsample",
                      type="int",
                      help="Number of reads to downsample to")

    parser.set_defaults(methods=[],
                        output_sam=False,
                        reference_bam=None,
                        filter_methods=[],
                        strip_method="all",
                        force=False,
                        inplace=False,
                        fastq_pair1=None,
                        fastq_pair2=None,
                        downsample=None,
                        random_seed=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # random.seed(options.random_seed)
    bamfiles = []

    if options.stdin != sys.stdin:
        from_stdin = True
        bamfiles.append(options.stdin.name)
    else:
        from_stdin = False

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please supply one or more bam-files as command line "
                "arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    to_stdout = False

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.AlignmentFile(bamfile, "rb")

        if bamfile == "-" or (from_stdin and bamfile == options.stdin.name):
            to_stdout = True
            if options.output_sam:
                pysam_out = pysam.AlignmentFile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.AlignmentFile("-", "wb", template=pysam_in)
        else:
            if IOTools.isEmpty(bamfile):
                E.warn('skipping empty file %s' % bamfile)
                continue
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.AlignmentFile(tmpfile.name, "wb",
                                            template=pysam_in)

        if "filter" in options.methods:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter_methods:
                remove_mismatches = True

            elif "CM" in options.filter_methods:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.AlignmentFile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter_methods,
                remove_unique="non-unique" in options.filter_methods,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter_methods,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            if pysam_ref:
                pysam_ref.close()

            # do not write to stdlog in the middle of a SAM/BAM stdout stream
            if options.stdlog != options.stdout:
                E.info("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start
            pre_check_f = lambda x: None

            if "unset-unmapped-mapq" in options.methods:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if "set-sequence" in options.methods:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length
                        yield read
                it = set_sequence(it)

            if "strip-sequence" in options.methods or "strip-quality" in \
               options.methods:
                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    if reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip_method == "all":
                    if "strip-sequence" in options.methods:
                        it = strip_sequence(it)
                        pre_check_f = check_sequence
                    elif "strip-quality" in options.methods:
                        it = strip_quality(it)
                        pre_check_f = check_quality
                elif options.strip_method == "match":
                    it = strip_match(it)

            if "unstrip" in options.methods:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.FastxFile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" %
                                x.name)
                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")

                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = \
                                fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = \
                                fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if "set-nh" in options.methods:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            # Needs to be refactored to make it more general
            # (last base, midpoint, ..)
            # note: the original tested "keep_first_base", which is not
            # among the declared choices; fixed to the hyphenated form
            if "keep-first-base" in options.methods:
                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            # Refactoring: use cache to also do a pre-check for
            # stdin input.
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('processing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        continue

            if "downsample-single" in options.methods:
                if not options.downsample:
                    raise ValueError("Please provide downsample size")
                else:
                    down = SubsetBam(pysam_in=it,
                                     downsample=options.downsample,
                                     paired_end=None,
                                     single_end=True,
                                     random_seed=options.random_seed)
                    it = down.downsample_single()

            if "downsample-paired" in options.methods:
                if not options.downsample:
                    raise ValueError("Please provide downsample size")
                else:
                    down = SubsetBam(pysam_in=it,
                                     downsample=options.downsample,
                                     paired_end=True,
                                     single_end=None,
                                     random_seed=options.random_seed)
                    it = down.downsample_paired()

            # continue processing till end
            for read in it:
                pysam_out.write(read)

        pysam_in.close()
        pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)

            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()

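# Illustrative sketch: SubsetBam's implementation is not shown here; one
# standard way to downsample single-end reads to a fixed count in a
# single pass is reservoir sampling (Algorithm R). Hypothetical helper,
# not the pipeline's actual code; 'random' would normally be imported at
# the top of the module.
import random

def _sketch_reservoir_sample(reads, n, seed=None):
    rng = random.Random(seed)
    reservoir = []
    for i, read in enumerate(reads):
        if i < n:
            reservoir.append(read)
        else:
            # replace an existing entry with probability n / (i + 1)
            j = rng.randint(0, i)
            if j < n:
                reservoir[j] = read
    return reservoir
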