def Compute_MAD(a, c=0.6745, axis=None):
    """
    Median Absolute Deviation of an array: median(abs(a - median(a))) / c

    c = 0.6745 is the constant that rescales the MAD to the standard
    deviation of a normal distribution; it is used by default. The axis
    argument is accepted for API compatibility but is not used.

    Adapted from http://code.google.com/p/agpy/source/browse/trunk/agpy/mad.py
    Downloaded 7-Dec-2012.
    """
    LE.debug("Computing MAD for {0} #elements ({1} bytes)".format(
        len(a), sys.getsizeof(a)))
    d = VcfAnnotator.Median(a)
    deviations = sorted(float(abs(i - d)) / c for i in a)
    return VcfAnnotator.Median(deviations)
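# Hedged usage sketch (not part of the original module): it illustrates the
# MAD-to-sigma relationship encoded by c = 0.6745. For roughly Gaussian data,
# median(|x - median(x)|) / 0.6745 approximates the standard deviation while
# staying robust to outliers. `_median` below is a hypothetical stand-in for
# VcfAnnotator.Median.
def _median(values):
    s = sorted(values)
    n = len(s)
    mid = n // 2
    return s[mid] if n % 2 else (s[mid - 1] + s[mid]) / 2.0

def _mad_demo():
    depths = [10.0, 12.0, 11.0, 13.0, 12.0, 500.0]  # one gross outlier
    med = _median(depths)
    mad = _median([abs(x - med) / 0.6745 for x in depths])
    return mad  # ~1.48: tracks the bulk spread, barely moved by the 500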
def merge(self):
    LE.debug("Doing merge, writing in " + self.output)
    filestomerge = [pysam.Samfile(i) for i in self.newsams]
    newHeader = {}
    # dedupe read groups by ID and reference sequences by SN,
    # keeping first-seen order for the sequences
    rgs = {}
    for j in itertools.chain(*[i.header["RG"] for i in filestomerge]):
        rgs.setdefault(j["ID"], j)
    sqs = OrderedDict()
    for j in itertools.chain(*[i.header["SQ"] for i in filestomerge]):
        sqs.setdefault(j["SN"], j)
    newHeader["HD"] = filestomerge[0].header["HD"]
    newHeader["RG"] = list(rgs.values())
    newHeader["SQ"] = list(sqs.values())
    # flatten @PG records into @CO comment lines and record the command line
    newHeader["CO"] = list(
        itertools.chain(*[i.header["CO"] for i in filestomerge]))
    pgs = list(itertools.chain(*[i.header["PG"] for i in filestomerge]))
    for i in pgs:
        newHeader["CO"].append("\t".join([":".join(k) for k in i.items()]))
    newHeader["CO"] = list(set(newHeader["CO"]))
    newHeader["CO"].append("CMD:{0}".format(" ".join(sys.argv)))
    for i in self.commandsHistory:
        newHeader["CO"].append("CMD:{0}".format(i))
    outBam = pysam.Samfile(self.output, "wb", header=newHeader)
    for j in filestomerge:
        for i in j:
            outBam.write(i)
    outBam.close()
    # old-style pysam.sort writes <prefix>.bam; move it over the output path
    pysam.sort(self.output, self.output)
    shutil.move(self.output + ".bam", self.output)
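# Hedged sketch (illustrative only, not called by the pipeline): merge()
# dedupes @RG entries by ID and @SQ entries by SN while keeping first-seen
# order for the sequences. The same setdefault pattern on plain dicts:
from collections import OrderedDict

def _dedupe_by_key(records, key):
    seen = OrderedDict()
    for rec in records:
        seen.setdefault(rec[key], rec)  # first occurrence wins
    return list(seen.values())

# _dedupe_by_key([{"ID": "rg1"}, {"ID": "rg1"}, {"ID": "rg2"}], "ID")
# -> [{'ID': 'rg1'}, {'ID': 'rg2'}]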
def Index_Fasta(self):
    """
    Create/update the .fasta.fai file using samtools faidx.
    """
    LE.info('Creating index {0}.fai.'.format(self.outfastapath))
    self.cleanUpExecution(*COMPASSCFG['tools']['samtools'].execute(
        append="faidx {0}".format(self.outfastapath)))
def substitutePars(self, cad):
    vardict = dict([(i, getattr(self, i)) for i in dir(self)
                    if i.startswith("mpileup") or i.startswith("bcftools")
                    or i.startswith("opts")])
    cmdStr = cad.format(**vardict)
    LE.debug("Running command: [{0}]".format(cmdStr))
    return cmdStr
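# Hedged sketch (illustrative only): substitutePars expands {placeholders}
# in a command template from same-named instance attributes via
# str.format(**vardict). The mechanism in isolation, with a hypothetical
# placeholder name:
assert "mpileup -q {opts_minmapqual}".format(
    opts_minmapqual=30) == "mpileup -q 30"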
def merge(self):
    LE.info("Merging SAMfiles from different readgroup mappings")
    input = pysam.Samfile(self.input)
    self.seqstat = self.generateSeqStats(self.input)
    newheaders = dict(input.header.items())
    newheaders["SQ"] = input.header["SQ"]
    input.close()
    newheaders["PG"] = [{
        "PN": "bwa",
        "VN": "0.7.10",
        "CL": self.commandsHistory[0]
    }]
    if "CO" in newheaders:
        newheaders["CO"] = list(set(newheaders["CO"]))
    else:
        newheaders["CO"] = []
    newheaders["CO"].append("CMD:{0}".format(" ".join(sys.argv)))
    for i in self.commandsHistory:
        newheaders["CO"].append("CMD:{0}".format(i))
    LE.debug("Doing merge, writing in " + self.output)
    unsortedBamName = self.output + "_unsorted.bam"
    output = pysam.Samfile(unsortedBamName, "wb", header=newheaders)
    for i in self.readgroups:
        with pysam.Samfile(i + "_alignment.sam") as source:
            for j in source:
                # skip supplementary alignments (SAM flag 0x800)
                if not j.flag & 2048:
                    output.write(j)
    output.close()
    # old-style pysam.sort writes <output>.bam; move it over the output path
    pysam.sort(unsortedBamName, self.output)
    os.unlink(unsortedBamName)
    shutil.move(self.output + ".bam", self.output)
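# Hedged sketch (illustrative only): merge() drops supplementary alignments
# by testing SAM flag bit 0x800 (2048). The relevant bit tests on plain
# integers:
def _is_supplementary(flag):
    return bool(flag & 2048)  # 0x800: supplementary alignment

def _is_first_in_pair(flag):
    return bool(flag & 64)    # 0x40: first segment in the template

# _is_supplementary(2048) -> True; _is_first_in_pair(99) -> True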
def cleanUpExecution(self, cmd, stdout, stderr, errcode):
    """
    Log the output of a finished command and raise if it failed.

    Args:
        cmd: the command line that was executed
        stdout: its captured standard output
        stderr: its captured standard error
        errcode: its exit status; non-zero aborts with an exception
    """
    LE.debug(StringIO(stdout))
    if errcode:
        LE.error(StringIO(stderr))
        raise Exception(
            "CMD [{0}] exit with status [{1}]".format(cmd, errcode))
def map(self):
    sf = pysam.Samfile(self.input)
    rgs = [i['ID'] for i in sf.header["RG"]]
    sf.close()
    self.newsams = [
        os.path.join(os.getcwd(), str(uuid.uuid4())) + ".sam"
        for i in range(len(rgs))
    ]
    for readgroup, samout in zip(rgs, self.newsams):
        append = ("--substitutionrate={0} -g {1} -h {1} -M {2} -o {3} "
                  "--logfile={3}.log --readgroup=ID:{4} "
                  "--outputformat=sam -v 3 ").format(
                      self.subrate, self.ref, self.input, samout, readgroup)
        if self.keepgoodreads:
            append += " --bamkeepgoodreads "
        if self.alignquals:
            append += " --alignquals "
        if self.baq:
            append += " --baq "
        cmd, stdout, stderr, errcode = COMPASSCFG["tools"][
            "stampy"].execute(append=append)
        LE.debug(StringIO(stdout), "stdout")
        LE.debug(StringIO(stderr), "stderr")
        if errcode:
            LE.critical("Stampy execution failed {0}".format(errcode))
            raise Exception("Stampy execution failed {0}".format(errcode))
        self.insertsizes[readgroup] = self.generateSeqStats(samout)
def markDuplicates(self):
    cmd, stdout, stderr, errcode = COMPASSCFG["tools"]["picard"].execute(
        source="path",
        prepend="java -jar",
        file="MarkDuplicates.jar",
        append="I={0} O={1}.dedup METRICS_FILE=metrics.txt "
               "ASSUME_SORTED=true VERBOSITY=DEBUG "
               "VALIDATION_STRINGENCY=SILENT".format(self.output,
                                                     self.output))
    LE.debug(StringIO(stdout), "stdout")
    LE.debug(StringIO(stderr), "stderr")
    if errcode:
        LE.critical("MarkDuplicates execution failed {0}".format(errcode))
        raise Exception(
            "MarkDuplicates execution failed {0}".format(errcode))
    # shutil.move(self.output + ".dedup", self.output)
def Fix_Fasta_Headers(self):
    """
    Create a correctly formatted fasta file. Contigs must be named
    REFID[, REFID-2, REFID-3, ...].
    """
    LE.info('Creating master fasta file {0}.'.format(self.outfastapath))

    # helper generator that reformats each fasta record on the fly
    def _fixed_records():
        for i, contig in enumerate(SeqIO.parse(self.infasta, 'fasta'), 1):
            # first contig keeps the bare refid; later ones get -2, -3, ...
            correct_name = (self.newrefid if i == 1
                            else '{0}-{1}'.format(self.newrefid, i))
            contig.id = correct_name
            if contig.name != correct_name:
                contig.name = correct_name
                contig.description = '{0} {1} {2}'.format(
                    correct_name, self.newrefid, contig.description)
            yield contig

    SeqIO.write(_fixed_records(), self.outfastapath, 'fasta')
    return
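# Hedged sketch (illustrative only): the renaming rule above gives the first
# contig the bare refid and later contigs a -2, -3, ... suffix. Standalone,
# with a made-up refid:
def _contig_name(newrefid, index):
    # index is 1-based, matching enumerate(..., 1) in _fixed_records
    return newrefid if index == 1 else '{0}-{1}'.format(newrefid, index)

# [_contig_name('REF01', i) for i in (1, 2, 3)]
# -> ['REF01', 'REF01-2', 'REF01-3']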
def map(self):
    for i in self.readgroups:
        fq1 = self.tmpdir + "/" + i + "-A.fq"
        fq2 = self.tmpdir + "/" + i + "-B.fq"
        samout = self.tmpdir + "/" + i + ".sam"
        output = open(samout, "w")
        p = COMPASSCFG["tools"]["bwa"].popen(
            append="mem -R '@RG\\tID:{0}' {1} {2} {3} -L 20 -B 3 -O 6 -T 20"
            .format(i, self.ref, fq1, fq2),
            stderr=subprocess.PIPE,
            stdout=output)
        self.commandsHistory.append(p.cmd)
        LE.debug(p.stderr, "stderr")
        errcode = p.wait()
        if errcode:
            LE.error("BWA tool failed")
            raise Exception("BWA tool failed")
        output.close()
        os.unlink(fq1)
        os.unlink(fq2)
        self.insertsizes[i] = self.generateSeqStats(samout)
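# Hedged sketch (illustrative only): map() streams bwa's SAM output straight
# to disk by handing an open file object to the subprocess as stdout. The
# same pattern with a generic argv list:
import subprocess

def _run_to_file(argv, outpath):
    with open(outpath, "w") as out:
        p = subprocess.Popen(argv, stdout=out, stderr=subprocess.PIPE)
        _, err = p.communicate()
        if p.returncode:
            raise Exception("command failed: {0}".format(err))

# _run_to_file(["echo", "hello"], "/tmp/hello.txt")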
def markDuplicates(self):
    cmd, stdout, stderr, errcode = COMPASSCFG['tools']['picard'].execute(
        source='path',
        file='MarkDuplicates.jar',
        prepend='java -jar',
        append="I={0} O={0}.dedup METRICS_FILE={1}_metrics.txt "
               "ASSUME_SORTED=true VERBOSITY=DEBUG "
               "VALIDATION_STRINGENCY=SILENT".format(self.output,
                                                     self.input))
    LE.debug(StringIO(stdout), "stdout")
    LE.debug(StringIO(stderr), "stderr")
    if errcode:
        LE.critical("MarkDuplicates execution failed {0}".format(errcode))
        raise Exception(
            "MarkDuplicates execution failed {0}".format(errcode))
    shutil.move(self.output + ".dedup", self.output)
def execCommand(self, cmd):
    """
    Run a shell command, logging stdout and raising on failure.

    Args:
        cmd: command line to execute, as a single string
    """
    origcmd = cmd
    LE.debug("Running command: [{0}]".format(cmd))
    cmd = shlex.split(cmd)
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    errcode = p.returncode  # communicate() has already waited for exit
    LE.debug(StringIO(stdout))
    if errcode:
        LE.error(StringIO(stderr))
        raise Exception(
            "CMD [{0}] exit with status [{1}]".format(origcmd, errcode))
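# Hedged sketch (illustrative only): execCommand relies on shlex.split to
# turn one command string into an argv list, so quoted arguments survive as
# single tokens:
import shlex
assert shlex.split('samtools faidx "my ref.fasta"') == [
    'samtools', 'faidx', 'my ref.fasta']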
def generateAll(self):
    self.Fix_Fasta_Headers()
    self.Create_Indexes()
    self.Make_Repeat_Mask_Txt()
    LE.info("Everything went OKAY!")
parser.add_argument(
    '-dh',
    dest="headerinfo",
    help="Default unmapped header; you must specify "
         "[readgroup,platform,lib,sample,SeqCentre] "
         "ex: -dh RG0045,ILLUMINA,LIB03,SN123,Sanger",
    default=None)
parser.add_argument(
    '-o',
    dest="output",
    help="Output BAM file (default: stdout)",
    default="-")
args = parser.parse_args()
LE.info("Input fastq files in {0} , {1}".format(args.fq1, args.fq2))
LE.info("Output bam in {0}".format(args.output))
if args.headerinfo == "None":
    args.headerinfo = None
if ((not args.header and not args.headerinfo)
        or (args.header and args.headerinfo)):
    print(
        "You must specify either a header file for the SAM header or "
        "default header information (-H/-dh)")
    sys.exit(-1)
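# Hedged alternative sketch (not the original behaviour): the exactly-one-of
# -H/-dh check above can also be enforced by argparse itself with a required
# mutually exclusive group, at the cost of a less tailored error message.
# Argument names mirror the script; help strings are assumptions.
# group = parser.add_mutually_exclusive_group(required=True)
# group.add_argument('-H', dest='header', help='SAM header file')
# group.add_argument('-dh', dest='headerinfo', help='default header info')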
        parser.add_argument(i[0],
                            dest=i[1],
                            help=i[4],
                            default=False,
                            action='store_true')
    else:
        parser.add_argument(i[0], dest=i[1], help=i[4], default="DISABLED")
args = parser.parse_args()
try:
    for i in FILTERS.availableFilters():
        value = getattr(args, i[1])
        if value and value != "DISABLED":
            FILTERS.setUpFilter(i[1], value)
except ParameterError as e:
    LE.critical("Parameter setup Error: {0}".format(e.message))
    dump_exc()
try:
    vcfFile = GormVcf(args.invcf)
    FILTERS.filterVcf(vcfFile,
                      outvcf_path=args.outvcf,
                      outvcfIndel=args.outvcfIndel,
                      outfasta=args.outfasta,
                      stats=args.outstats,
                      guuid=args.guuid,
                      refid=args.ref_id)
    print("Done")
except:
    dump_exc()
def Make_Repeat_Mask_Txt(self,
                         word_size=17,
                         gapopen=5,
                         e_thresh=0.0001,
                         perc_identity=90,
                         gapextend=2,
                         min_length=75):
    """
    Run blastn on the contigs in the input fasta file against the BLAST
    database built at the reference prefix. Parameters are set to the NCBI
    recommended defaults for blastn.
    """
    outfastapath = os.path.join(self.outdir,
                                '{0}.fasta'.format(self.newrefid))
    prefix = os.path.join(self.outdir, self.newrefid)
    maskpath = prefix + '_repmask.array'
    regionspath = prefix + '_repregions.array'
    statspath = prefix + '.stats'
    blastn_cline = blastn(
        cmd=COMPASSCFG['tools']['blast']['path'] + "blastn",
        db=prefix,
        query=outfastapath,
        dust='no',
        word_size=word_size,
        gapopen=gapopen,
        gapextend=gapextend,
        evalue=e_thresh,
        perc_identity=perc_identity,
        outfmt='"6 qseqid sseqid pident length qstart qend sstart send"')
    try:
        blast_out, blast_err = blastn_cline()
        assert not blast_err
    except (AppError, AssertionError) as err:
        raise Exception(
            'Error: Blast failed during construction of repeat mask: '
            '{0}'.format(err))
    repmask_fp = open(maskpath, 'w')
    repregions_fp = open(regionspath, 'w')
    total_bp = 0
    repetitive_bp = 0
    num_regions = 0
    # each blast record is the result for one query sequence (contig)
    blast_stream = StringIO(blast_out)
    prev_header = None
    for contig_count, contig in enumerate(
            SeqIO.parse(outfastapath, 'fasta'), 1):
        if prev_header != contig.name:
            repregions_fp.write('>{0}\n'.format(contig.name))
            prev_header = contig.name
        total_bp += len(contig)
        repmask = np.zeros(len(contig), dtype=np.bool)
        try:
            fields = blast_stream.next().split()
        except StopIteration:
            fields = None
        while fields and fields[0] == contig.name:
            contig_name, match_name = fields[:2]
            hit_perc_ident = float(fields[2])
            hit_length, q_start, q_end, s_start, s_end = (
                int(x) for x in fields[3:])
            (x1, y1), (x2, y2) = sorted(
                ((q_start, q_end), sorted((s_start, s_end))))
            # mask the hit unless it is just the contig matching itself
            if hit_length >= min_length and (
                    contig_name != match_name or
                    not (x2 <= x1 <= y2 and x2 <= y1 <= y2)):
                repmask[q_start - 1:q_end] = True
            try:
                fields = blast_stream.next().split()
            except StopIteration:
                # end of blast hits
                fields = None
        # output repmask as 1s and 0s, 100 per line
        repmask_fp.write('>{0}\n'.format(contig.name))
        for i in xrange(0, len(repmask), 100):
            j = min(i + 100, len(repmask))
            repmask_fp.write('{0}\n'.format(
                ''.join(str(i) for i in repmask[i:j].astype(int))))
        # identify positions of repetitive regions (runs of 1s in the
        # repmask array), 0-based numbering
        region_starts = list(np.where(repmask[1:] > repmask[:-1])[0] + 1)
        region_ends = list(np.where(repmask[1:] < repmask[:-1])[0] + 1)
        # special case: full blast hit for this contig against another
        # contig
        if repmask.all():
            region_starts = [0]
            region_ends = [len(repmask)]
        # fix ends, in case regions start at the first position of the
        # sequence or end at the last
        if region_starts and ((not region_ends) or
                              (region_starts[-1] > region_ends[-1])):
            region_ends.append(len(repmask))
        if region_ends and ((not region_starts) or
                            (region_starts[0] > region_ends[0])):
            region_starts = [0] + region_starts
        repregions_fp.writelines(
            '{0}\t{1}\n'.format(rs, re)
            for rs, re in izip(region_starts, region_ends))
        repetitive_bp += repmask.sum()
        num_regions += len(region_starts)
    repmask_fp.close()
    repregions_fp.close()
    pct_repetitive = '{0:.2f}'.format(
        (float(repetitive_bp) / total_bp) * 100)
    LE.debug('Info: Repetitive regions for all of {0}: {1}/{2} bp '
             '({3}%)'.format(self.newrefid, repetitive_bp, total_bp,
                             pct_repetitive))
    # save result summary
    statsvalues = '\t'.join(
        (self.newrefid, self.newrefid, str(contig_count), str(total_bp),
         str(repetitive_bp), str(num_regions), pct_repetitive))
    with open(statspath, 'w') as o:
        o.write('refid\trefcd\tcontigs\tnumbp\trepetitivebp\t'
                'repregions\trepetitivepct\n{values}\n'.format(
                    values=statsvalues))
    return
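# Hedged sketch (illustrative only): the run-length logic above finds starts
# and ends of runs of True in a boolean mask by comparing the mask against
# itself shifted by one. A simplified standalone variant (boundary handling
# condensed relative to the original):
import numpy as np

def _runs(mask):
    mask = np.asarray(mask, dtype=bool)
    starts = list(np.where(mask[1:] > mask[:-1])[0] + 1)
    ends = list(np.where(mask[1:] < mask[:-1])[0] + 1)
    if mask.size and mask[0]:
        starts = [0] + starts      # run begins at the first position
    if mask.size and mask[-1]:
        ends = ends + [mask.size]  # run extends to the last position
    return list(zip(starts, ends))

# _runs([0, 1, 1, 0, 1]) -> [(1, 3), (4, 5)]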
def dumpStdError(self):
    self.stderror.seek(0)
    LE.error(self.stderror)
def createFQs(self):
    fileList = {}
    for i in self.readgroups:
        fileList[i] = [
            open(self.tmpdir + "/" + i + "-1.fq", "w"),
            open(self.tmpdir + "/" + i + "-2.fq", "w")
        ]
    for i in pysam.Samfile(self.input):
        rg = dict(i.tags)["RG"]
        # SAM flag 0x40: first read of the pair
        if i.flag & 64:
            fileList[rg][0].write("@{0}\n{1}\n+\n{2}\n".format(
                i.qname, i.seq, i.qual))
        else:
            fileList[rg][1].write("@{0}\n{1}\n+\n{2}\n".format(
                i.qname, i.seq, i.qual))
    # pick up extra reads from any matching output BAMs in self.path
    pattern = re.compile(
        'output[0-9]_[0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}'
        '-[0-9a-z]{12}_bam.bam')
    for name in os.listdir(self.path):
        if re.search(pattern, name):
            bam = os.path.join(self.path, name)
            for i in pysam.Samfile(bam):
                rg = dict(i.tags)["RG"]
                if i.flag & 64:
                    fileList[rg][0].write("@{0}\n{1}\n+\n{2}\n".format(
                        i.qname, i.seq, i.qual))
                else:
                    fileList[rg][1].write("@{0}\n{1}\n+\n{2}\n".format(
                        i.qname, i.seq, i.qual))
    for i in fileList.values():
        i[0].close()
        i[1].close()
    # sort each mate file so the pairs can be matched back up
    for i in self.readgroups:
        fqSort(FastQReader(self.tmpdir + "/" + i + "-1.fq"),
               self.tmpdir + "/" + i + "-1.sort")
        fqSort(FastQReader(self.tmpdir + "/" + i + "-2.fq"),
               self.tmpdir + "/" + i + "-2.sort")
        os.unlink(self.tmpdir + "/" + i + "-1.fq")
        os.unlink(self.tmpdir + "/" + i + "-2.fq")
    # re-pair the sorted files into the final -A/-B fastq pair
    for i in self.readgroups:
        pn = PairFqNormalizer(self.tmpdir + "/" + i + "-1.sort",
                              self.tmpdir + "/" + i + "-2.sort",
                              FastQWriter(self.tmpdir + "/" + i + "-A.fq"),
                              FastQWriter(self.tmpdir + "/" + i + "-B.fq"),
                              True, 1)
        pn.normalize()
        os.unlink(self.tmpdir + "/" + i + "-1.sort")
        os.unlink(self.tmpdir + "/" + i + "-2.sort")
    self.fqfiles = [(self.tmpdir + "/" + i + "-A.fq",
                     self.tmpdir + "/" + i + "-B.fq")
                    for i in self.readgroups]
    LE.debug("FQs created")
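# Hedged sketch (illustrative only): createFQs emits raw 4-line FASTQ
# records. The record layout, in isolation:
def _fastq_record(name, seq, qual):
    # @name / sequence / separator / per-base qualities, one field per line
    return "@{0}\n{1}\n+\n{2}\n".format(name, seq, qual)

# _fastq_record("read1", "ACGT", "IIII") -> '@read1\nACGT\n+\nIIII\n'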
    calcLDforadjacentsites=options.bcftools_calcLDforadjacentsites,
    scaledsubstmutrate=options.bcftools_scaledsubstmutrate,
    indeltosubstratio=options.bcftools_indeltosubstratio,
    variantifprobltint=options.bcftools_variantifprobltint,
    typeofprior=options.bcftools_typeofprior,
    inbam=options.inbam,
    inref=options.ref_id,
    pileup_out=options.outpileup)
try:
    c.runPileup()
    if c.annotate():
        print("Error with annotation!")
        c.dumpStdError()
        sys.exit(-1)
    c.merge(options.output)
    c.clean()
    LE.info("Finished!")
except:
    dump_exc()
    # best-effort cleanup; ignore secondary failures
    try:
        c.dumpStdError()
        c.clean()
    except:
        pass