def __init__(self, params):
    """
    Normalize the workflow parameters and prepare the run directory layout.

    The following is done, in order:
      - cleanPyEnv() is called
      - selected boolean options are normalized via safeSetBool()
      - RNA mode overrides minCandidateVariantSize
      - missing (None) bam lists are replaced with empty lists
      - runDir and its workspace/results sub-directories are created via ensureDir()
      - the reference fasta and its ".fai" index are checked via checkFile()
      - chromosome order/sizes and call regions are read and stored on params
      - self.paths and several derived boolean flags are set

    Raises:
        Exception: if params.referenceFasta is None.
    """
    cleanPyEnv()

    self.params = params

    # normalize boolean option input:
    for boolOptionName in ("enableRemoteReadRetrievalForInsertionsInGermlineCallingModes",
                           "enableRemoteReadRetrievalForInsertionsInCancerCallingModes",
                           "useOverlapPairEvidence"):
        safeSetBool(self.params, boolOptionName)

    # Use RNA option for minCandidate size
    if self.params.isRNA:
        self.params.minCandidateVariantSize = self.params.rnaMinCandidateVariantSize

    # format bam lists:
    if self.params.normalBamList is None:
        self.params.normalBamList = []
    if self.params.tumorBamList is None:
        self.params.tumorBamList = []

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)
    self.params.evidenceDir = os.path.join(self.params.resultsDir, "evidence")
    ensureDir(self.params.evidenceDir)
    # self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
    # ensureDir(self.params.reportsDir)

    # The None test has to come before the index path is built: concatenating
    # None with ".fai" would raise a TypeError and hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    # determine subset of chroms where we can skip calling entirely
    (self.params.callRegionList, self.params.chromIsSkipped) = getCallRegions(self.params)

    self.paths = PathInfo(self.params)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
    self.params.isIgnoreAnomProperPair = (self.params.isRNA)

    # always use overlapping pairs for RNA calling
    if self.params.isRNA:
        self.params.useOverlapPairEvidence = True
def main():
    """
    Estimate per-chromosome depth for a bam file and write it to stdout.

    Two samtools commands are run:
      1. "idxstats" supplies, for each chromosome, the two integer columns
         that are stored in chromData (used below as length and read count).
      2. "view -F 4 -s 0.1" supplies records from which an average read length
         is approximated, using only CIGAR strings made of a single match
         segment ("<N>M"). Reading stops after 200000 such records.

    One line is written per chromosome that is at least as long as the average
    read length: chrom, depth, observation count, average read length.

    Raises:
        Exception: if 100000 or fewer usable records were observed.
    """
    import re
    import signal
    import subprocess

    (options, args) = getOptions()

    checkDir(libexecDir)

    samtoolsBin = os.path.join(libexecDir, 'samtools')
    checkFile(samtoolsBin, "samtools binary")

    chromData = {}
    chromList = []

    # The command is passed as an argument list (no shell) so that a bam path
    # containing spaces or shell metacharacters is handed to samtools verbatim.
    idxstatsProc = subprocess.Popen([samtoolsBin, "idxstats", options.bamFile],
                                    stdout=subprocess.PIPE)
    for line in idxstatsProc.stdout:
        word = line.strip().split('\t')
        if word[0] == "*":
            continue

        chromData[word[0]] = (int(word[1]), int(word[2]))
        chromList.append(word[0])

    # release the pipe and reap the child
    idxstatsProc.stdout.close()
    idxstatsProc.wait()

    length = 0
    count = 0

    # match any cigar with a single match sequence:
    matchRex = re.compile("^([0-9]+)M$")

    viewProc = subprocess.Popen([samtoolsBin, "view", "-F", "4", "-s", "0.1", options.bamFile],
                                stdout=subprocess.PIPE)
    for line in viewProc.stdout:
        word = line.strip().split('\t', 7)
        mr = matchRex.match(word[5])
        if mr is None:
            continue
        length += int(mr.group(1))
        count += 1
        if count >= 200000:
            break

    # done with subprocess: without a shell in between, the signal goes to
    # samtools itself; then release the pipe and reap the child
    viewProc.send_signal(signal.SIGINT)
    viewProc.stdout.close()
    viewProc.wait()

    if count <= 100000:
        raise Exception("Unexpected read length approximation results")

    outfp = sys.stdout

    avg_length = float(length) / float(count)

    for chrom in chromList:
        (chromLength, chromReadCount) = chromData[chrom]
        # chromosomes shorter than one average read are not reported
        if chromLength < avg_length:
            continue
        depth = chromReadCount * avg_length / float(chromLength)
        outfp.write("%s\t%.3f\t%s\t%.3f\n" % (chrom, depth, count, avg_length))
def getOptions():
    """
    Parse the command line and return the (options, args) pair from optparse.

    "--in" may be repeated and collects input depth filenames into
    options.inFiles; "--out" sets options.outFile. The help text is printed
    and the process exits with status 2 when any positional argument is
    present or when either "--in" or "--out" is absent. Every input filename
    is passed to checkFile() before returning.
    """
    from optparse import OptionParser

    parser = OptionParser(usage="usage: %prog [options]")

    parser.add_option("--in", type="string", dest="inFiles", metavar="FILE", action="append",
                      help="input depth filename, argument may be provided more than once to provide all input")
    parser.add_option("--out", type="string", dest="outFile", metavar="FILE",
                      help="output depth filename (required)")

    (opts, positional) = parser.parse_args()

    # validate input: stray positional arguments and missing required
    # options are all treated as the same usage error
    isUsageError = (len(positional) != 0) or (opts.inFiles is None) or (opts.outFile is None)
    if isUsageError:
        parser.print_help()
        sys.exit(2)

    for depthFile in opts.inFiles:
        checkFile(depthFile, "input depth")

    return (opts, positional)
def getOptions():
    """
    Parse the command line and return the (options, args) pair from optparse.

    "--bam" sets options.bamFile. The help text is printed and the process
    exits with status 2 when any positional argument is present or when
    "--bam" is absent. The bam filename is passed to checkFile() before
    returning.
    """
    from optparse import OptionParser

    parser = OptionParser(usage="usage: %prog [options] > depth.txt")

    parser.add_option("--bam", type="string", dest="bamFile",
                      help="specify bam file for depth estimation (required)")

    (opts, positional) = parser.parse_args()

    # validate input: a stray positional argument and a missing bam file
    # are both treated as the same usage error
    isUsageError = (len(positional) != 0) or (opts.bamFile is None)
    if isUsageError:
        parser.print_help()
        sys.exit(2)

    checkFile(opts.bamFile, "input bam")

    return (opts, positional)
def __init__(self, params, iniSections):
    """
    Store the workflow parameters and prepare the run directory layout.

    The following is done, in order:
      - cleanPyEnv() is called
      - params and iniSections are stored on the instance
      - RNA mode overrides minCandidateVariantSize
      - missing (None) bam lists are replaced with empty lists
      - runDir and its workspace/results sub-directories are created via ensureDir()
      - the reference fasta and its ".fai" index are checked via checkFile()
      - chromosome order/sizes and call regions are read and stored on params
      - self.paths and derived boolean flags are set

    Raises:
        Exception: if params.referenceFasta is None.
    """
    cleanPyEnv()

    self.params = params
    self.iniSections = iniSections

    # Use RNA option for minCandidate size
    if self.params.isRNA:
        self.params.minCandidateVariantSize = self.params.rnaMinCandidateVariantSize

    # format bam lists:
    if self.params.normalBamList is None:
        self.params.normalBamList = []
    if self.params.tumorBamList is None:
        self.params.tumorBamList = []

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)
    self.params.evidenceDir = os.path.join(self.params.resultsDir, "evidence")
    ensureDir(self.params.evidenceDir)
    # self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
    # ensureDir(self.params.reportsDir)

    # The None test has to come before the index path is built: concatenating
    # None with ".fai" would raise a TypeError and hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    # determine subset of chroms where we can skip calling entirely
    (self.params.callRegionList, self.params.chromIsSkipped) = getCallRegions(self.params)

    self.paths = PathInfo(self.params)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
    self.params.isIgnoreAnomProperPair = (self.params.isRNA)
def __init__(self, params, iniSections):
    """
    Store the workflow parameters and prepare the run directory layout.

    The following is done, in order:
      - PYTHONPATH and PYTHONHOME are removed from the environment if set
      - params and iniSections are stored on the instance
      - runDir and its workspace/results sub-directories are created via ensureDir()
      - the reference fasta and its ".fai" index are checked via checkFile()
      - normalBamList/tumorBamList are built from normalBam/tumorBam,
        leaving the list empty when the corresponding bam is None
      - chromosome order/sizes are read from the fasta index
      - binSize and nonlocalWorkBins are converted to int
      - self.paths is set

    Raises:
        Exception: if params.referenceFasta is None.
    """
    # clear out some potentially destabilizing env variables:
    for key in ("PYTHONPATH", "PYTHONHOME"):
        if key in os.environ:
            del os.environ[key]

    self.params = params
    self.iniSections = iniSections

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)
    # self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
    # ensureDir(self.params.reportsDir)

    # The None test has to come before the index path is built: concatenating
    # None with ".fai" would raise a TypeError and hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # wrap the (optional) single bam arguments into lists:
    self.params.normalBamList = [bam for bam in (self.params.normalBam,) if bam is not None]
    self.params.tumorBamList = [bam for bam in (self.params.tumorBam,) if bam is not None]

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    # sanity check some parameter typing:
    self.params.binSize = int(self.params.binSize)
    self.params.nonlocalWorkBins = int(self.params.nonlocalWorkBins)

    self.paths = PathInfo(self.params)
def __init__(self, params, iniSections):
    """
    Store the workflow parameters and prepare the run directory layout.

    The following is done, in order:
      - PYTHONPATH and PYTHONHOME are removed from the environment if set
      - params and iniSections are stored on the instance
      - missing (None) bam lists are replaced with empty lists
      - runDir and its workspace/results sub-directories are created via ensureDir()
      - the reference fasta and its ".fai" index are checked via checkFile()
      - chromosome order/sizes are read from the fasta index
      - scanSize is derived from scanSizeMb (times 1000000) and
        nonlocalWorkBins is converted to int
      - self.paths and derived boolean flags are set

    Raises:
        Exception: if params.referenceFasta is None.
    """
    # clear out some potentially destabilizing env variables:
    for key in ("PYTHONPATH", "PYTHONHOME"):
        if key in os.environ:
            del os.environ[key]

    self.params = params
    self.iniSections = iniSections

    # format bam lists:
    if self.params.normalBamList is None:
        self.params.normalBamList = []
    if self.params.tumorBamList is None:
        self.params.tumorBamList = []

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)
    # self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
    # ensureDir(self.params.reportsDir)

    # The None test has to come before the index path is built: concatenating
    # None with ".fai" would raise a TypeError and hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    # sanity check some parameter typing:
    MEGABASE = 1000000
    self.params.scanSize = int(self.params.scanSizeMb) * MEGABASE
    self.params.nonlocalWorkBins = int(self.params.nonlocalWorkBins)

    self.paths = PathInfo(self.params)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
    self.params.isIgnoreAnomProperPair = (self.params.isRNA)
def __init__(self, params, iniSections):
    """
    Store the workflow parameters and prepare the run directory layout.

    The following is done, in order:
      - cleanPyEnv() is called
      - params and iniSections are stored on the instance
      - missing (None) bam lists are replaced with empty lists
      - runDir and its workspace/results sub-directories are created via ensureDir()
      - the reference fasta and its ".fai" index are checked via checkFile()
      - chromosome order/sizes are read from the fasta index
      - self.paths and derived boolean flags are set

    Raises:
        Exception: if params.referenceFasta is None.
    """
    cleanPyEnv()

    self.params = params
    self.iniSections = iniSections

    # format bam lists:
    if self.params.normalBamList is None:
        self.params.normalBamList = []
    if self.params.tumorBamList is None:
        self.params.tumorBamList = []

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)
    # self.params.reportsDir=os.path.join(self.params.resultsDir,"reports")
    # ensureDir(self.params.reportsDir)

    # The None test has to come before the index path is built: concatenating
    # None with ".fai" would raise a TypeError and hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    indexRefFasta = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(indexRefFasta, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(indexRefFasta)

    self.paths = PathInfo(self.params)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
    self.params.isIgnoreAnomProperPair = (self.params.isRNA)
def __init__(self, params, PathInfoType):
    """
    Store the workflow parameters and prepare the run directory layout.

    The following is done, in order:
      - cleanPyEnv() is called
      - params is stored on the instance
      - runDir and its workspace/results sub-directories are created via ensureDir()
      - self.paths is built by calling PathInfoType(params)
      - the reference fasta and its ".fai" index are checked via checkFile()
      - chromosome order/sizes are read from the fasta index
      - chromIsSkipped and isHighDepthFilter are set on params

    Args:
        params: option object; its attributes are read and updated in place.
        PathInfoType: callable invoked with params to produce self.paths.

    Raises:
        Exception: if params.referenceFasta is None.
    """
    cleanPyEnv()

    self.params = params

    # make sure run directory is setup:
    self.params.runDir = os.path.abspath(self.params.runDir)
    ensureDir(self.params.runDir)

    # everything that's not intended to be a final result should dump directories/files in workDir
    self.params.workDir = os.path.join(self.params.runDir, "workspace")
    ensureDir(self.params.workDir)

    # all finalized pretty results get transferred to resultsDir
    self.params.resultsDir = os.path.join(self.params.runDir, "results")
    ensureDir(self.params.resultsDir)
    self.params.variantsDir = os.path.join(self.params.resultsDir, "variants")
    ensureDir(self.params.variantsDir)

    # timings and other stats go into statsDir
    self.params.statsDir = os.path.join(self.params.resultsDir, "stats")
    ensureDir(self.params.statsDir)

    self.paths = PathInfoType(self.params)

    # The None test has to come before the index path is built: concatenating
    # None with ".fai" would raise a TypeError and hide the intended message.
    if self.params.referenceFasta is None:
        raise Exception("No reference fasta defined.")
    referenceFastaIndex = self.params.referenceFasta + ".fai"
    checkFile(self.params.referenceFasta, "reference fasta")
    checkFile(referenceFastaIndex, "reference fasta index")

    # read fasta index
    (self.params.chromOrder, self.params.chromSizes) = getFastaChromOrderSize(referenceFastaIndex)

    # determine subset of chroms where we can skip calling entirely
    self.params.chromIsSkipped = getChromIsSkipped(self)

    self.params.isHighDepthFilter = (not (self.params.isExome or self.params.isRNA))
def main():
    """
    Estimate per-chromosome depth for a bam file and write it to stdout.

    Two kinds of samtools commands are run:
      1. "idxstats" supplies, for each chromosome, the two integer columns
         that are stored in chromData (used below as length and read count).
      2. "view -F 4" supplies records from which an average read length is
         approximated by summing all M/=/X CIGAR segments of each record.
         A first pass subsamples with "-s 0.1"; if that yields min_count or
         fewer observations, a second pass reads without subsampling. Each
         pass stops after 2*min_count observations.

    One line is written per chromosome that is at least as long as the average
    read length: chrom, depth, observation count, average read length.

    Raises:
        Exception: if min_count or fewer usable records were observed in
            both passes.
    """
    import re
    import signal
    import subprocess

    (options, args) = getOptions()

    checkDir(libexecDir)

    samtoolsBin = os.path.join(libexecDir, 'samtools')
    checkFile(samtoolsBin, "samtools binary")

    chromData = {}
    chromList = []

    # The command is passed as an argument list (no shell) so that a bam path
    # containing spaces, quotes or shell metacharacters reaches samtools verbatim.
    idxstatsProc = subprocess.Popen([samtoolsBin, "idxstats", options.bamFile],
                                    stdout=subprocess.PIPE)
    for line in idxstatsProc.stdout:
        word = line.strip().split('\t')
        if word[0] == "*":
            continue

        chromData[word[0]] = (int(word[1]), int(word[2]))
        chromList.append(word[0])

    # release the pipe and reap the child
    idxstatsProc.stdout.close()
    idxstatsProc.wait()

    min_count = 100000

    # match any cigar with a series of match sequences:
    matchRex = re.compile("([0-9]+)[M=X]")

    # In a first pass, attempt to subsample the genome. If this turns out to be a tiny sample,
    # then go back and run without subsampling the bam
    #
    for samplingMode in ('subsample', 'all'):
        length = 0
        count = 0
        record_count = 0

        # use "-F 4" to filter out unmapped reads
        viewCmd = [samtoolsBin, "view", "-F", "4"]

        # use "-s 0.1" to subsample the bam records to increase sampled read diversity
        if samplingMode == 'subsample':
            viewCmd.extend(["-s", "0.1"])

        viewCmd.append(options.bamFile)

        viewProc = subprocess.Popen(viewCmd, stdout=subprocess.PIPE)
        for line in viewProc.stdout:
            record_count += 1
            word = line.strip().split('\t', 7)
            isFound = False
            for mr in matchRex.finditer(word[5]):
                length += int(mr.group(1))
                isFound = True
            if not isFound:
                continue
            count += 1
            if count >= (min_count * 2):
                break

        # done with subprocess: without a shell in between, the signal goes to
        # samtools itself; then release the pipe and reap the child
        viewProc.send_signal(signal.SIGINT)
        viewProc.stdout.close()
        viewProc.wait()

        if count > min_count:
            break

    if count <= min_count:
        raise Exception("Unexpected read length approximation results. Observation count: " + str(count) + " Bam record count: " + str(record_count))

    outfp = sys.stdout

    avg_length = float(length) / float(count)

    for chrom in chromList:
        (chromLength, chromReadCount) = chromData[chrom]
        # chromosomes shorter than one average read are not reported
        if chromLength < avg_length:
            continue
        depth = chromReadCount * avg_length / float(chromLength)
        outfp.write("%s\t%.3f\t%s\t%.3f\n" % (chrom, depth, count, avg_length))
def main():
    """
    Estimate per-chromosome depth for a bam file and write it to stdout.

    Two kinds of samtools commands are run:
      1. "idxstats" supplies, for each chromosome, the two integer columns
         that are stored in chromData (used below as length and read count).
      2. "view -F 4" supplies records from which an average read length is
         approximated by summing all M/=/X CIGAR segments of each record.
         A first pass subsamples with "-s 0.1"; if that yields min_count or
         fewer observations, a second pass reads without subsampling. Each
         pass stops after 2*min_count observations.

    One line is written for every chromosome: chrom, depth, observation
    count, average read length. No exception is raised for a low observation
    count: the average length is 0 when nothing was observed, and the depth
    is 0 for chromosomes of length zero or shorter than the average read.
    """
    import re
    import signal
    import subprocess

    (options, args) = getOptions()

    checkDir(libexecDir)

    samtoolsBin = os.path.join(libexecDir, 'samtools')
    checkFile(samtoolsBin, "samtools binary")

    chromData = {}
    chromList = []

    # The command is passed as an argument list (no shell) so that a bam path
    # containing spaces, quotes or shell metacharacters reaches samtools verbatim.
    idxstatsProc = subprocess.Popen([samtoolsBin, "idxstats", options.bamFile],
                                    stdout=subprocess.PIPE)
    for line in idxstatsProc.stdout:
        word = line.strip().split('\t')
        if word[0] == "*":
            continue

        chromData[word[0]] = (int(word[1]), int(word[2]))
        chromList.append(word[0])

    # release the pipe and reap the child
    idxstatsProc.stdout.close()
    idxstatsProc.wait()

    min_count = 100000

    # match any cigar with a series of match sequences:
    matchRex = re.compile("([0-9]+)[M=X]")

    # In a first pass, attempt to subsample the genome. If this turns out to be a tiny sample,
    # then go back and run without subsampling the bam
    #
    for samplingMode in ('subsample', 'all'):
        length = 0
        count = 0
        record_count = 0

        # use "-F 4" to filter out unmapped reads
        viewCmd = [samtoolsBin, "view", "-F", "4"]

        # use "-s 0.1" to subsample the bam records to increase sampled read diversity
        if samplingMode == 'subsample':
            viewCmd.extend(["-s", "0.1"])

        viewCmd.append(options.bamFile)

        viewProc = subprocess.Popen(viewCmd, stdout=subprocess.PIPE)
        for line in viewProc.stdout:
            record_count += 1
            word = line.strip().split('\t', 7)
            isFound = False
            for mr in matchRex.finditer(word[5]):
                length += int(mr.group(1))
                isFound = True
            if not isFound:
                continue
            count += 1
            if count >= (min_count * 2):
                break

        # done with subprocess: without a shell in between, the signal goes to
        # samtools itself; then release the pipe and reap the child
        viewProc.send_signal(signal.SIGINT)
        viewProc.stdout.close()
        viewProc.wait()

        if count > min_count:
            break

    # A low observation count (count <= min_count) is tolerated here rather
    # than raised as an error; the guards below keep the arithmetic safe.

    outfp = sys.stdout

    avg_length = 0
    if count != 0:
        avg_length = float(length) / float(count)

    for chrom in chromList:
        (chromLength, chromReadCount) = chromData[chrom]
        depth = 0
        # avoid dividing by zero, and report zero depth for chromosomes
        # shorter than one average read
        if (chromLength >= avg_length) and (chromLength != 0):
            depth = chromReadCount * avg_length / float(chromLength)
        outfp.write("%s\t%.3f\t%s\t%.3f\n" % (chrom, depth, count, avg_length))