def validateAndSanitizeOptions(self, options):
    """
    Validate the configuration options and normalize them in place

    The run directory is made absolute and must not already contain the
    workflow script. The reference fasta and its '.fai' index must be present.
    The call-regions bed option and the region arguments are normalized
    (options.genomeRegionList is set to the parsed regions, or to None when no
    region arguments were given), and every chromosome label they mention must
    be found in the reference.

    Raises OptParseException when the workflow script already exists, when the
    fasta index is missing, or when an unknown chromosome label is found. The
    helper checks called here may raise as well.
    """
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)

    # a workflow script already in the run directory means the directory is being reused
    scriptPath = os.path.join(options.runDir, options.workflowScriptName)
    if os.path.exists(scriptPath):
        raise OptParseException("Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory." % (scriptPath))

    # reference fasta file and its index
    assertOptionExists(options.referenceFasta, "reference fasta file")
    options.referenceFasta = validateFixExistingFileArg(options.referenceFasta, "reference")

    fastaIndexPath = options.referenceFasta + ".fai"
    if not os.path.isfile(fastaIndexPath):
        raise OptParseException("Can't find expected fasta index file: '%s'" % (fastaIndexPath))

    # bed file of call regions and its index file
    options.callRegionsBed = checkFixTabixIndexedFileOption(options.callRegionsBed, "call-regions bed")

    # parse the region arguments, if any were given
    regionArgs = options.regionStrList
    if (regionArgs is not None) and (len(regionArgs) != 0):
        options.genomeRegionList = [parseGenomeRegion(regionArg) for regionArg in regionArgs]
    else:
        options.genomeRegionList = None

    haveBed = options.callRegionsBed is not None
    haveRegions = options.genomeRegionList is not None
    if not (haveBed or haveRegions):
        # nothing names a chromosome, so the reference does not need to be read
        return

    # validate chromosome names appearing in region tags and callRegions bed file
    refChromInfo = getFastaInfo(options.referenceFasta)

    if haveBed:
        for bedChrom in getTabixChromSet(options.tabixBin, options.callRegionsBed):
            if bedChrom in refChromInfo:
                continue
            raise OptParseException("Chromosome label '%s', in call regions bed file '%s', not found in reference genome." % (bedChrom, options.callRegionsBed))

    if haveRegions:
        # genomeRegionList was built one-to-one from regionStrList, so the two stay aligned
        for (regionArg, region) in zip(options.regionStrList, options.genomeRegionList):
            regionChrom = region["chrom"]
            if regionChrom in refChromInfo:
                continue
            raise OptParseException("Chromosome label '%s', parsed from region argument '%s', not found in reference genome." % (regionChrom, regionArg))
def getChromIsSkipped(self):
    r"""
    Determine subset of chroms from chromOrder which are completely skipped over

    Here "skipped" means that not a single base on the chrom is requested for
    calling or error estimation.

    Reads from self.params: genomeRegionList, callRegionsBed, chromOrder and
    tabixBin, plus errorEstimationMinChromMb and chromSizes when
    isEstimateSequenceError is set.

    \return The set of chromLabels which are skipped
    """
    params = self.params

    # return empty set when no region selections have been made:
    if (params.genomeRegionList is None) and (params.callRegionsBed is None):
        return set()

    chromIsSkipped = set()

    # first check chromosome coverage of "regions" arguments
    if params.genomeRegionList is not None:
        chromIsSkipped = set(params.chromOrder)
        for genomeRegion in params.genomeRegionList:
            # discard tolerates several regions naming the same chromosome
            chromIsSkipped.discard(genomeRegion["chrom"])

    # further refine coverage based on callRegions BED file
    if params.callRegionsBed is not None:
        callRegionsChroms = getTabixChromSet(params.tabixBin, params.callRegionsBed)

        # Skip the union of:
        # 1. chromosomes skipped already due to region arguments
        # 2. chromosomes skipped due to callRegions bed track
        chromIsSkipped |= set(params.chromOrder).difference(callRegionsChroms)

    # if sequencing error estimation is turned on, make sure estimation targets are not skipped:
    if params.isEstimateSequenceError:
        megabase = 1000000
        errorEstimationMinChromSize = params.errorEstimationMinChromMb * megabase

        for (chrom, chromSize) in params.chromSizes.items():
            if chromSize >= errorEstimationMinChromSize:
                chromIsSkipped.discard(chrom)

    return chromIsSkipped
def validateAndSanitizeOptions(self, options):
    """
    Check the user-supplied options and rewrite them into canonical form

    Steps, in order:
    1) the run directory is required, is converted to an absolute path, and
       must not already hold the workflow script
    2) the reference fasta is required and its '.fai' index file must exist
    3) the call-regions bed option is passed through the tabix-indexed file check
    4) region arguments are parsed into options.genomeRegionList (None if absent)
    5) chromosome labels from the bed file and from the region arguments are
       checked against the reference

    An OptParseException is raised directly for a pre-existing workflow script,
    a missing fasta index, or a chromosome label absent from the reference.
    """
    # 1) run directory
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)
    existingScript = os.path.join(options.runDir, options.workflowScriptName)
    if os.path.exists(existingScript):
        raise OptParseException("Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory." % (existingScript))

    # 2) reference fasta and index
    assertOptionExists(options.referenceFasta, "reference fasta file")
    options.referenceFasta = validateFixExistingFileArg(options.referenceFasta, "reference")
    indexFile = options.referenceFasta + ".fai"
    if not os.path.isfile(indexFile):
        raise OptParseException("Can't find expected fasta index file: '%s'" % (indexFile))

    # 3) call regions bed file and its index file
    options.callRegionsBed = checkFixTabixIndexedFileOption(options.callRegionsBed, "call-regions bed")

    # 4) region arguments
    hasRegionArgs = (options.regionStrList is not None) and (len(options.regionStrList) > 0)
    if hasRegionArgs:
        parsedRegions = []
        for regionStr in options.regionStrList:
            parsedRegions.append(parseGenomeRegion(regionStr))
        options.genomeRegionList = parsedRegions
    else:
        options.genomeRegionList = None

    # 5) chromosome names; the reference is only read when something refers to a chromosome
    if (options.callRegionsBed is None) and (options.genomeRegionList is None):
        return

    referenceChroms = getFastaInfo(options.referenceFasta)

    if options.callRegionsBed is not None:
        for label in getTabixChromSet(options.tabixBin, options.callRegionsBed):
            if label not in referenceChroms:
                raise OptParseException("Chromosome label '%s', in call regions bed file '%s', not found in reference genome." % (label, options.callRegionsBed))

    if options.genomeRegionList is not None:
        for (position, parsedRegion) in enumerate(options.genomeRegionList):
            label = parsedRegion["chrom"]
            if label not in referenceChroms:
                # report the original argument text rather than the parsed form
                sourceArg = options.regionStrList[position]
                raise OptParseException("Chromosome label '%s', parsed from region argument '%s', not found in reference genome." % (label, sourceArg))
def validateAndSanitizeOptions(self, options):
    """
    Validate the configuration options and normalize them in place

    Covers the run directory, the reference fasta and its '.fai' index, the
    feasibility of sequence error estimation, the candidate indel / forced
    genotype / call-regions inputs, the region arguments, and the SNV and
    indel empirical scoring model files.

    Sequence error estimation is switched off, with a warning on stderr, when
    the summed length of the reference sequences of at least
    options.errorEstimationMinChromMb megabases is below
    options.errorEstimationMinTotalMb megabases.

    Raises OptParseException when the workflow script already exists in the
    run directory, when the fasta index is missing, or when a chromosome label
    from the call-regions bed file or a region argument is not found in the
    reference.
    """
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)

    scriptPath = os.path.join(options.runDir, options.workflowScriptName)
    if os.path.exists(scriptPath):
        raise OptParseException("Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory." % (scriptPath))

    assertOptionExists(options.referenceFasta, "reference fasta file")
    options.referenceFasta = validateFixExistingFileArg(options.referenceFasta, "reference fasta file")

    # the fasta index is expected right next to the reference
    faiPath = options.referenceFasta + ".fai"
    if not os.path.isfile(faiPath):
        raise OptParseException("Can't find expected fasta index file: '%s'" % (faiPath))

    if options.isEstimateSequenceError:
        # Decide whether dynamic error estimation is feasible for this reference:
        # only sequences of at least minChromSize count, and together they must
        # reach minTotalSize
        megabase = 1000000
        minChromSize = options.errorEstimationMinChromMb * megabase
        minTotalSize = options.errorEstimationMinTotalMb * megabase

        # read fasta index
        (_, chromSizes) = getFastaChromOrderSize(faiPath)

        usableSize = sum(size for size in chromSizes.values() if size >= minChromSize)
        if usableSize < minTotalSize:
            sys.stderr.write("WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. Sequence error estimation disabled.\n")
            options.isEstimateSequenceError = False

    checkFixTabixListOption(options.indelCandidatesList, "candidate indel vcf")
    checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")
    options.callRegionsBed = checkFixTabixIndexedFileOption(options.callRegionsBed, "call-regions bed")

    def iterRegionArgs():
        """
        Yield every region string from options.regionStrList, splitting each
        argument on the (intentionally undocumented/possibly deprecated) '+'
        separator so that one argument can name several regions.
        """
        for regionArg in options.regionStrList:
            for piece in regionArg.split("+"):
                yield piece

    if (options.regionStrList is not None) and (len(options.regionStrList) != 0):
        options.genomeRegionList = [parseGenomeRegion(piece) for piece in iterRegionArgs()]
    else:
        options.genomeRegionList = None

    # validate chromosome names appearing in region tags and callRegions bed file
    bedGiven = options.callRegionsBed is not None
    regionsGiven = options.genomeRegionList is not None
    if bedGiven or regionsGiven:
        refChromInfo = getFastaInfo(options.referenceFasta)

        if bedGiven:
            for bedChrom in getTabixChromSet(options.tabixBin, options.callRegionsBed):
                if bedChrom in refChromInfo:
                    continue
                raise OptParseException("Chromosome label '%s', in call regions bed file '%s', not found in reference genome." % (bedChrom, options.callRegionsBed))

        if regionsGiven:
            for (regionIndex, region) in enumerate(options.genomeRegionList):
                regionChrom = region["chrom"]
                if regionChrom in refChromInfo:
                    continue
                # re-expand the arguments to recover the text this region was parsed from
                offendingArg = list(iterRegionArgs())[regionIndex]
                raise OptParseException("Chromosome label '%s', parsed from region argument '%s', not found in reference genome." % (regionChrom, offendingArg))

    options.snvScoringModelFile = validateFixExistingFileArg(options.snvScoringModelFile, "SNV empirical scoring model file")
    options.indelScoringModelFile = validateFixExistingFileArg(options.indelScoringModelFile, "Indel empirical scoring model file")
def getCallRegions(params):
    r"""
    Determine
    1) a set of genomic regions for calling
    2) a set of chromosomes that are completely skipped over, where "skipped"
       means that not a single base on the chrom is requested for calling

    Reads params.genomeRegionList, params.callRegionsBed and params.chromOrder,
    plus params.tabixBin and params.chromSizes when a call-regions bed file is
    set. When both region arguments and a bed file are given, region entries
    whose "start"/"end" is None are filled in place with 1 and the chromosome
    size. When only region arguments are given, params.genomeRegionList itself
    is returned as the region list.

    Raises Exception when a region argument or the bed file names a chromosome
    that is not in params.chromOrder.

    \return a list of genomic regions for calling
    \return a set of chromLabels which are skipped
    """
    callRegionList = []

    # when no region selections have been made:
    if (params.genomeRegionList is None) and (params.callRegionsBed is None):
        return (callRegionList, set())

    # build the lookup set once instead of scanning the chromOrder list per region
    knownChroms = set(params.chromOrder)

    # check chromosome coverage of "regions" arguments
    chromIsSkipped = set(knownChroms)
    if params.genomeRegionList is not None:
        for genomeRegion in params.genomeRegionList:
            chrom = genomeRegion["chrom"]
            if chrom not in knownChroms:
                raise Exception("Unexpected chromosome '%s' in the argument of target regions (--region)" % (chrom))
            chromIsSkipped.discard(chrom)

    if params.callRegionsBed is None:
        return (params.genomeRegionList, chromIsSkipped)

    # check chromosome coverage based on callRegions BED file
    callChromList = []
    chromsNotInBed = set(knownChroms)
    for chrom in getTabixChromSet(params.tabixBin, params.callRegionsBed):
        if chrom not in knownChroms:
            raise Exception("Unexpected chromosome '%s' in the bed file of call regions %s " % (chrom, params.callRegionsBed))
        callChromList.append(chrom)
        chromsNotInBed.discard(chrom)

    if params.genomeRegionList is None:
        # only the bed file restricts calling: query each of its chromosomes end to end
        chromIsSkipped = chromsNotInBed
        for chrom in callChromList:
            chromRegion = {"chrom": chrom, "start": 1, "end": params.chromSizes[chrom]}
            callRegionList.extend(getOverlapCallRegions(params, chromRegion))
    else:
        # a chromosome is skipped if either the region arguments or the bed file leave it out
        chromIsSkipped = chromIsSkipped | chromsNotInBed
        for genomeRegion in params.genomeRegionList:
            chrom = genomeRegion["chrom"]
            # open-ended regions are widened to the whole chromosome (in place)
            if genomeRegion["start"] is None:
                genomeRegion["start"] = 1
            if genomeRegion["end"] is None:
                genomeRegion["end"] = params.chromSizes[chrom]
            callRegionList.extend(getOverlapCallRegions(params, genomeRegion))

    return (callRegionList, chromIsSkipped)
def getCallRegions(params):
    r"""
    Determine
    1) a set of genomic regions for calling
    2) a set of chromosomes that are completely skipped over, where "skipped"
       means that not a single base on the chrom is requested for calling

    Inputs are taken from params: genomeRegionList, callRegionsBed and
    chromOrder, and additionally tabixBin and chromSizes when callRegionsBed is
    set. Side effect: when region arguments and a bed file are both present,
    any region whose "start" or "end" is None has that field set in place to 1
    or to the chromosome size. With region arguments only, the returned region
    list is params.genomeRegionList itself.

    Raises Exception if a chromosome from a region argument or from the bed
    file is not listed in params.chromOrder.

    \return a list of genomic regions for calling
    \return a set of chromLabels which are skipped
    """
    hasRegionArgs = params.genomeRegionList is not None
    hasCallRegionsBed = params.callRegionsBed is not None

    # when no region selections have been made:
    if not (hasRegionArgs or hasCallRegionsBed):
        return ([], set())

    # one set gives constant-time membership checks for every chromosome label below
    allChroms = set(params.chromOrder)

    # check chromosome coverage of "regions" arguments
    skippedByRegionArgs = set(allChroms)
    if hasRegionArgs:
        for genomeRegion in params.genomeRegionList:
            chrom = genomeRegion["chrom"]
            if chrom not in allChroms:
                raise Exception("Unexpected chromosome '%s' in the argument of target regions (--region)" % (chrom))
            skippedByRegionArgs.discard(chrom)

    if not hasCallRegionsBed:
        return (params.genomeRegionList, skippedByRegionArgs)

    # check chromosome coverage based on callRegions BED file
    bedChromList = []
    skippedByBed = set(allChroms)
    for chrom in getTabixChromSet(params.tabixBin, params.callRegionsBed):
        if chrom not in allChroms:
            raise Exception("Unexpected chromosome '%s' in the bed file of call regions %s " % (chrom, params.callRegionsBed))
        bedChromList.append(chrom)
        skippedByBed.discard(chrom)

    callRegionList = []
    if hasRegionArgs:
        # skipped when left out by the region arguments or by the bed file
        chromIsSkipped = skippedByRegionArgs | skippedByBed
        for genomeRegion in params.genomeRegionList:
            chrom = genomeRegion["chrom"]
            # fill in open region bounds in place before querying the bed overlaps
            if genomeRegion["start"] is None:
                genomeRegion["start"] = 1
            if genomeRegion["end"] is None:
                genomeRegion["end"] = params.chromSizes[chrom]
            callRegionList.extend(getOverlapCallRegions(params, genomeRegion))
    else:
        # the bed file alone selects regions: query each of its chromosomes in full
        chromIsSkipped = skippedByBed
        for chrom in bedChromList:
            wholeChrom = {"chrom": chrom, "start": 1, "end": params.chromSizes[chrom]}
            callRegionList.extend(getOverlapCallRegions(params, wholeChrom))

    return (callRegionList, chromIsSkipped)