def validateAndSanitizeExistingOptions(self, options):
    """
    Normalize and validate every option which has been given a value.

    Options left as None are skipped rather than rejected here.
    NOTE(review): presumably a missing required option is reported by a
    separate existence check -- confirm against the caller.

    options -- parsed options object, modified in place:
        runDir, callRegionsBed -> absolute paths
        alignerMode            -> lower-cased
        referenceFasta         -> value returned by validateFixExistingFileArg
        genomeRegionList       -> parsed regionStrList entries, or None

    Raises OptParseException for an aligner mode which is not in
    self.validAlignerModes, or when the '.fai' index expected next to the
    reference fasta is missing. The helper functions called here may raise
    as well.
    """
    # Guard the run directory like every other option in this method:
    # os.path.abspath(None) would fail with a traceback instead of leaving
    # the unset value in place.
    if options.runDir is not None:
        options.runDir = os.path.abspath(options.runDir)

    # check alignerMode:
    if options.alignerMode is not None:
        options.alignerMode = options.alignerMode.lower()
        if options.alignerMode not in self.validAlignerModes:
            raise OptParseException("Invalid aligner mode: '%s'" %
                                    options.alignerMode)

    options.referenceFasta = validateFixExistingFileArg(
        options.referenceFasta, "reference")

    # check for reference fasta index file:
    if options.referenceFasta is not None:
        faiFile = options.referenceFasta + ".fai"
        if not os.path.isfile(faiFile):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" % (faiFile))

    # check for bed file of call regions and its index file
    if options.callRegionsBed is not None:
        options.callRegionsBed = os.path.abspath(options.callRegionsBed)
        checkTabixIndexedFile(options.callRegionsBed, "call-regions bed")

    if (options.regionStrList is None) or (len(options.regionStrList) == 0):
        options.genomeRegionList = None
    else:
        options.genomeRegionList = [
            parseGenomeRegion(r) for r in options.regionStrList
        ]
def validateAndSanitizeOptions(self, options):
    """
    Require and normalize the run directory and reference options, then
    parse the region options and check their chromosome names.

    options -- parsed options object, modified in place (runDir,
               referenceFasta, callRegionsBed, genomeRegionList).

    Raises OptParseException when the run directory already holds the
    workflow script, when the fasta index is missing, or when a chromosome
    named by the call-regions bed file or a region argument is not found in
    the reference. The helper functions called here may raise as well.
    """
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)

    # refuse to configure on top of a previously configured run directory
    scriptPath = os.path.join(options.runDir, options.workflowScriptName)
    if os.path.exists(scriptPath):
        raise OptParseException(
            "Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory."
            % (scriptPath))

    # the reference fasta must be given and must have a '.fai' file next to it
    assertOptionExists(options.referenceFasta, "reference fasta file")
    options.referenceFasta = validateFixExistingFileArg(
        options.referenceFasta, "reference")
    fastaIndexPath = options.referenceFasta + ".fai"
    if not os.path.isfile(fastaIndexPath):
        raise OptParseException(
            "Can't find expected fasta index file: '%s'" % (fastaIndexPath))

    # bed file of call regions and its index file
    options.callRegionsBed = checkFixTabixIndexedFileOption(
        options.callRegionsBed, "call-regions bed")

    if (options.regionStrList is not None) and (len(options.regionStrList) > 0):
        options.genomeRegionList = [
            parseGenomeRegion(regionArg) for regionArg in options.regionStrList
        ]
    else:
        options.genomeRegionList = None

    hasCallRegions = options.callRegionsBed is not None
    hasGenomeRegions = options.genomeRegionList is not None
    if not (hasCallRegions or hasGenomeRegions):
        # no chromosome names to validate
        return

    refChromInfo = getFastaInfo(options.referenceFasta)

    if hasCallRegions:
        for chrom in getTabixChromSet(options.tabixBin, options.callRegionsBed):
            if chrom not in refChromInfo:
                raise OptParseException(
                    "Chromosome label '%s', in call regions bed file '%s', not found in reference genome."
                    % (chrom, options.callRegionsBed))

    if hasGenomeRegions:
        # genomeRegionList was built one-to-one from regionStrList, so the two
        # can be walked in lockstep to report the offending argument
        for (regionArg, genomeRegion) in zip(options.regionStrList,
                                             options.genomeRegionList):
            chrom = genomeRegion["chrom"]
            if chrom not in refChromInfo:
                raise OptParseException(
                    "Chromosome label '%s', parsed from region argument '%s', not found in reference genome."
                    % (chrom, regionArg))
def validateAndSanitizeExistingOptions(self, options):
    """
    Normalize and validate every option which has been given a value.

    Options left as None are skipped rather than rejected here.
    NOTE(review): presumably a missing required option is reported by a
    separate existence check -- confirm against the caller.

    options -- parsed options object, modified in place:
        runDir           -> absolute path
        referenceFasta   -> value returned by validateFixExistingFileArg
        genomeRegionList -> parsed regionStrList entries, or None; a single
                            region argument may hold several regions
                            separated by '+'

    Raises OptParseException when the '.fai' index expected next to the
    reference fasta is missing. The helper functions called here may raise
    as well.
    """
    # Guard the run directory like the reference fasta below:
    # os.path.abspath(None) would fail with a traceback instead of leaving
    # the unset value in place.
    if options.runDir is not None:
        options.runDir = os.path.abspath(options.runDir)

    options.referenceFasta = validateFixExistingFileArg(
        options.referenceFasta, "reference")

    # check for reference fasta index file:
    if options.referenceFasta is not None:
        faiFile = options.referenceFasta + ".fai"
        if not os.path.isfile(faiFile):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" % (faiFile))

    checkFixTabixListOption(options.indelCandidatesList, "candidate indel vcf")
    checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")

    if (options.regionStrList is None) or (len(options.regionStrList) == 0):
        options.genomeRegionList = None
    else:
        options.genomeRegionList = [
            parseGenomeRegion(rr)
            for r in options.regionStrList
            for rr in r.split("+")
        ]
def validateAndSanitizeOptions(self, options):
    """
    Require and normalize the run directory and reference options, decide
    whether sequence error estimation stays enabled, and validate the
    remaining file and region options.

    options -- parsed options object, modified in place (runDir,
               referenceFasta, isEstimateSequenceError, callRegionsBed,
               genomeRegionList, snvScoringModelFile, indelScoringModelFile).

    Raises OptParseException when the fasta index is missing. The helper
    functions called here may raise as well.
    """
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)

    assertOptionExists(options.referenceFasta, "reference fasta file")
    options.referenceFasta = validateFixExistingFileArg(
        options.referenceFasta, "reference fasta file")

    # the reference must have a '.fai' file next to it
    fastaIndexPath = options.referenceFasta + ".fai"
    if not os.path.isfile(fastaIndexPath):
        raise OptParseException(
            "Can't find expected fasta index file: '%s'" % (fastaIndexPath))

    if options.isEstimateSequenceError:
        # Error estimation is only kept when the reference is large enough:
        # sum the contigs which reach the per-contig minimum size and
        # compare that sum with the required total size.
        basesPerMb = 1000000
        minChromSize = options.errorEstimationMinChromMb * basesPerMb
        minTotalSize = options.errorEstimationMinTotalMb * basesPerMb

        # read fasta index
        (_, chromSizes) = getFastaChromOrderSize(fastaIndexPath)

        usableSize = sum(size for size in chromSizes.values()
                         if size >= minChromSize)

        if usableSize < minTotalSize:
            sys.stderr.write(
                "WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. Sequence error estimation disabled.\n"
            )
            options.isEstimateSequenceError = False

    checkFixTabixListOption(options.indelCandidatesList, "candidate indel vcf")
    checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")

    options.callRegionsBed = checkFixTabixIndexedFileOption(
        options.callRegionsBed, "call-regions bed")

    if (options.regionStrList is not None) and (len(options.regionStrList) > 0):
        # one region argument may hold several regions separated by '+'
        options.genomeRegionList = [
            parseGenomeRegion(subRegion)
            for regionArg in options.regionStrList
            for subRegion in regionArg.split("+")
        ]
    else:
        options.genomeRegionList = None

    options.snvScoringModelFile = validateFixExistingFileArg(
        options.snvScoringModelFile, "SNV empirical scoring model file")
    options.indelScoringModelFile = validateFixExistingFileArg(
        options.indelScoringModelFile, "Indel empirical scoring model file")
def singleAppender(bamList, label):
    """
    Append a sample list holding at most one alignment file to
    bamSetChecker.

    bamList -- list of alignment file paths, or None (nothing is appended)
    label   -- sample label used in the error message and passed on to
               bamSetChecker.appendBams

    Raises OptParseException when bamList holds more than one entry.
    """
    if bamList is not None:
        if len(bamList) > 1:
            raise OptParseException(
                "More than one %s sample BAM/CRAM files specified" % (label))
        bamSetChecker.appendBams(bamList, label)
def validateOptionExistence(self, options):
    """
    Check that exactly one proband and exactly two parent alignment files
    were given, then run the shared checks on the whole sample set.

    options -- parsed options object; reads probandBamList, parentBamList,
               siblingBamList, htsfileBin and referenceFasta.

    Raises OptParseException when the proband or parent file count is wrong.
    The base-class check and BamSetChecker may raise as well.
    """
    StrelkaSharedWorkflowOptionsBase.validateOptionExistence(self, options)

    def safeLen(x):
        # a list option which was never given may still be None; count it
        # as empty so the user gets the error message below, not a TypeError
        if x is None:
            return 0
        return len(x)

    if safeLen(options.probandBamList) != 1:
        raise OptParseException(
            "Must specify one proband sample BAM/CRAM file")
    if safeLen(options.parentBamList) != 2:
        raise OptParseException(
            "Must specify two parent sample BAM/CRAM files")

    bcheck = BamSetChecker()
    bcheck.appendBams(options.probandBamList, "proband")
    bcheck.appendBams(options.parentBamList, "parent")
    # siblings are optional
    bcheck.appendBams(options.siblingBamList, "sibling", isAllowEmpty=True)
    bcheck.check(options.htsfileBin, options.referenceFasta)
def validateAndSanitizeOptions(self, options):
    """
    Run the base-class validation, then check the normal/tumor sample
    combination, the RNA settings and the input alignment files.

    options -- parsed options object; existingAlignStatsFile is replaced by
               the value returned from validateFixExistingFileArg when set,
               and the two sample lists are handed to groomBamList.

    Raises OptParseException for an unsupported sample or RNA option
    combination. The helper functions called here may raise as well.
    """
    MantaWorkflowOptionsBase.validateAndSanitizeOptions(self, options)

    def safeLen(x):
        # None counts as an empty list
        return 0 if x is None else len(x)

    normalCount = safeLen(options.normalBamList)
    tumorCount = safeLen(options.tumorBamList)

    if (normalCount == 0) and (tumorCount == 0):
        raise OptParseException(
            "No normal or tumor sample alignment files specified")

    if tumorCount > 1:
        raise OptParseException("Can't accept more then one tumor sample")

    if (tumorCount > 0) and (normalCount > 1):
        raise OptParseException(
            "Can't accept multiple normal samples for tumor subtraction")

    if options.isRNA:
        if (normalCount != 1) or (tumorCount != 0):
            raise OptParseException(
                "RNA mode currently requires exactly one normal sample")
    elif options.isUnstrandedRNA:
        raise OptParseException(
            "Unstranded only applied for RNA inputs")

    if options.existingAlignStatsFile is not None:
        options.existingAlignStatsFile = validateFixExistingFileArg(
            options.existingAlignStatsFile, "existing align stats")

    groomBamList(options.normalBamList, "normal sample")
    groomBamList(options.tumorBamList, "tumor sample")

    # only non-empty sample lists are registered with the checker;
    # lengths are re-read here because the lists were just groomed
    bamSetChecker = BamSetChecker()
    for (sampleBams, sampleLabel) in ((options.normalBamList, "Normal"),
                                      (options.tumorBamList, "Tumor")):
        if safeLen(sampleBams) > 0:
            bamSetChecker.appendBams(sampleBams, sampleLabel)
    bamSetChecker.check(options.htsfileBin, options.referenceFasta)
def validateAndSanitizeExistingOptions(self, options):
    """
    Normalize and validate the options which have been given a value, then
    delegate to the base-class implementation.

    options -- parsed options object, modified in place: entries of
               normalBamList and tumorBamList are replaced by the values
               returned from validateFixExistingFileArg, alignerMode is
               lower-cased, referenceFasta is replaced likewise, and
               genomeRegionList is set from regionStrList.

    Raises OptParseException for a missing '.bai' or '.fai' index file or
    an aligner mode which is not in self.validAlignerModes. The helper
    functions called here may raise as well.
    """

    def checkForBamIndex(bamFile):
        # the index is expected at the alignment file's path plus '.bai'
        expectedIndex = bamFile + ".bai"
        if os.path.isfile(expectedIndex):
            return
        raise OptParseException(
            "Can't find expected BAM index file: '%s'" % (expectedIndex))

    def groomBamList(bamList, sampleLabel):
        # fix up each entry in place and require its index file
        if bamList is None:
            return
        fileLabel = "%s BAM file" % (sampleLabel)
        for (position, rawPath) in enumerate(bamList):
            fixedPath = validateFixExistingFileArg(rawPath, fileLabel)
            bamList[position] = fixedPath
            checkForBamIndex(fixedPath)

    groomBamList(options.normalBamList, "normal sample")
    groomBamList(options.tumorBamList, "tumor sample")

    # aligner mode is compared case-insensitively
    if options.alignerMode is not None:
        options.alignerMode = options.alignerMode.lower()
        if options.alignerMode not in self.validAlignerModes:
            raise OptParseException("Invalid aligner mode: '%s'" %
                                    options.alignerMode)

    options.referenceFasta = validateFixExistingFileArg(
        options.referenceFasta, "reference")

    # a given reference must have a '.fai' file next to it
    if options.referenceFasta is not None:
        fastaIndexPath = options.referenceFasta + ".fai"
        if not os.path.isfile(fastaIndexPath):
            raise OptParseException(
                "Can't find expected fasta index file: '%s'" % (fastaIndexPath))

    if (options.regionStrList is not None) and (len(options.regionStrList) > 0):
        options.genomeRegionList = [
            parseGenomeRegion(regionArg) for regionArg in options.regionStrList
        ]
    else:
        options.genomeRegionList = None

    MantaWorkflowOptionsBase.validateAndSanitizeExistingOptions(
        self, options)
def validateOptionExistence(self, options):
    """
    Require at least one normal or tumor BAM file, run the BamSetChecker
    checks on the samples, then delegate to the base-class check.

    options -- parsed options object; reads normalBamList, tumorBamList,
               samtoolsBin and referenceFasta.

    Raises OptParseException when both sample lists are missing or empty.
    BamSetChecker and the base-class check may raise as well.
    """

    def isMissing(bamList):
        # a list which was never given or holds no entries
        return (bamList is None) or (len(bamList) == 0)

    if isMissing(options.normalBamList) and isMissing(options.tumorBamList):
        raise OptParseException("No normal & tumor sample BAM files specified")

    sampleChecker = BamSetChecker()
    sampleChecker.appendBams(options.normalBamList, "Normal")
    sampleChecker.appendBams(options.tumorBamList, "Tumor")
    sampleChecker.check(options.samtoolsBin, options.referenceFasta)

    MantaWorkflowOptionsBase.validateOptionExistence(self, options)
def validateOptionExistence(self, options):
    """
    Require a normal sample, an aligner mode and a reference, run the
    base-class check, then check the chromosome sets of the inputs and
    reject repeated BAM files.

    options -- parsed options object; reads normalBamList, tumorBamList,
               alignerMode, referenceFasta and samtoolsBin.

    Raises OptParseException when no normal BAM file is given or when the
    same BAM file is listed more than once. The helper functions called
    here may raise as well.
    """
    normalBams = options.normalBamList
    if (normalBams is None) or (len(normalBams) == 0):
        raise OptParseException("No normal sample BAM files specified")

    assertOptionExists(options.alignerMode, "aligner mode")
    assertOptionExists(options.referenceFasta, "reference fasta file")

    MantaWorkflowOptionsBase.validateOptionExistence(self, options)

    # flatten the per-sample lists into parallel file/label lists
    bamList = []
    bamLabels = []
    for (sampleBams, sampleLabel) in ((options.normalBamList, "Normal"),
                                      (options.tumorBamList, "Tumor")):
        if sampleBams is None:
            continue
        for sampleBam in sampleBams:
            bamList.append(sampleBam)
            bamLabels.append(sampleLabel)

    # check the chromosome sets of the reference and all bams
    checkChromSet(options.samtoolsBin,
                  options.referenceFasta,
                  bamList,
                  bamLabels,
                  isReferenceLocked=True)

    # each input file may appear only once across all samples
    seenBams = set()
    for bamFile in bamList:
        if bamFile in seenBams:
            raise OptParseException("Repeated input BAM file: %s" % (bamFile))
        seenBams.add(bamFile)
def validateOptionExistence(self, options):
    """
    Check the normal/tumor sample combination, run the BamSetChecker checks
    on the samples, then delegate to the base-class check.

    options -- parsed options object; reads normalBamList, tumorBamList,
               htsfileBin and referenceFasta.

    Raises OptParseException when no sample is given, when more than one
    tumor sample is given, or when several normal samples are combined
    with a tumor sample. BamSetChecker and the base-class check may raise
    as well.
    """

    def safeLen(x):
        # None counts as an empty list
        return 0 if x is None else len(x)

    normalCount = safeLen(options.normalBamList)
    tumorCount = safeLen(options.tumorBamList)

    if (normalCount == 0) and (tumorCount == 0):
        raise OptParseException(
            "No normal or tumor sample alignment files specified")

    if tumorCount > 1:
        raise OptParseException("Can't accept more then one tumor sample")

    if (tumorCount > 0) and (normalCount > 1):
        raise OptParseException(
            "Can't accept multiple normal samples for tumor subtraction")

    sampleChecker = BamSetChecker()
    sampleChecker.appendBams(options.normalBamList, "Normal")
    sampleChecker.appendBams(options.tumorBamList, "Tumor")
    sampleChecker.check(options.htsfileBin, options.referenceFasta)

    MantaWorkflowOptionsBase.validateOptionExistence(self, options)
def validateAndSanitizeOptions(self, options):
    """
    Run the shared base-class validation, then settle the options handled
    here: ploidy and no-compress files, default scoring models, sequence
    error estimation and the input alignment files.

    options -- parsed options object, modified in place (ploidyFilename,
               noCompressBed, snvScoringModelFile, indelScoringModelFile,
               isEstimateSequenceError); bamList is handed to groomBamList.

    Raises OptParseException when no input alignment file is given. The
    helper functions called here may raise as well.
    """
    StrelkaSharedWorkflowOptionsBase.validateAndSanitizeOptions(
        self, options)

    options.ploidyFilename = checkFixTabixIndexedFileOption(
        options.ploidyFilename, "ploidy file")
    options.noCompressBed = checkFixTabixIndexedFileOption(
        options.noCompressBed, "no-compress bed")

    # fall back to the RNA or germline scoring model when none was given
    if options.snvScoringModelFile is None:
        options.snvScoringModelFile = (
            options.rnaSnvScoringModelFile if options.isRNA
            else options.germlineSnvScoringModelFile)

    if options.indelScoringModelFile is None:
        options.indelScoringModelFile = (
            options.rnaIndelScoringModelFile if options.isRNA
            else options.germlineIndelScoringModelFile)

    # dynamic error estimation is switched off for exome input ...
    if options.isExome:
        options.isEstimateSequenceError = False

    # ... and for RNA input
    if options.isRNA:
        options.isEstimateSequenceError = False

    groomBamList(options.bamList, "input")

    # a list which was never given (None) counts as empty
    inputCount = 0 if options.bamList is None else len(options.bamList)
    if inputCount == 0:
        raise OptParseException(
            "No input sample alignment files specified")

    inputChecker = BamSetChecker()
    inputChecker.appendBams(options.bamList, "Input")
    inputChecker.check(options.htsfileBin, options.referenceFasta)
def validateAndSanitizeOptions(self, options):
    """
    Require and normalize the run directory and reference options, decide
    whether sequence error estimation stays enabled, and validate the
    remaining file and region options including their chromosome names.

    options -- parsed options object, modified in place (runDir,
               referenceFasta, isEstimateSequenceError, callRegionsBed,
               genomeRegionList, snvScoringModelFile, indelScoringModelFile).

    Raises OptParseException when the run directory already holds the
    workflow script, when the fasta index is missing, or when a chromosome
    named by the call-regions bed file or a region argument is not found in
    the reference. The helper functions called here may raise as well.
    """
    assertOptionExists(options.runDir, "run directory")
    options.runDir = os.path.abspath(options.runDir)

    # refuse to configure on top of a previously configured run directory
    scriptPath = os.path.join(options.runDir, options.workflowScriptName)
    if os.path.exists(scriptPath):
        raise OptParseException(
            "Run directory already contains workflow script file '%s'. Each analysis must be configured in a separate directory."
            % (scriptPath))

    assertOptionExists(options.referenceFasta, "reference fasta file")
    options.referenceFasta = validateFixExistingFileArg(
        options.referenceFasta, "reference fasta file")

    # the reference must have a '.fai' file next to it
    fastaIndexPath = options.referenceFasta + ".fai"
    if not os.path.isfile(fastaIndexPath):
        raise OptParseException(
            "Can't find expected fasta index file: '%s'" % (fastaIndexPath))

    if options.isEstimateSequenceError:
        # Error estimation is only kept when the reference is large enough:
        # sum the contigs which reach the per-contig minimum size and
        # compare that sum with the required total size.
        basesPerMb = 1000000
        minChromSize = options.errorEstimationMinChromMb * basesPerMb
        minTotalSize = options.errorEstimationMinTotalMb * basesPerMb

        # read fasta index
        (_, chromSizes) = getFastaChromOrderSize(fastaIndexPath)

        usableSize = sum(size for size in chromSizes.values()
                         if size >= minChromSize)

        if usableSize < minTotalSize:
            sys.stderr.write(
                "WARNING: Cannot estimate sequence errors from data due to small or overly fragmented reference sequence. Sequence error estimation disabled.\n"
            )
            options.isEstimateSequenceError = False

    checkFixTabixListOption(options.indelCandidatesList, "candidate indel vcf")
    checkFixTabixListOption(options.forcedGTList, "forced genotype vcf")

    options.callRegionsBed = checkFixTabixIndexedFileOption(
        options.callRegionsBed, "call-regions bed")

    def iterRegionArgs():
        """
        Yield every single-region string from regionStrList, splitting the
        (intentionally undocumented/possibly deprecated) '+' entry format
        which packs multiple regions into one argument.
        """
        for regionArg in options.regionStrList:
            for subRegion in regionArg.split("+"):
                yield subRegion

    if (options.regionStrList is not None) and (len(options.regionStrList) > 0):
        options.genomeRegionList = [
            parseGenomeRegion(subRegion) for subRegion in iterRegionArgs()
        ]
    else:
        options.genomeRegionList = None

    # validate chromosome names appearing in region tags and callRegions bed file
    hasCallRegions = options.callRegionsBed is not None
    hasGenomeRegions = options.genomeRegionList is not None
    if hasCallRegions or hasGenomeRegions:
        refChromInfo = getFastaInfo(options.referenceFasta)

        if hasCallRegions:
            for chrom in getTabixChromSet(options.tabixBin,
                                          options.callRegionsBed):
                if chrom not in refChromInfo:
                    raise OptParseException(
                        "Chromosome label '%s', in call regions bed file '%s', not found in reference genome."
                        % (chrom, options.callRegionsBed))

        if hasGenomeRegions:
            # genomeRegionList was built one-to-one from iterRegionArgs(),
            # so both can be walked in lockstep to report the offending
            # region string
            for (subRegion, genomeRegion) in zip(iterRegionArgs(),
                                                 options.genomeRegionList):
                chrom = genomeRegion["chrom"]
                if chrom not in refChromInfo:
                    raise OptParseException(
                        "Chromosome label '%s', parsed from region argument '%s', not found in reference genome."
                        % (chrom, subRegion))

    options.snvScoringModelFile = validateFixExistingFileArg(
        options.snvScoringModelFile, "SNV empirical scoring model file")
    options.indelScoringModelFile = validateFixExistingFileArg(
        options.indelScoringModelFile, "Indel empirical scoring model file")
def getRunOptions(self, primary_section, version=None, configHelp=None):
    """
    primary client code interface to the finished product.
    do not override this method

    This returns a tuple of the (1) a class holding all of the
    primary run options gathered from the primary section of the ini
    file and command-line options and (2) an inifile hash-of-hashes
    reflecting all sections of the ini file.

    primary_section -- name of the ini section holding the run options
    version, configHelp -- passed through to self._getOptionParser

    Option values are layered from lowest to highest precedence: the
    defaults from self.getOptionDefaults(), the ini file located next to
    the configure script, the user config file (when given), and the
    command line. An OptParseException raised while processing the options
    is handed to noArgOrError together with the parser.
    """

    def updateIniSections(data, newData):
        # merge newData into data section by section; keys already present
        # in a section are overwritten
        for sectionName in newData.keys():
            data.setdefault(sectionName, {}).update(newData[sectionName])

    # first level of options are those hard coded into the python code as defaults,
    # these have the lowest precedence:
    iniSections = {primary_section: self.getOptionDefaults()}

    # next is the 'global' ini file, in the same directory as the configure
    # script:
    realArg0 = os.path.realpath(sys.argv[0])
    cmdlineScriptName = os.path.basename(realArg0)
    configFileName = cmdlineScriptName + ".ini"
    cmdlineScriptDir = os.path.abspath(os.path.dirname(realArg0))
    globalConfigPath = os.path.join(cmdlineScriptDir, configFileName)
    updateIniSections(iniSections, getIniSections(globalConfigPath))

    parser = self._getOptionParser(iniSections[primary_section],
                                   configFileName,
                                   cmdlineScriptDir,
                                   version=version,
                                   configHelp=configHelp)
    (options, args) = parser.parse_args()

    try:
        if options.userConfigPath:
            if not os.path.isfile(options.userConfigPath):
                raise OptParseException("Can't find config file: '%s'" %
                                        (options.userConfigPath))

            updateIniSections(iniSections,
                              getIniSections(options.userConfigPath))

            # reparse with updated default values:
            parser = self._getOptionParser(iniSections[primary_section],
                                           configFileName,
                                           cmdlineScriptDir,
                                           version=version,
                                           configHelp=configHelp)
            (options, args) = parser.parse_args()
        else:
            if not os.path.isfile(globalConfigPath):
                raise OptParseException(
                    "Can't find default config file: '%s'" %
                    (globalConfigPath))

        if options.isAllHelp:
            # this second call to getOptionParser is only here to provide the extended help option:
            parser = self._getOptionParser(iniSections[primary_section],
                                           configFileName,
                                           cmdlineScriptDir,
                                           True,
                                           version=version,
                                           configHelp=configHelp)
            parser.print_help()
            sys.exit(2)

        nargs = len(args)
        if nargs:
            plural = ""
            if nargs > 1:
                plural = "s"
            raise OptParseException("%i unrecognized argument%s:\n%s" %
                                    (nargs, plural, "\n".join(
                                        ["'" + arg + "'" for arg in args])))

        self.validateAndSanitizeOptions(options)

        # write options object back into full iniSections object:
        #
        # items() instead of the Python 2-only iteritems(), so this loop
        # runs on both Python 2 and Python 3
        for k, v in vars(options).items():
            if k == "isAllHelp":
                continue
            iniSections[primary_section][k] = v

    except OptParseException as e:
        noArgOrError(parser, str(e))

    return options, iniSections
def checkRequired(bamList, label):
    """
    Require that at least one alignment file was given for a sample.

    bamList -- list of alignment file paths, or None
    label   -- sample label inserted into the error message

    Raises OptParseException when bamList is None or empty.
    """
    if (bamList is not None) and (len(bamList) > 0):
        return
    raise OptParseException("No %s sample BAM/CRAM files specified" % (label))
def checkForBamIndex(bamFile):
    """
    Require the index file expected at the BAM file's path plus '.bai'.

    bamFile -- path of the BAM file

    Raises OptParseException when that index path is not an existing file.
    """
    expectedIndex = bamFile + ".bai"
    if os.path.isfile(expectedIndex):
        return
    raise OptParseException(
        "Can't find expected BAM index file: '%s'" % (expectedIndex))