def test_init(self):
    """Walk through the TempFileManager API: register, query, clean up."""
    manager = TempFileManager()
    manager.SetRootDir("/scratch")

    # A newly registered temporary file is expected to exist on disk.
    tmpFile = manager.RegisterNewTmpFile()
    self.assertTrue(path.isfile(tmpFile))

    # An already existing directory can be registered.
    knownDir = manager.RegisterExistingTmpFile("/tmp", isDir=True)
    self.assertTrue(path.isdir(knownDir))

    # Registering a path that does not exist must raise IOError.
    with self.assertRaises(IOError):
        manager.RegisterExistingTmpFile("filethatdoesnotexist")

    # A newly registered temporary directory exists and is tracked.
    tmpDir = manager.RegisterNewTmpFile(isDir=True)
    self.assertTrue(path.isdir(tmpDir))
    self.assertTrue(manager._isRegistered(tmpDir))

    # The requested suffix is honoured in the generated file name.
    txtFile = manager.RegisterNewTmpFile(suffix=".txt")
    self.assertTrue(txtFile.endswith(".txt"))

    manager.SetRootDir("~/tmp/")

    # After clean up the new file and directory are gone and both
    # registries are empty.
    manager.CleanUp()
    for removed in (tmpFile, tmpDir):
        self.assertFalse(path.exists(removed))
    self.assertEqual(manager.fileDB, [])
    self.assertEqual(manager.dirDB, [])
def __init__(self, args=None, argumentList=(),
             output_dataset_type=AlignmentSet):
    """Set up a PBAlignRunner object.

    Input:
        args        : already parsed arguments. If None, argumentList is
                      parsed with the contract parser instead.
        argumentList: a list of arguments, such as:
                      ['--debug', '--maxHits', '10', 'in.fasta',
                       'ref.fasta', 'out.sam']
        output_dataset_type: stored on the instance as
                      _output_dataset_type.
    """
    if args is None:  # FIXME unit testing hack
        parser = get_contract_parser().arg_parser.parser
        args = parser.parse_args(argumentList)
    self.args = args

    # args.verbosity is computed by counting # of 'v's in '-vv...'.
    # However in parseOptions, arguments are parsed twice to import config
    # options and then overwrite them with argumentList (e.g. command-line)
    # options. The recomputation below is left disabled.
    #self.args.verbosity = 1 if (self.args.verbosity is None) else \
    #    (int(self.args.verbosity) / 2 + 1)

    description = "Utilities for aligning PacBio reads to reference sequences."
    super(PBAlignRunner, self).__init__(description)

    self._output_dataset_type = output_dataset_type
    # Services are created later; only the bookkeeping objects exist now.
    self._alnService = None
    self._filterService = None
    self.fileNames = PBAlignFiles()
    self._tempFileManager = TempFileManager()
def __init__(self, options, fileNames, tempFileManager=None):
    """Set up an AlignService object.

    Registers the input/output files, merges options given within
    --algorithmOptions, patches in default options, and then adopts or
    creates a temporary file manager.

    Input:
        options  : options parsed from (a list of arguments and
                   a config file if --configFile is specified).
        fileNames: an object of PBAlignFiles
        tempFileManager: a temporary file manager. If it is None,
                   a new one is created; otherwise the given manager
                   is re-rooted at options.tmpDir.
    """
    self._options = options
    self._fileNames = fileNames

    # Verify and assign input & output files.
    fileNames.SetInOutFiles(options.inputFileName,
                            options.referencePath,
                            options.outputFileName,
                            options.regionTable,
                            options.pulseFile)

    # Options given within --algorithmOptions are reconciled with the
    # ones parsed from the argument list or a config file.
    self._options = self._resolveAlgorithmOptions(self._options,
                                                  self._fileNames)

    # Fill in PBalign defaults for anything still unspecified.
    self._options = importDefaultOptions(self._options)[0]

    tmpRoot = self._options.tmpDir
    if tempFileManager is not None:
        self._tempFileManager = tempFileManager
        self._tempFileManager.SetRootDir(tmpRoot)
    else:
        self._tempFileManager = TempFileManager(tmpRoot)

    # At this point the options are final.
    logging.debug("Parsed arguments considering configFile and "
                  "algorithmOptions: " + str(self._options))
class PBAlignRunner(PBToolRunner):
    """Tool runner: align, filter, post-process and write the output."""

    def __init__(self, args=None, argumentList=(),
                 output_dataset_type=AlignmentSet):
        """Set up a PBAlignRunner object.

        Input:
            args        : already parsed arguments. If None, argumentList
                          is parsed with the contract parser instead.
            argumentList: a list of arguments, such as:
                          ['--debug', '--maxHits', '10', 'in.fasta',
                           'ref.fasta', 'out.sam']
            output_dataset_type: dataset class used when writing XML
                          output (see _output).
        """
        if args is None:  # FIXME unit testing hack
            parser = get_contract_parser().arg_parser.parser
            args = parser.parse_args(argumentList)
        self.args = args

        # args.verbosity is computed by counting # of 'v's in '-vv...'.
        # However in parseOptions, arguments are parsed twice to import
        # config options and then overwrite them with argumentList (e.g.
        # command-line) options. The recomputation below is left disabled.
        #self.args.verbosity = 1 if (self.args.verbosity is None) else \
        #    (int(self.args.verbosity) / 2 + 1)

        description = \
            "Utilities for aligning PacBio reads to reference sequences."
        super(PBAlignRunner, self).__init__(description)

        self._output_dataset_type = output_dataset_type
        self._alnService = None
        self._filterService = None
        self.fileNames = PBAlignFiles()
        self._tempFileManager = TempFileManager()

    def _setupParsers(self, description):
        """No-op."""
        pass

    def _addStandardArguments(self):
        """No-op."""
        pass

    def getVersion(self):
        """Return version."""
        return get_version()

    def _createAlignService(self, name, args, fileNames, tempFileManager):
        """Build the AlignService matching an algorithm name.

        Input:
            name           : an algorithm name such as blasr
            args           : pbalign options
            fileNames      : an PBAlignFiles object
            tempFileManager: a temporary file manager
        Output:
            an object of AlignService subclass (such as BlasrService),
            after its checkAvailability() has been called.
        Raises:
            ValueError if the name is not in ALGORITHM_CANDIDATES or no
            service is implemented for it.
        """
        if name not in ALGORITHM_CANDIDATES:
            errMsg = "ERROR: unrecognized algorithm {algo}".format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        if name == "blasr":
            serviceClass = BlasrService
        elif name == "bowtie":
            serviceClass = BowtieService
        elif name == "gmap":
            serviceClass = GMAPService
        else:
            errMsg = "Service for {algo} is not implemented.".\
                     format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service = serviceClass(args, fileNames, tempFileManager)
        service.checkAvailability()
        return service

    def _makeSane(self, args, fileNames):
        """Validate the combination of arguments and file formats.

        Sets args.readType to "CCS" for de novo CCS or CCS input, warns
        about --forQuiver, and raises IOError for CMP.H5 output or
        ValueError for option combinations that BAM/XML output does not
        support.
        """
        denovoCcs = args.useccs == "useccsdenovo"
        ccsInput = fileNames.inputFileFormat == FILE_FORMATS.CCS
        if denovoCcs or ccsInput:
            args.readType = "CCS"

        if args.forQuiver:
            logging.warning("Option --forQuiver has been deprecated in 3.0")

        outFormat = getFileFormat(fileNames.outputFileName)

        if outFormat == FILE_FORMATS.CMP:
            raise IOError("pbalign no longer supports CMP.H5 Output in 3.0.")

        if outFormat in (FILE_FORMATS.BAM, FILE_FORMATS.XML):
            if args.algorithm != "blasr":
                raise ValueError(
                    "Must choose blasr in order to output a bam file.")
            if args.filterAdapterOnly:
                raise ValueError(
                    "-filterAdapter does not work when out format is BAM.")

    def _parseArgs(self):
        """Overwrite ToolRunner.parseArgs(self); currently a no-op.

        Parse PBAlignRunner arguments considering both args in
        argumentList and args in a config file (specified by
        --configFile).
        """
        pass

    def _output(self, inSam, refFile, outFile, readType=None):
        """Produce the final SAM, BAM or XML output.

        Input:
            inSam   : an input SAM/BAM file.
                      (e.g. fileName.filteredSam)
            refFile : the reference file.
                      (e.g. fileName.targetFileName)
            outFile : the output file (i.e. fileName.outputFileName)
            readType: standard or cDNA or CCS (can be None if not
                      specified)
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError if moving the SAM file fails with shutil.Error,
            IOError if CMP.H5 output is requested.
        """
        output, errCode, errMsg = "", 0, ""
        outFormat = getFileFormat(outFile)

        # BAM output needs no work here, so there is no branch for it.
        if outFormat == FILE_FORMATS.SAM:
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move %s as %s", inSam, outFile)
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                errMsg = "Exited with error: " + str(e)
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.CMP:
            errMsg = "pbalign no longer supports CMP.H5 Output in 3.0."
            logging.error(errMsg)
            raise IOError(errMsg)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file %s %s",
                         inSam, outFile)
            # Create {out}.xml, given {out}.bam
            outBam = str(outFile[0:-3]) + "bam"
            # FIXME This should really be more automatic
            if readType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            dataset = self._output_dataset_type(real_ppath(outBam))
            for resource in dataset.externalResources:
                resource.reference = refFile
            dataset.write(outFile)

        return output, errCode, errMsg

    def _cleanUp(self, realDelete=False):
        """Clean up temporary files and intermediate results."""
        logging.debug("Clean up temporary files and directories.")
        self._tempFileManager.CleanUp(realDelete)

    def run(self):
        """The main function, it is called by PBToolRunner.start().

        Returns 0 after alignment, filtering, optional BAM
        post-processing, output and temporary file clean up.
        """
        startTime = time.time()
        logging.info("pbalign version: %s", get_version())
        #logging.debug("Original arguments: " + str(self._argumentList))

        # Pick the align service from the algorithm name, then validate
        # the arguments before running it.
        self._alnService = self._createAlignService(self.args.algorithm,
                                                    self.args,
                                                    self.fileNames,
                                                    self._tempFileManager)
        self._makeSane(self.args, self.fileNames)
        self._alnService.run()

        # FilterService writes into a temporary SAM/BAM file.
        outFormat = getFileFormat(self.fileNames.outputFileName)
        wantsBam = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
        if wantsBam:
            suffix = ".bam"
        else:
            suffix = ".sam"
        self.fileNames.filteredSam = \
            self._tempFileManager.RegisterNewTmpFile(suffix=suffix)

        # Call filter service on SAM or BAM file.
        self._filterService = FilterService(
            self.fileNames.alignerSamOut,
            self.fileNames.targetFileName,
            self.fileNames.filteredSam,
            self.args.algorithm,
            #self._alnService.name,
            self._alnService.scoreSign,
            self.args,
            self.fileNames.adapterGffFileName)
        self._filterService.run()

        if wantsBam:
            # Sort/make index for BAM output.
            BamPostService(self.fileNames).run()

        # Output all hits in SAM, BAM.
        self._output(inSam=self.fileNames.filteredSam,
                     refFile=self.fileNames.targetFileName,
                     outFile=self.fileNames.outputFileName,
                     readType=self.args.readType)

        # Really delete temporary files unless keepTmpFiles is True.
        keepTmpFiles = getattr(self.args, "keepTmpFiles", False) is True
        self._cleanUp(not keepTmpFiles)

        elapsed = float(time.time() - startTime)
        logging.info("Total time: {:.2f} s.".format(elapsed))
        return 0
class AlignService(Service):
    """Base class shared by every alignment service.

    An AlignService takes argument options as input and generates a SAM
    file as output.

    Non-abstract subclasses should define the following properties.
        name        : name of the subclass align service
        availability: availability of the subclass align service
        scoreSign   : score sign of the subclass align service
    Subclasses should override the following virtual methods.
        _preProcess()
        _toCmd()
        _postProcess()
    If --algorithmOptions needs to be supported by a subclass, override
        _resolveAlgorithmOptions().
    """

    @property
    def scoreSign(self):
        """Align service score sign can be -1 or 1.

        -1: negative scores are better than positive ones.
        1: positive scores are better than negative ones.
        """
        raise NotImplementedError(
            "Virtual property scoreSign() for AlignService must be "
            "overwritten.")

    def _resolveAlgorithmOptions(self, options, fileNames):
        """Virtual hook that merges --algorithmOptions into the options.

        Input:
            options  : options parsed from a command-line and a config
                       file.
            fileNames: an PBAlignFiles object.
        Output: new options; here a copy of options when
                --algorithmOptions is None or empty.
        Raises:
            NotImplementedError if --algorithmOptions is specified and
            the subclass did not override this method.
        """
        algorithmOptions = options.algorithmOptions
        if algorithmOptions is not None and algorithmOptions != "":
            raise NotImplementedError(
                "_resolveAlgorithmOptions() method for AlignService must be "
                "overridden if --algorithmOptions is specified.")
        return copy(options)

    def __init__(self, options, fileNames, tempFileManager=None):
        """Set up an AlignService object.

        Registers the input/output files, merges options given within
        --algorithmOptions, patches in default options, and then adopts
        or creates a temporary file manager.

        Input:
            options  : options parsed from (a list of arguments and
                       a config file if --configFile is specified).
            fileNames: an object of PBAlignFiles
            tempFileManager: a temporary file manager. If it is None,
                       a new one is created; otherwise the given manager
                       is re-rooted at options.tmpDir.
        """
        self._options = options
        self._fileNames = fileNames

        # Verify and assign input & output files.
        fileNames.SetInOutFiles(options.inputFileName,
                                options.referencePath,
                                options.outputFileName,
                                options.regionTable,
                                options.pulseFile)

        # Options given within --algorithmOptions are reconciled with the
        # ones parsed from the argument list or a config file.
        self._options = self._resolveAlgorithmOptions(self._options,
                                                      self._fileNames)

        # Fill in PBalign defaults for anything still unspecified.
        self._options = importDefaultOptions(self._options)[0]

        tmpRoot = self._options.tmpDir
        if tempFileManager is not None:
            self._tempFileManager = tempFileManager
            self._tempFileManager.SetRootDir(tmpRoot)
        else:
            self._tempFileManager = TempFileManager(tmpRoot)

        # At this point the options are final.
        logging.debug("Parsed arguments considering configFile and "
                      "algorithmOptions: " + str(self._options))

    @property
    def cmd(self):
        """String of a command line to align reads."""
        return self._toCmd(self._options,
                           self._fileNames,
                           self._tempFileManager)

    def _toCmd(self, options, fileNames, tempFileManager):
        """Virtual hook that builds the aligner command line.

        Input:
            options  : arguments parsed from the command-line, the
                       config file and --algorithmOptions.
            fileNames: an PBAlignFiles object.
            tempFileManager: temporary file manager.
        Output:
            a command-line string which can be used in bash.
        """
        raise NotImplementedError(
            "_toCmd() method for AlignService must be overridden")

    def _preProcess(self, inputFileName, referenceFile, regionTable,
                    noSplitSubreads, tempFileManager, isWithinRepository):
        """Virtual hook that prepares the aligner inputs.

        Input:
            inputFileName  : a PacBio BASE/PULSE/FOFN file.
            referenceFile  : a FASTA reference file.
            regionTable    : a region table RGN.H5/FOFN file.
            noSplitSubreads: whether to split subreads or not.
            tempFileManager: temporary file manager.
            isWithinRepository: whether or not the reference is within
                a refererence repository.
        Output:
            String, a FASTA file which can be used by the aligner.
        """
        raise NotImplementedError(
            "_preProcess() method for AlignService must be overridden")

    def _postProcess(self):
        """Virtual hook that post-processes the generated output file."""
        raise NotImplementedError(
            "_postProcess() method for AlignService must be overridden")

    def run(self):
        """Run the align service end to end.

        Prepares the query file, registers a temporary aligner output
        file, executes the aligner and post-processes the result.

        Output:
            output, errCode, errMsg as returned by self._execute().
        """
        serviceName = self.name
        logging.info(serviceName +
                     ": Align reads to references using {prog}.".format(
                         prog=self.progName))

        files = self._fileNames

        # Prepare inputs for the aligner.
        files.queryFileName = self._preProcess(
            files.inputFileName,
            files.targetFileName,
            files.regionTable,
            self._options.noSplitSubreads,
            self._tempFileManager,
            files.isWithinRepository)

        # The aligner writes BAM when the final output is BAM or XML,
        # and SAM otherwise.
        outFormat = getFileFormat(files.outputFileName)
        if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
            suffix = ".bam"
        else:
            suffix = ".sam"
        files.alignerSamOut = \
            self._tempFileManager.RegisterNewTmpFile(suffix=suffix)

        # Generate and execute cmd. Any RuntimeError (including
        # subclasses) is re-raised as a plain RuntimeError.
        try:
            result = self._execute()
        except RuntimeError as e:
            raise RuntimeError(str(e))
        output, errCode, errMsg = result

        # Post process the results.
        self._postProcess()

        return output, errCode, errMsg
class PBAlignRunner(PBToolRunner):
    """Tool runner: align, filter, post-process and write the output.

    run() creates an align service, validates the arguments, runs the
    aligner and the filter service, optionally post-processes BAM output,
    writes the final SAM/BAM/CMP.H5/XML file, optionally loads pulse QVs
    and finally cleans up temporary files.
    """

    def __init__(self, args=None, argumentList=(),
                 output_dataset_type=AlignmentSet):
        """Initialize a PBAlignRunner object.

        Input:
            args        : already parsed arguments. If None, argumentList
                          is parsed with the contract parser instead.
            argumentList: a list of arguments, such as:
                          ['--debug', '--maxHits', '10', 'in.fasta',
                           'ref.fasta', 'out.sam']
            output_dataset_type: dataset class used when writing XML
                          output (see _output).
        """
        desc = "Utilities for aligning PacBio reads to reference sequences."
        if args is None:  # FIXME unit testing hack
            args = get_contract_parser().arg_parser.parser.parse_args(
                argumentList)
        self.args = args
        # args.verbosity is computed by counting # of 'v's in '-vv...'.
        # However in parseOptions, arguments are parsed twice to import
        # config options and then overwrite them with argumentList (e.g.
        # command-line) options.
        #self.args.verbosity = 1 if (self.args.verbosity is None) else \
        #    (int(self.args.verbosity) / 2 + 1)
        super(PBAlignRunner, self).__init__(desc)
        self._output_dataset_type = output_dataset_type
        self._alnService = None
        self._filterService = None
        self.fileNames = PBAlignFiles()
        self._tempFileManager = TempFileManager()

    def _setupParsers(self, description):
        """No-op."""
        pass

    def _addStandardArguments(self):
        """No-op."""
        pass

    def getVersion(self):
        """Return version."""
        return get_version()

    def _createAlignService(self, name, args, fileNames, tempFileManager):
        """Create and return an AlignService by algorithm name.

        Input:
            name           : an algorithm name such as blasr
            args           : pbalign options
            fileNames      : an PBAlignFiles object
            tempFileManager: a temporary file manager
        Output:
            an object of AlignService subclass (such as BlasrService),
            after its checkAvailability() has been called.
        Raises:
            ValueError if the name is not in ALGORITHM_CANDIDATES or no
            service is implemented for it.
        """
        if name not in ALGORITHM_CANDIDATES:
            errMsg = "ERROR: unrecognized algorithm {algo}".format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service = None
        if name == "blasr":
            service = BlasrService(args, fileNames, tempFileManager)
        elif name == "bowtie":
            service = BowtieService(args, fileNames, tempFileManager)
        elif name == "gmap":
            service = GMAPService(args, fileNames, tempFileManager)
        else:
            errMsg = "Service for {algo} is not implemented.".\
                     format(algo=name)
            logging.error(errMsg)
            raise ValueError(errMsg)

        service.checkAvailability()
        return service

    def _makeSane(self, args, fileNames):
        """Check whether the input arguments make sense or not.

        Side effects on args: readType is set to "CCS" for de novo CCS or
        CCS input, and --forQuiver turns on loadQVs.

        Raises:
            ValueError if --forQuiver is combined with --useccs, if pulse
            QVs are requested without a pulse file or without cmp.h5
            output, or if BAM/XML output is combined with a non-blasr
            algorithm or with filterAdapterOnly.
        """
        errMsg = ""
        if args.useccs == "useccsdenovo":
            args.readType = "CCS"

        if fileNames.inputFileFormat == FILE_FORMATS.CCS:
            args.readType = "CCS"

        if args.forQuiver:
            if args.useccs is not None:
                # The fragments are joined with explicit spaces so that
                # no two words run together in the final message.
                errMsg = "Options --forQuiver and --useccs should not " + \
                         "be used together, since Quiver is not " + \
                         "designed to polish ccs reads. if you want to " + \
                         "align ccs reads in cmp.h5 format with pulse " + \
                         "QVs loaded, use --loadQVs with --useccs instead."
                raise ValueError(errMsg)
            args.loadQVs = True

        outFormat = getFileFormat(fileNames.outputFileName)

        if args.loadQVs:
            # When both checks fail, the output-format message replaces
            # the pulse-file one.
            if fileNames.pulseFileName is None:
                errMsg = "The input file has to be in bas/pls/ccs.h5 " + \
                         "format, or --pulseFile needs to be specified, "
            if outFormat != FILE_FORMATS.CMP:
                errMsg = "The output file has to be in cmp.h5 format, "
            if errMsg != "":
                errMsg += "in order to load pulse QVs."
                logging.error(errMsg)
                raise ValueError(errMsg)

        if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
            if args.algorithm != "blasr":
                errMsg = "Must choose blasr in order to output a bam file."
                raise ValueError(errMsg)
            if args.filterAdapterOnly:
                errMsg = "-filterAdapter does not work when out format " + \
                         "is BAM."
                raise ValueError(errMsg)

    def _parseArgs(self):
        """Overwrite ToolRunner.parseArgs(self); currently a no-op.

        Parse PBAlignRunner arguments considering both args in
        argumentList and args in a config file (specified by
        --configFile).
        """
        pass

    def _output(self, inSam, refFile, outFile, readType=None,
                smrtTitle=False):
        """Generate a SAM, BAM, CMP.H5 or XML file.

        Input:
            inSam    : an input SAM/BAM file.
                       (e.g. fileName.filteredSam)
            refFile  : the reference file.
                       (e.g. fileName.targetFileName)
            outFile  : the output SAM/BAM, CMP.H5 or XML file.
                       (i.e. fileName.outputFileName)
            readType : standard or cDNA or CCS. If None, self.args.readType
                       is used when choosing the XML dataset type.
            smrtTitle: whether to pass -smrtTitle to samtoh5.
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError if moving the SAM file fails with shutil.Error or
            if samtoh5 reports a non-zero error code.
        """
        output, errCode, errMsg = "", 0, ""
        # prog names the step in the shared error message below. It must
        # be bound on every path, otherwise a failed SAM move would raise
        # UnboundLocalError instead of the intended RuntimeError.
        prog = "OutputService"
        outFormat = getFileFormat(outFile)

        # BAM output needs no work here, so there is no branch for it.
        if outFormat == FILE_FORMATS.SAM:
            prog = "shutil.move"
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move {src} as {dst}".format(
                src=inSam, dst=outFile))
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                output, errCode, errMsg = "", 1, str(e)
        elif outFormat == FILE_FORMATS.CMP:
            # `samtoh5 inSam outFile -readType readType
            logging.info("OutputService: Genearte the output CMP.H5 " +
                         "file using samtoh5.")
            prog = "samtoh5"
            cmd = "samtoh5 {samFile} {refFile} {outFile}".format(
                samFile=inSam, refFile=refFile, outFile=outFile)
            if readType is not None:
                cmd += " -readType {0} ".format(readType)
            if smrtTitle:
                cmd += " -smrtTitle "
            # Execute the command line
            logging.debug("OutputService: Call \"{0}\"".format(cmd))
            output, errCode, errMsg = backticks(cmd)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file "
                         "%s %s", inSam, outFile)
            # Create {out}.xml, given {out}.bam
            outBam = str(outFile[0:-3]) + "bam"
            # Honour the readType parameter; fall back to the parsed
            # arguments so callers that omit it keep the old behaviour.
            effectiveReadType = readType
            if effectiveReadType is None:
                effectiveReadType = self.args.readType
            # FIXME This should really be more automatic
            if effectiveReadType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            aln = self._output_dataset_type(real_ppath(outBam))
            for res in aln.externalResources:
                res.reference = refFile
            aln.write(outFile)

        if errCode != 0:
            errMsg = prog + " returned a non-zero exit status. " + errMsg
            logging.error(errMsg)
            raise RuntimeError(errMsg)
        return output, errCode, errMsg

    def _cleanUp(self, realDelete=False):
        """Clean up temporary files and intermediate results."""
        logging.debug("Clean up temporary files and directories.")
        self._tempFileManager.CleanUp(realDelete)

    def run(self):
        """The main function, it is called by PBToolRunner.start().

        Output:
            0 on success, 1 as soon as one of the services or the output
            step raises RuntimeError. Temporary files are only cleaned up
            on the success path.
        """
        startTime = time.time()
        logging.info(
            "pbalign version: {version}".format(version=get_version()))
        # FIXME
        #logging.debug("Original arguments: " + str(self._argumentList))

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(self.args.algorithm,
                                                    self.args,
                                                    self.fileNames,
                                                    self._tempFileManager)

        # Make sane.
        self._makeSane(self.args, self.fileNames)

        # Run align service.
        try:
            self._alnService.run()
        except RuntimeError:
            return 1

        # Create a temporary filtered SAM/BAM file as output for
        # FilterService.
        outFormat = getFileFormat(self.fileNames.outputFileName)
        bamLikeOutput = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
        suffix = ".bam" if bamLikeOutput else ".sam"
        self.fileNames.filteredSam = self._tempFileManager.\
            RegisterNewTmpFile(suffix=suffix)

        # Call filter service on SAM or BAM file.
        self._filterService = FilterService(
            self.fileNames.alignerSamOut,
            self.fileNames.targetFileName,
            self.fileNames.filteredSam,
            self.args.algorithm,
            #self._alnService.name,
            self._alnService.scoreSign,
            self.args,
            self.fileNames.adapterGffFileName)
        try:
            self._filterService.run()
        except RuntimeError:
            return 1

        # Sort bam before output
        if bamLikeOutput:
            # Sort/make index for BAM output.
            try:
                BamPostService(self.fileNames).run()
            except RuntimeError:
                return 1

        # Output all hits in SAM, BAM or CMP.H5.
        try:
            useSmrtTitle = (
                self.args.algorithm != "blasr" or
                self.fileNames.inputFileFormat == FILE_FORMATS.FASTA)
            self._output(inSam=self.fileNames.filteredSam,
                         refFile=self.fileNames.targetFileName,
                         outFile=self.fileNames.outputFileName,
                         readType=self.args.readType,
                         smrtTitle=useSmrtTitle)
        except RuntimeError:
            return 1

        # Load QVs to cmp.h5 for Quiver. _makeSane() has already rejected
        # loadQVs without cmp.h5 output and turns loadQVs on for
        # forQuiver, so the explicit grouping matches the old implicit
        # "(A and B) or C" evaluation.
        if outFormat == FILE_FORMATS.CMP and \
                (self.args.forQuiver or self.args.loadQVs):
            # Call post service for quiver.
            try:
                ForQuiverService(self.fileNames, self.args).run()
            except RuntimeError:
                return 1

        # Really delete temporary files unless keepTmpFiles is True.
        keepTmpFiles = getattr(self.args, "keepTmpFiles", False) is True
        self._cleanUp(not keepTmpFiles)

        endTime = time.time()
        logging.info("Total time: {:.2f} s.".format(
            float(endTime - startTime)))
        return 0