예제 #1
0
 def test_isValidOutputFormat(self):
     """Check isValidOutputFormat() on formats derived from file names.

     Only the ".sam" and ".cmp.h5" names are expected to be accepted as
     output formats; every other name listed here must be rejected.
     """
     rejectedNames = ("ab.fasta", "ab.fa", "ab.pls.h5", "ab.plx.h5",
                      "ab.bas.h5", "ab.bax.h5", "ab.fofn")
     for fileName in rejectedNames:
         self.assertFalse(isValidOutputFormat(getFileFormat(fileName)))

     for fileName in ("ab.sam", "ab.cmp.h5"):
         self.assertTrue(isValidOutputFormat(getFileFormat(fileName)))

     # A name with an arbitrary extension must be rejected as well.
     self.assertFalse(isValidOutputFormat(getFileFormat("ab.xyz")))
예제 #2
0
 def test_isValidOutputFormat(self):
     """Verify isValidOutputFormat() against a table of file names.

     Each entry pairs a file name with whether the format returned by
     getFileFormat() for that name should be a valid output format.
     """
     expectations = [
         ("ab.fasta", False),
         ("ab.fa", False),
         ("ab.pls.h5", False),
         ("ab.plx.h5", False),
         ("ab.bas.h5", False),
         ("ab.bax.h5", False),
         ("ab.fofn", False),
         ("ab.sam", True),
         ("ab.cmp.h5", True),
         ("ab.xyz", False),
     ]
     for fileName, shouldBeValid in expectations:
         verdict = isValidOutputFormat(getFileFormat(fileName))
         if shouldBeValid:
             self.assertTrue(verdict)
         else:
             self.assertFalse(verdict)
예제 #3
0
    def run(self):
        """Run the align service: pre-process, execute, post-process.

        Returns:
            The (output, errCode, errMsg) tuple returned by self._execute().

        Raises:
            RuntimeError: raised anew, carrying the same message, when
                self._execute() raises RuntimeError.
        """
        progDescription = "{prog}.".format(prog=self.progName)
        logging.info(self.name + ": Align reads to references using " +
                     progDescription)

        # Prepare the query file that is handed to the aligner.
        queryFileName = self._preProcess(
            self._fileNames.inputFileName,
            self._fileNames.targetFileName,
            self._fileNames.regionTable,
            self._options.noSplitSubreads,
            self._tempFileManager,
            self._fileNames.isWithinRepository)
        self._fileNames.queryFileName = queryFileName

        # The aligner writes BAM when the final output is BAM or XML,
        # and SAM for every other output format.
        outFormat = getFileFormat(self._fileNames.outputFileName)
        if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
            suffix = ".bam"
        else:
            suffix = ".sam"
        tmpAlignerOut = self._tempFileManager.RegisterNewTmpFile(suffix=suffix)
        self._fileNames.alignerSamOut = tmpAlignerOut

        # Generate and execute the aligner command.
        try:
            result = self._execute()
        except RuntimeError as e:
            raise RuntimeError(str(e))
        output, errCode, errMsg = result

        # Post process the results.
        self._postProcess()

        return output, errCode, errMsg
예제 #4
0
    def _makeSane(self, args, fileNames):
        """Validate the parsed arguments against the requested output format.

        Side effects:
            Sets args.readType to "CCS" when args.useccs is "useccsdenovo"
            or when the input file format is CCS, and logs a deprecation
            warning when args.forQuiver is set.

        Raises:
            IOError: the output file format is CMP.H5.
            ValueError: BAM or XML output is requested with an algorithm
                other than "blasr", or together with args.filterAdapterOnly.
        """
        ccsRequested = args.useccs == "useccsdenovo"
        ccsInput = fileNames.inputFileFormat == FILE_FORMATS.CCS
        if ccsRequested or ccsInput:
            args.readType = "CCS"

        if args.forQuiver:
            logging.warning("Option --forQuiver has been deprecated in 3.0")

        outFormat = getFileFormat(fileNames.outputFileName)
        if outFormat == FILE_FORMATS.CMP:
            raise IOError("pbalign no longer supports CMP.H5 Output in 3.0.")

        # The remaining checks only concern BAM and XML output.
        if outFormat != FILE_FORMATS.BAM and outFormat != FILE_FORMATS.XML:
            return

        if args.algorithm != "blasr":
            raise ValueError(
                "Must choose blasr in order to output a bam file.")
        if args.filterAdapterOnly:
            raise ValueError(
                "-filterAdapter does not work when out format is BAM.")
예제 #5
0
    def _makeSane(self, args, fileNames):
        """Check whether the input arguments make sense or not.

        Side effects:
            Sets args.readType to "CCS" when args.useccs is "useccsdenovo"
            or when the input file format is CCS. Sets args.loadQVs to True
            when args.forQuiver is set.

        Raises:
            ValueError: --forQuiver is combined with --useccs, or pulse QVs
                are to be loaded while no pulse file is known and/or the
                output file is not in cmp.h5 format.
        """
        if args.useccs == "useccsdenovo":
            args.readType = "CCS"

        if fileNames.inputFileFormat == FILE_FORMATS.CCS:
            args.readType = "CCS"

        if args.forQuiver:
            if args.useccs is not None:
                # Adjacent literals are joined by the parser; each fragment
                # ends with a space so that no two words run together.
                errMsg = ("Options --forQuiver and --useccs should not "
                          "be used together, since Quiver is not designed to "
                          "polish ccs reads. if you want to align ccs reads "
                          "in cmp.h5 format with pulse QVs loaded, use "
                          "--loadQVs with --useccs instead.")
                raise ValueError(errMsg)
            args.loadQVs = True

        if args.loadQVs:
            # Collect every violated requirement so that a missing pulse
            # file is still reported when the output format is wrong too.
            problems = []
            if fileNames.pulseFileName is None:
                problems.append(
                    "The input file has to be in bas/pls/ccs.h5 "
                    "format, or --pulseFile needs to be specified, ")
            if getFileFormat(fileNames.outputFileName) != FILE_FORMATS.CMP:
                problems.append(
                    "The output file has to be in cmp.h5 format, ")
            if problems:
                errMsg = "".join(problems) + "in order to load pulse QVs."
                logging.error(errMsg)
                raise ValueError(errMsg)
예제 #6
0
파일: align.py 프로젝트: lpp1985/lpp_Script
    def run(self):
        """Run the align service: pre-process, execute, post-process.

        Returns:
            The (output, errCode, errMsg) tuple returned by self._execute().

        Raises:
            RuntimeError: raised anew, carrying the same message, when
                self._execute() raises RuntimeError.
        """
        progDescription = "{prog}.".format(prog=self.progName)
        logging.info(self.name + ": Align reads to references using " +
                     progDescription)

        # Prepare the query file that is handed to the aligner.
        queryFileName = self._preProcess(
            self._fileNames.inputFileName, self._fileNames.targetFileName,
            self._fileNames.regionTable, self._options.noSplitSubreads,
            self._tempFileManager, self._fileNames.isWithinRepository)
        self._fileNames.queryFileName = queryFileName

        # The aligner writes BAM when the final output is BAM or XML,
        # and SAM for every other output format.
        outFormat = getFileFormat(self._fileNames.outputFileName)
        if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
            suffix = ".bam"
        else:
            suffix = ".sam"
        tmpAlignerOut = self._tempFileManager.RegisterNewTmpFile(suffix=suffix)
        self._fileNames.alignerSamOut = tmpAlignerOut

        # Generate and execute the aligner command.
        try:
            result = self._execute()
        except RuntimeError as e:
            raise RuntimeError(str(e))
        output, errCode, errMsg = result

        # Post process the results.
        self._postProcess()

        return output, errCode, errMsg
예제 #7
0
    def _makeSane(self, args, fileNames):
        """Validate the parsed arguments against the requested output format.

        Side effects:
            Sets args.readType to "CCS" when args.useccs is "useccsdenovo"
            or when the input file format is CCS, and logs a deprecation
            warning when args.forQuiver is set.

        Raises:
            IOError: the output file format is CMP.H5.
            ValueError: BAM or XML output is requested with an algorithm
                other than "blasr", or together with args.filterAdapterOnly.
        """
        ccsRequested = args.useccs == "useccsdenovo"
        ccsInput = fileNames.inputFileFormat == FILE_FORMATS.CCS
        if ccsRequested or ccsInput:
            args.readType = "CCS"

        if args.forQuiver:
            logging.warning("Option --forQuiver has been deprecated in 3.0")

        outFormat = getFileFormat(fileNames.outputFileName)
        if outFormat == FILE_FORMATS.CMP:
            raise IOError("pbalign no longer supports CMP.H5 Output in 3.0.")

        # The remaining checks only concern BAM and XML output.
        if outFormat != FILE_FORMATS.BAM and outFormat != FILE_FORMATS.XML:
            return

        if args.algorithm != "blasr":
            raise ValueError(
                "Must choose blasr in order to output a bam file.")
        if args.filterAdapterOnly:
            raise ValueError(
                "-filterAdapter does not work when out format is BAM.")
예제 #8
0
    def _output(self, inSam, refFile, outFile, readType=None, smrtTitle=False):
        """Generate a SAM, BAM, CMP.H5 or XML output file.

        Input:
            inSam    : an input SAM/BAM file. (e.g. fileName.filteredSam)
            refFile  : the reference file. (e.g. fileName.targetFileName)
            outFile  : the output SAM/BAM, CMP.H5 or XML file.
                       (i.e. fileName.outputFileName)
            readType : standard or cDNA or CCS (can be None if not specified);
                       passed to samtoh5 for CMP.H5 output. Note that the XML
                       branch consults self.args.readType instead.
            smrtTitle: when true, " -smrtTitle " is added to the samtoh5
                       command (CMP.H5 output only).
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError: moving the SAM file failed, or samtoh5 returned a
                          non-zero exit status.
        """
        output, errCode, errMsg = "", 0, ""

        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.BAM:
            pass  # Nothing to be done
        elif outFormat == FILE_FORMATS.SAM:
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move {src} as {dst}".format(
                src=inSam, dst=outFile))
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                # Report the failure right here: no external program is
                # involved in this branch, so there is no program name to
                # put into the "non-zero exit status" message used below.
                errMsg = "Exited with error: " + str(e)
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.CMP:
            # `samtoh5 inSam refFile outFile -readType readType`
            logging.info("OutputService: Genearte the output CMP.H5 " +
                         "file using samtoh5.")
            prog = "samtoh5"
            cmd = "samtoh5 {samFile} {refFile} {outFile}".format(
                samFile=inSam, refFile=refFile, outFile=outFile)
            if readType is not None:
                cmd += " -readType {0} ".format(readType)
            if smrtTitle:
                cmd += " -smrtTitle "
            # Execute the command line
            logging.debug("OutputService: Call \"{0}\"".format(cmd))
            output, errCode, errMsg = backticks(cmd)
            # Only this branch runs an external program, so the exit
            # status check lives next to the call that produces it.
            if errCode != 0:
                errMsg = prog + " returned a non-zero exit status." + errMsg
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file")
            # Create {out}.xml, given {out}.bam
            outBam = str(outFile[0:-3]) + "bam"
            # FIXME This should really be more automatic
            if self.args.readType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            aln = self._output_dataset_type(real_ppath(outBam))
            for res in aln.externalResources:
                res.reference = refFile
            aln.write(outFile)

        return output, errCode, errMsg
예제 #9
0
    def _output(self, inSam, refFile, outFile, readType=None, smrtTitle=False):
        """Generate a SAM, BAM, CMP.H5 or XML output file.

        Input:
            inSam    : an input SAM/BAM file. (e.g. fileName.filteredSam)
            refFile  : the reference file. (e.g. fileName.targetFileName)
            outFile  : the output SAM/BAM, CMP.H5 or XML file.
                       (i.e. fileName.outputFileName)
            readType : standard or cDNA or CCS (can be None if not specified);
                       passed to samtoh5 for CMP.H5 output. Note that the XML
                       branch consults self.args.readType instead.
            smrtTitle: when true, " -smrtTitle " is added to the samtoh5
                       command (CMP.H5 output only).
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError: moving the SAM file failed, or samtoh5 returned a
                          non-zero exit status.
        """
        output, errCode, errMsg = "", 0, ""

        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.BAM:
            pass  # Nothing to be done
        elif outFormat == FILE_FORMATS.SAM:
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move {src} as {dst}".format(
                src=inSam, dst=outFile))
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                # Report the failure right here: no external program is
                # involved in this branch, so there is no program name to
                # put into the "non-zero exit status" message used below.
                errMsg = "Exited with error: " + str(e)
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.CMP:
            # `samtoh5 inSam refFile outFile -readType readType`
            logging.info("OutputService: Genearte the output CMP.H5 " +
                         "file using samtoh5.")
            prog = "samtoh5"
            cmd = "samtoh5 {samFile} {refFile} {outFile}".format(
                samFile=inSam, refFile=refFile, outFile=outFile)
            if readType is not None:
                cmd += " -readType {0} ".format(readType)
            if smrtTitle:
                cmd += " -smrtTitle "
            # Execute the command line
            logging.debug("OutputService: Call \"{0}\"".format(cmd))
            output, errCode, errMsg = backticks(cmd)
            # Only this branch runs an external program, so the exit
            # status check lives next to the call that produces it.
            if errCode != 0:
                errMsg = prog + " returned a non-zero exit status." + errMsg
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file")
            # Create {out}.xml, given {out}.bam
            outBam = str(outFile[0:-3]) + "bam"
            # FIXME This should really be more automatic
            if self.args.readType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            aln = self._output_dataset_type(real_ppath(outBam))
            for res in aln.externalResources:
                res.reference = refFile
            aln.write(outFile)

        return output, errCode, errMsg
예제 #10
0
    def run(self):
        """Align, filter and write the output.

        The main function, it is called by PBToolRunner.start().

        Returns:
            0 once every step has completed.
        """
        startTime = time.time()
        logging.info("pbalign version: %s", get_version())

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(
            self.args.algorithm, self.args, self.fileNames,
            self._tempFileManager)

        # Validate the arguments, then run the aligner.
        self._makeSane(self.args, self.fileNames)
        self._alnService.run()

        # The temporary filtered file handed to FilterService is BAM when
        # the final output is BAM or XML, and SAM otherwise.
        outFormat = getFileFormat(self.fileNames.outputFileName)
        wantsBam = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
        if wantsBam:
            suffix = ".bam"
        else:
            suffix = ".sam"
        filteredTmp = self._tempFileManager.RegisterNewTmpFile(suffix=suffix)
        self.fileNames.filteredSam = filteredTmp

        # Call filter service on SAM or BAM file.
        self._filterService = FilterService(
            self.fileNames.alignerSamOut, self.fileNames.targetFileName,
            self.fileNames.filteredSam, self.args.algorithm,
            self._alnService.scoreSign, self.args,
            self.fileNames.adapterGffFileName)
        self._filterService.run()

        # Sort/make index for BAM output before writing it.
        if wantsBam:
            bamPost = BamPostService(filenames=self.fileNames,
                                     nproc=self.args.nproc)
            bamPost.run()

        # Output all hits in SAM, BAM.
        self._output(
            inSam=self.fileNames.filteredSam,
            refFile=self.fileNames.targetFileName,
            outFile=self.fileNames.outputFileName,
            readType=self.args.readType)

        # _cleanUp() receives False only when args.keepTmpFiles exists
        # and is exactly True; it receives True in every other case.
        keepTmpFiles = (hasattr(self.args, "keepTmpFiles") and
                        self.args.keepTmpFiles is True)
        self._cleanUp(not keepTmpFiles)

        elapsed = float(time.time() - startTime)
        logging.info("Total time: {:.2f} s.".format(elapsed))
        return 0
예제 #11
0
    def run(self):
        """Align, filter and write the output.

        The main function, it is called by PBToolRunner.start().

        Returns:
            0 once every step has completed.
        """
        startTime = time.time()
        logging.info("pbalign version: %s", get_version())

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(
            self.args.algorithm, self.args, self.fileNames,
            self._tempFileManager)

        # Validate the arguments, then run the aligner.
        self._makeSane(self.args, self.fileNames)
        self._alnService.run()

        # The temporary filtered file handed to FilterService is BAM when
        # the final output is BAM or XML, and SAM otherwise.
        outFormat = getFileFormat(self.fileNames.outputFileName)
        wantsBam = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
        if wantsBam:
            suffix = ".bam"
        else:
            suffix = ".sam"
        filteredTmp = self._tempFileManager.RegisterNewTmpFile(suffix=suffix)
        self.fileNames.filteredSam = filteredTmp

        # Call filter service on SAM or BAM file.
        self._filterService = FilterService(
            self.fileNames.alignerSamOut, self.fileNames.targetFileName,
            self.fileNames.filteredSam, self.args.algorithm,
            self._alnService.scoreSign, self.args,
            self.fileNames.adapterGffFileName)
        self._filterService.run()

        # Sort/make index for BAM output before writing it.
        if wantsBam:
            bamPost = BamPostService(self.fileNames)
            bamPost.run()

        # Output all hits in SAM, BAM.
        self._output(inSam=self.fileNames.filteredSam,
                     refFile=self.fileNames.targetFileName,
                     outFile=self.fileNames.outputFileName,
                     readType=self.args.readType)

        # _cleanUp() receives False only when args.keepTmpFiles exists
        # and is exactly True; it receives True in every other case.
        keepTmpFiles = (hasattr(self.args, "keepTmpFiles") and
                        self.args.keepTmpFiles is True)
        self._cleanUp(not keepTmpFiles)

        elapsed = float(time.time() - startTime)
        logging.info("Total time: {:.2f} s.".format(elapsed))
        return 0
예제 #12
0
    def _output(self, inSam, refFile, outFile, readType=None, smrtTitle=False):
        """Generate a sam or a cmp.h5 file.

        Input:
            inSam    : an input SAM file. (e.g. fileName.filteredSam)
            refFile  : the reference file. (e.g. fileName.targetFileName)
            outFile  : the output SAM or CMP.H5 file.
                       (i.e. fileName.outputFileName)
            readType : standard or cDNA or CCS (can be None if not specified)
            smrtTitle: when true, " -smrtTitle " is added to the samtoh5
                       command (CMP.H5 output only).
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError: moving the SAM file failed, or samtoh5 returned a
                          non-zero exit status.
        """
        output, errCode, errMsg = "", 0, ""

        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.SAM:
            # `mv inSam outFile`
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move {src} as {dst}".format(
                src=inSam, dst=outFile))
            try:
                shutil.move(real_ppath(inSam), real_ppath(outFile))
            except shutil.Error as e:
                # Report the failure right here: no external program is
                # involved in this branch, so there is no program name to
                # put into the "non-zero exit status" message used below.
                errMsg = "Exited with error: " + str(e)
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.CMP:
            # `samtoh5 inSam refFile outFile -readType readType`
            logging.info("OutputService: Genearte the output CMP.H5 " +
                         "file using samtoh5.")
            prog = "samtoh5"
            cmd = "samtoh5 {samFile} {refFile} {outFile}".format(
                samFile=inSam, refFile=refFile, outFile=outFile)
            if readType is not None:
                cmd += " -readType {0} ".format(readType)
            if smrtTitle:
                cmd += " -smrtTitle "
            # Execute the command line
            logging.debug("OutputService: Call \"{0}\"".format(cmd))
            output, errCode, errMsg = backticks(cmd)
            # Only this branch runs an external program, so the exit
            # status check lives next to the call that produces it.
            if errCode != 0:
                errMsg = prog + " returned a non-zero exit status." + errMsg
                logging.error(errMsg)
                raise RuntimeError(errMsg)

        return output, errCode, errMsg
예제 #13
0
 def SetOutputFileName(self, outputFileName):
     """Record the output file name and, for BAM/XML output, its BAM files.

     Nothing happens when outputFileName is None or empty. Otherwise the
     result of checkOutputFile() is stored as self.outputFileName. When
     that name has the BAM or XML format, the BAM file name and the names
     of its ".bai" and ".pbi" index files are stored as well.
     """
     if outputFileName is None or outputFileName == "":
         return

     self.outputFileName = checkOutputFile(outputFileName)
     outFormat = getFileFormat(self.outputFileName)
     if outFormat not in [FILE_FORMATS.BAM, FILE_FORMATS.XML]:
         return

     # Replace the last three characters of the name with "bam".
     bamFileName = str(self.outputFileName[0:-3]) + "bam"
     self.outBamFileName = bamFileName
     self.outBaiFileName = bamFileName + ".bai"
     self.outPbiFileName = bamFileName + ".pbi"
예제 #14
0
 def SetOutputFileName(self, outputFileName):
     """Record the output file name and, for BAM/XML output, its BAM files.

     Nothing happens when outputFileName is None or empty. Otherwise the
     result of checkOutputFile() is stored as self.outputFileName. When
     that name has the BAM or XML format, the BAM file name and the names
     of its ".bai" and ".pbi" index files are stored as well.
     """
     if outputFileName is None or outputFileName == "":
         return

     self.outputFileName = checkOutputFile(outputFileName)
     outFormat = getFileFormat(self.outputFileName)
     if outFormat not in [FILE_FORMATS.BAM, FILE_FORMATS.XML]:
         return

     # Replace the last three characters of the name with "bam".
     bamFileName = str(self.outputFileName[0:-3]) + "bam"
     self.outBamFileName = bamFileName
     self.outBaiFileName = bamFileName + ".bai"
     self.outPbiFileName = bamFileName + ".pbi"
예제 #15
0
    def _output(self, inSam, refFile, outFile, readType=None):
        """Produce the final SAM, BAM or XML output.

        Input:
            inSam   : an input SAM/BAM file. (e.g. fileName.filteredSam)
            refFile : the reference file. (e.g. fileName.targetFileName)
            outFile : the output file (i.e. fileName.outputFileName)
            readType: standard or cDNA or CCS (can be None if not specified)
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError: moving the SAM file raised shutil.Error.
            IOError: the output format is CMP.H5.
        """
        output, errCode, errMsg = "", 0, ""

        # BAM output needs no work here, so it has no branch of its own.
        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.SAM:
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move %s as %s", inSam, outFile)
            source, destination = real_ppath(inSam), real_ppath(outFile)
            try:
                shutil.move(source, destination)
            except shutil.Error as e:
                errMsg = "Exited with error: " + str(e)
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.CMP:
            errMsg = "pbalign no longer supports CMP.H5 Output in 3.0."
            logging.error(errMsg)
            raise IOError(errMsg)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file %s %s",
                         inSam, outFile)
            # Create {out}.xml, given {out}.bam
            bamPath = str(outFile[0:-3]) + "bam"
            # FIXME This should really be more automatic
            if readType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            dataset = self._output_dataset_type(real_ppath(bamPath))
            for resource in dataset.externalResources:
                resource.reference = refFile
            dataset.write(outFile)

        return output, errCode, errMsg
예제 #16
0
    def _output(self, inSam, refFile, outFile, readType=None):
        """Produce the final SAM, BAM or XML output.

        Input:
            inSam   : an input SAM/BAM file. (e.g. fileName.filteredSam)
            refFile : the reference file. (e.g. fileName.targetFileName)
            outFile : the output file (i.e. fileName.outputFileName)
            readType: standard or cDNA or CCS (can be None if not specified)
        Output:
            output, errCode, errMsg
        Raises:
            RuntimeError: moving the SAM file raised shutil.Error.
            IOError: the output format is CMP.H5.
        """
        output, errCode, errMsg = "", 0, ""

        # BAM output needs no work here, so it has no branch of its own.
        outFormat = getFileFormat(outFile)

        if outFormat == FILE_FORMATS.SAM:
            logging.info("OutputService: Genearte the output SAM file.")
            logging.debug("OutputService: Move %s as %s", inSam, outFile)
            source, destination = real_ppath(inSam), real_ppath(outFile)
            try:
                shutil.move(source, destination)
            except shutil.Error as e:
                errMsg = "Exited with error: " + str(e)
                logging.error(errMsg)
                raise RuntimeError(errMsg)
        elif outFormat == FILE_FORMATS.CMP:
            errMsg = "pbalign no longer supports CMP.H5 Output in 3.0."
            logging.error(errMsg)
            raise IOError(errMsg)
        elif outFormat == FILE_FORMATS.XML:
            logging.info("OutputService: Generating the output XML file %s %s",
                         inSam, outFile)
            # Create {out}.xml, given {out}.bam
            bamPath = str(outFile[0:-3]) + "bam"
            # FIXME This should really be more automatic
            if readType == "CCS":
                self._output_dataset_type = ConsensusAlignmentSet
            dataset = self._output_dataset_type(real_ppath(bamPath))
            for resource in dataset.externalResources:
                resource.reference = refFile
            dataset.write(outFile)

        return output, errCode, errMsg
예제 #17
0
    def _makeSane(self, args, fileNames):
        """Check whether the input arguments make sense or not.

        Side effects:
            Sets args.readType to "CCS" when args.useccs is "useccsdenovo"
            or when the input file format is CCS. Sets args.loadQVs to True
            when args.forQuiver is set.

        Raises:
            ValueError: --forQuiver is combined with --useccs; pulse QVs are
                to be loaded while no pulse file is known and/or the output
                file is not in cmp.h5 format; or BAM/XML output is requested
                with an algorithm other than "blasr" or together with
                args.filterAdapterOnly.
        """
        if args.useccs == "useccsdenovo":
            args.readType = "CCS"

        if fileNames.inputFileFormat == FILE_FORMATS.CCS:
            args.readType = "CCS"

        if args.forQuiver:
            if args.useccs is not None:
                # Adjacent literals are joined by the parser; each fragment
                # ends with a space so that no two words run together.
                errMsg = ("Options --forQuiver and --useccs should not "
                          "be used together, since Quiver is not designed to "
                          "polish ccs reads. if you want to align ccs reads "
                          "in cmp.h5 format with pulse QVs loaded, use "
                          "--loadQVs with --useccs instead.")
                raise ValueError(errMsg)
            args.loadQVs = True

        outFormat = getFileFormat(fileNames.outputFileName)
        if args.loadQVs:
            # Collect every violated requirement so that a missing pulse
            # file is still reported when the output format is wrong too.
            problems = []
            if fileNames.pulseFileName is None:
                problems.append(
                    "The input file has to be in bas/pls/ccs.h5 "
                    "format, or --pulseFile needs to be specified, ")
            if outFormat != FILE_FORMATS.CMP:
                problems.append(
                    "The output file has to be in cmp.h5 format, ")
            if problems:
                errMsg = "".join(problems) + "in order to load pulse QVs."
                logging.error(errMsg)
                raise ValueError(errMsg)

        if outFormat == FILE_FORMATS.BAM or outFormat == FILE_FORMATS.XML:
            if args.algorithm != "blasr":
                errMsg = "Must choose blasr in order to output a bam file."
                raise ValueError(errMsg)
            if args.filterAdapterOnly:
                errMsg = "-filterAdapter does not work when out format is BAM."
                raise ValueError(errMsg)
예제 #18
0
    def _pls2fasta(self, inputFileName, regionTable, noSplitSubreads):
        """Convert the input file to FASTA by calling pls2fasta.

            Input:
                inputFileName  : the file to convert; returned unchanged
                                 when its format is already FASTA.
                regionTable    : a region table file; passed with
                                 -regionTable unless None or empty.
                noSplitSubreads: when true, -noSplitSubreads is passed.
            Output:
                a FASTA file which can be used as an input by an aligner.
            Raises:
                RuntimeError: pls2fasta returned a non-zero error code.
        """
        # A FASTA input needs no conversion.
        if getFileFormat(inputFileName) == FILE_FORMATS.FASTA:
            return inputFileName

        # The converted reads go to a temporary FASTA file.
        outFastaFile = self._tempFileManager.RegisterNewTmpFile(
            suffix=".fasta")

        # Assemble the command from its fragments, in order.
        cmdParts = ["pls2fasta {plsFile} {fastaFile} ".format(
            plsFile=inputFileName, fastaFile=outFastaFile)]
        if regionTable is not None and regionTable != "":
            cmdParts.append(" -regionTable {rt} ".format(rt=regionTable))
        if noSplitSubreads:
            cmdParts.append(" -noSplitSubreads ")
        cmdStr = "".join(cmdParts)

        convertNote = ": Convert {inFile} to FASTA format.".format(
            inFile=inputFileName)
        logging.info(self.name + convertNote)
        callNote = ": Call \"{cmd}\"".format(cmd=cmdStr)
        logging.debug(self.name + callNote)

        _output, errCode, errMsg = backticks(cmdStr)
        if errCode == 0:
            # The converted FASTA file can be used by an aligner.
            return outFastaFile

        errMsg += "Failed to convert {i} to {o}.".format(
            i=inputFileName, o=outFastaFile)
        logging.error(errMsg)
        raise RuntimeError(errMsg)
예제 #19
0
    def _pls2fasta(self, inputFileName, regionTable, noSplitSubreads):
        """Convert the input file to FASTA by calling pls2fasta.

            Input:
                inputFileName  : the file to convert; returned unchanged
                                 when its format is already FASTA.
                regionTable    : a region table file; passed with
                                 -regionTable unless None or empty.
                noSplitSubreads: when true, -noSplitSubreads is passed.
            Output:
                a FASTA file which can be used as an input by an aligner.
            Raises:
                RuntimeError: pls2fasta returned a non-zero error code.
        """
        # A FASTA input needs no conversion.
        if getFileFormat(inputFileName) == FILE_FORMATS.FASTA:
            return inputFileName

        # The converted reads go to a temporary FASTA file.
        outFastaFile = self._tempFileManager.RegisterNewTmpFile(
            suffix=".fasta")

        # Assemble the command from its fragments, in order.
        cmdParts = ["pls2fasta {plsFile} {fastaFile} ".format(
            plsFile=inputFileName, fastaFile=outFastaFile)]
        if regionTable is not None and regionTable != "":
            cmdParts.append(" -regionTable {rt} ".format(rt=regionTable))
        if noSplitSubreads:
            cmdParts.append(" -noSplitSubreads ")
        cmdStr = "".join(cmdParts)

        convertNote = ": Convert {inFile} to FASTA format.".format(
            inFile=inputFileName)
        logging.info(self.name + convertNote)
        callNote = ": Call \"{cmd}\"".format(cmd=cmdStr)
        logging.debug(self.name + callNote)

        _output, errCode, errMsg = backticks(cmdStr)
        if errCode == 0:
            # The converted FASTA file can be used by an aligner.
            return outFastaFile

        errMsg += "Failed to convert {i} to {o}.".format(
            i=inputFileName, o=outFastaFile)
        logging.error(errMsg)
        raise RuntimeError(errMsg)
예제 #20
0
    def _toCmd(self, options, fileNames, tempFileManager):
        """ Build the blasr command line from the options and file names.

            Input:
                options  : arguments parsed from the command-line, the
                           config file and --algorithmOptions.
                fileNames: an PBAlignFiles object.
                tempFileManager: temporary file manager (not used here).
            Output:
                a command-line string which can be used in bash.
        """
        def isGiven(value):
            """Return True unless value is None or the empty string."""
            return value is not None and value != ""

        def asPercent(value):
            """Scale values up to 1.0 by 100, then truncate to an int."""
            return int(value if value > 1.0 else value * 100)

        # Fragments are collected in order and concatenated at the end.
        parts = ["blasr {queryFile} {targetFile} --out {outFile} ".format(
            queryFile=fileNames.queryFileName,
            targetFile=fileNames.targetFileName,
            outFile=fileNames.alignerSamOut)]
        add = parts.append

        # The output flag follows the format of the aligner output file.
        if getFileFormat(fileNames.alignerSamOut) == FILE_FORMATS.BAM:
            add(" --bam ")
        else:
            add(" --sam ")

        if isGiven(fileNames.sawriterFileName):
            add(" --sa {sawriter} ".format(
                sawriter=fileNames.sawriterFileName))

        if isGiven(fileNames.regionTable):
            add(" --regionTable {regionTable} ".format(
                regionTable=fileNames.regionTable))

        if isGiven(options.maxHits):
            add(" --bestn {n}".format(n=options.maxHits))

        if isGiven(options.minAnchorSize):
            add(" --minMatch {0} ".format(options.minAnchorSize))

        if isGiven(options.nproc):
            add(" --nproc {0} ".format(options.nproc))

        # Specify filter criteria and hit policy.
        if options.minLength is not None:
            add(" --minSubreadLength {n} --minAlnLength {n} ".format(
                n=options.minLength))

        if options.maxDivergence is not None:
            divergencePct = asPercent(options.maxDivergence)
            add(" --minPctSimilarity {0}".format(100 - divergencePct))

        if options.minAccuracy is not None:
            add(" --minPctAccuracy {0}".format(
                asPercent(options.minAccuracy)))

        if options.scoreCutoff is not None:
            add(" --maxScore {0}".format(options.scoreCutoff))

        add(" --hitPolicy {0} ".format(options.hitPolicy))

        if options.noSplitSubreads:
            add(" --noSplitSubreads ")

        if options.concordant:
            add(" --concordant ")

        if options.seed is not None and options.seed != 0:
            add(" --randomSeed {0} ".format(options.seed))

        if isGiven(options.useccs):
            add(" --{0} ".format(options.useccs))

        # When input is a FASTA file, blasr --clipping = soft
        if fileNames.inputFileFormat == FILE_FORMATS.FASTA:
            add(" --clipping soft ")

        if options.algorithmOptions is not None:
            add(" {0} ".format(options.algorithmOptions))

        if options.unaligned is not None:
            add(" --unaligned {f} --noPrintUnalignedSeqs".format(
                f=options.unaligned))

        return "".join(parts)
예제 #21
0
    def _toCmd(self, options, fileNames, tempFileManager):
        """ Generate a command line for blasr based on options and
            PBAlignFiles, and return a command-line string which can
            be used in bash.
            Input:
                options  : arguments parsed from the command-line, the
                           config file and --algorithmOptions.
                fileNames: an PBAlignFiles object.
                tempFileManager: temporary file manager (not used by
                           this method).
            Output:
                a command-line string which can be used in bash.
        """
        def _asWholePercent(ratio):
            """Convert a fraction or a percentage to a truncated int percent.

            Inputs above 1.0 are already percentages and are only
            truncated; other inputs are fractions and are multiplied by 100.
            """
            if ratio > 1.0:
                return int(ratio)
            # Products such as 0.57 * 100 come out as 56.99999999999999 in
            # binary floating point; strip that noise before truncating so
            # the result is 57 and not 56.
            return int(round(ratio * 100, 6))

        # NOTE(review): file names are interpolated without shell quoting, so
        # paths containing spaces or shell metacharacters will break the
        # resulting command.
        cmdStr = "blasr {queryFile} {targetFile} --out {outFile} ".format(
            queryFile=fileNames.queryFileName,
            targetFile=fileNames.targetFileName,
            outFile=fileNames.alignerSamOut)

        # The output flag follows the format of the aligner output file.
        if getFileFormat(fileNames.alignerSamOut) == FILE_FORMATS.BAM:
            cmdStr += " --bam "
        else:
            cmdStr += " --sam "

        if ((fileNames.sawriterFileName is not None) and
                (fileNames.sawriterFileName != "")):
            cmdStr += " --sa {sawriter} ".format(
                sawriter=fileNames.sawriterFileName)

        if ((fileNames.regionTable != "") and
                (fileNames.regionTable is not None)):
            cmdStr += " --regionTable {regionTable} ".format(
                regionTable=fileNames.regionTable)

        if options.maxHits is not None and options.maxHits != "":
            cmdStr += " --bestn {n}".format(n=options.maxHits)

        if (options.minAnchorSize is not None and
                options.minAnchorSize != ""):
            cmdStr += " --minMatch {0} ".format(options.minAnchorSize)

        if options.nproc is not None and options.nproc != "":
            cmdStr += " --nproc {0} ".format(options.nproc)

        # Specify filter criteria and hit policy.
        if options.minLength is not None:
            # The same minimum applies to subread and alignment length.
            cmdStr += " --minSubreadLength {n} --minAlnLength {n} ".\
                    format(n=options.minLength)

        if options.maxDivergence is not None:
            # blasr takes a minimum similarity, i.e. 100 - max divergence.
            maxDivergence = _asWholePercent(options.maxDivergence)
            cmdStr += " --minPctSimilarity {0}".format(100 - maxDivergence)

        if options.minAccuracy is not None:
            minAccuracy = _asWholePercent(options.minAccuracy)
            cmdStr += " --minPctAccuracy {0}".format(minAccuracy)

        if options.scoreCutoff is not None:
            cmdStr += " --maxScore {0}".format(options.scoreCutoff)

        cmdStr += " --hitPolicy {0} ".format(options.hitPolicy)

        if options.noSplitSubreads:
            cmdStr += " --noSplitSubreads "

        if options.concordant:
            cmdStr += " --concordant "

        # A seed of 0 is treated the same as no seed: no flag is emitted.
        if options.seed is not None and options.seed != 0:
            cmdStr += " --randomSeed {0} ".format(options.seed)

        # options.useccs holds the name of the blasr flag itself.
        if options.useccs is not None and options.useccs != "":
            cmdStr += " --{0} ".format(options.useccs)

        # When input is a FASTA file, blasr -clipping = soft
        if fileNames.inputFileFormat == FILE_FORMATS.FASTA:
            cmdStr += " --clipping soft "

        # Free-form options are appended verbatim after the generated ones.
        if options.algorithmOptions is not None:
            cmdStr += " {0} ".format(options.algorithmOptions)

        if options.unaligned is not None:
            cmdStr += " --unaligned {f} --noPrintUnalignedSeqs".format(
                f=options.unaligned)

        return cmdStr
예제 #22
0
    def run(self):
        """
        The main function, it is called by PBToolRunner.start().

        Runs the pipeline in order: align, filter, (for BAM/XML output)
        BAM post-processing, write the output, and (for cmp.h5 output when
        requested) load QVs for Quiver.

        Returns 0 on success, or 1 as soon as any stage raises
        RuntimeError. Note that temporary files are only cleaned up on
        the success path.
        """
        startTime = time.time()
        logging.info(
            "pbalign version: {version}".format(version=get_version()))
        # FIXME
        #logging.debug("Original arguments: " + str(self._argumentList))

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(self.args.algorithm,
                                                    self.args, self.fileNames,
                                                    self._tempFileManager)

        # Make sane.
        self._makeSane(self.args, self.fileNames)

        # Run align service.
        try:
            self._alnService.run()
        except RuntimeError:
            return 1

        # Create a temporary filtered SAM/BAM file as output for FilterService.
        outFormat = getFileFormat(self.fileNames.outputFileName)
        bamLikeOutput = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
        suffix = ".bam" if bamLikeOutput else ".sam"
        self.fileNames.filteredSam = self._tempFileManager.\
            RegisterNewTmpFile(suffix=suffix)

        # Call filter service on SAM or BAM file.
        self._filterService = FilterService(
            self.fileNames.alignerSamOut,
            self.fileNames.targetFileName,
            self.fileNames.filteredSam,
            self.args.algorithm,
            self._alnService.scoreSign,
            self.args,
            self.fileNames.adapterGffFileName)
        try:
            self._filterService.run()
        except RuntimeError:
            return 1

        # Sort bam before output
        if bamLikeOutput:
            # Sort/make index for BAM output.
            try:
                BamPostService(self.fileNames).run()
            except RuntimeError:
                return 1

        # Output all hits in SAM, BAM or CMP.H5.
        # SMRT titles are used for every aligner except blasr, and for
        # blasr too when the input is FASTA.
        useSmrtTitle = (
            self.args.algorithm != "blasr"
            or self.fileNames.inputFileFormat == FILE_FORMATS.FASTA)
        try:
            self._output(inSam=self.fileNames.filteredSam,
                         refFile=self.fileNames.targetFileName,
                         outFile=self.fileNames.outputFileName,
                         readType=self.args.readType,
                         smrtTitle=useSmrtTitle)
        except RuntimeError:
            return 1

        # Load QVs to cmp.h5 for Quiver.
        # The parentheses matter: without them `and` binds tighter than `or`,
        # so --loadQVs alone would trigger this step for non-cmp.h5 output.
        if outFormat == FILE_FORMATS.CMP and \
                (self.args.forQuiver or self.args.loadQVs):
            # Call post service for quiver.
            try:
                ForQuiverService(self.fileNames, self.args).run()
            except RuntimeError:
                return 1

        # Delete temporary files unless the caller explicitly asked to keep
        # them (args.keepTmpFiles is exactly True).
        keepTmpFiles = getattr(self.args, "keepTmpFiles", False) is True
        self._cleanUp(not keepTmpFiles)

        endTime = time.time()
        logging.info("Total time: {:.2f} s.".format(float(endTime -
                                                          startTime)))
        return 0
예제 #23
0
    def run(self):
        """
        The main function, it is called by PBToolRunner.start().

        Stages, in order: run the align service, filter its SAM/BAM
        output into a temporary file, sort/index when the final output is
        BAM or XML, write the final output, and load QVs when the final
        output is cmp.h5 and QV loading was requested.

        Returns 0 when every stage succeeds and 1 as soon as a stage
        raises RuntimeError. Temporary files are removed only when the
        end of the method is reached.
        """
        startTime = time.time()
        logging.info("pbalign version: {version}".format(version=get_version()))
        # FIXME
        #logging.debug("Original arguments: " + str(self._argumentList))

        # Create an AlignService by algorithm name.
        self._alnService = self._createAlignService(self.args.algorithm,
                                                    self.args,
                                                    self.fileNames,
                                                    self._tempFileManager)

        # Make sane.
        self._makeSane(self.args, self.fileNames)

        # Run align service.
        try:
            self._alnService.run()
        except RuntimeError:
            return 1

        # Create a temporary filtered SAM/BAM file as output for FilterService.
        outFormat = getFileFormat(self.fileNames.outputFileName)
        wantsBam = outFormat in [FILE_FORMATS.BAM, FILE_FORMATS.XML]
        suffix = ".bam" if wantsBam else ".sam"
        self.fileNames.filteredSam = self._tempFileManager.\
            RegisterNewTmpFile(suffix=suffix)

        # Call filter service on SAM or BAM file.
        self._filterService = FilterService(self.fileNames.alignerSamOut,
                                            self.fileNames.targetFileName,
                                            self.fileNames.filteredSam,
                                            self.args.algorithm,
                                            self._alnService.scoreSign,
                                            self.args,
                                            self.fileNames.adapterGffFileName)
        try:
            self._filterService.run()
        except RuntimeError:
            return 1

        # Sort bam before output
        if wantsBam:
            # Sort/make index for BAM output.
            try:
                BamPostService(self.fileNames).run()
            except RuntimeError:
                return 1

        # Output all hits in SAM, BAM or CMP.H5.
        # SMRT titles are used unless the aligner is blasr running on
        # non-FASTA input.
        useSmrtTitle = (self.args.algorithm != "blasr" or
                        self.fileNames.inputFileFormat == FILE_FORMATS.FASTA)
        try:
            self._output(
                inSam=self.fileNames.filteredSam,
                refFile=self.fileNames.targetFileName,
                outFile=self.fileNames.outputFileName,
                readType=self.args.readType,
                smrtTitle=useSmrtTitle)
        except RuntimeError:
            return 1

        # Load QVs to cmp.h5 for Quiver.
        # Explicit grouping is required: `A and B or C` parses as
        # `(A and B) or C`, which would run this step for any output format
        # whenever loadQVs is set.
        if outFormat == FILE_FORMATS.CMP and \
                (self.args.forQuiver or self.args.loadQVs):
            # Call post service for quiver.
            try:
                ForQuiverService(self.fileNames, self.args).run()
            except RuntimeError:
                return 1

        # Delete temporary files unless args.keepTmpFiles is exactly True.
        keepTmpFiles = getattr(self.args, "keepTmpFiles", False) is True
        self._cleanUp(not keepTmpFiles)

        endTime = time.time()
        logging.info("Total time: {:.2f} s.".format(float(endTime - startTime)))
        return 0