Example #1
 def initialize_output(self):
     # Create the Output directory
     create_directory(self.output_dir)
     # Create a symbolic link from the data file to the output dir
     baseName = os.path.basename(self.input_file)
     symlinkPath = os.path.join(self.output_dir, baseName)
     if not os.path.exists(symlinkPath):
         absPath = os.path.abspath(self.input_file)
         os.symlink(absPath, symlinkPath)
     self.sequenceFile = baseName
     # Move into the Output directory and create Log directory and files
     os.chdir(self.output_dir)
     create_directory('log')
     stdoutLog = os.path.join('log', 'mothur_stdout.log')
     stderrLog = os.path.join('log', 'mothur_stderr.log')
     self.log_file = os.path.join('log', 'rna_pipeline.log')
     # Instantiate the MothurRunner object
     self.factory = MothurRunner(self.mothur, self.nproc, stdoutLog,
                                 stderrLog)
Example #2
 def initialize_output(self):
     # Create the Output directory
     create_directory( self.output_dir )
     # Create a symbolic link from the data file to the output dir
     baseName = os.path.basename( self.input_file )
     symlinkPath = os.path.join( self.output_dir, baseName )
     if not os.path.exists( symlinkPath ):
         absPath = os.path.abspath( self.input_file )
         os.symlink( absPath, symlinkPath )
     self.sequenceFile = baseName
     # Move into the Output directory and create Log directory and files
     os.chdir( self.output_dir )
     create_directory( 'log' )
     stdoutLog = os.path.join('log', 'mothur_stdout.log')
     stderrLog = os.path.join('log', 'mothur_stderr.log')
     self.log_file = os.path.join('log', 'rna_pipeline.log')
     # Instantiate the MothurRunner object
     self.factory = MothurRunner( self.mothur, 
                                  self.nproc, 
                                  stdoutLog, 
                                  stderrLog)
Example #3
class rDnaPipeline( object ):
    """
    A tool for running a community analysis pipeline on PacBio data
    """

    def __init__(self):
        parse_args()
        self.__dict__.update( vars(args) )
        self.validate_settings()
        self.initialize_output()
        if self.debug:
            initialize_logger( self.log_file, logging.DEBUG )
        else:
            initialize_logger( self.log_file, logging.INFO )

    def validate_settings(self):
        # Validate the input file
        root, ext = split_root_from_ext( self.input_file )
        if ext in ['.bas.h5', '.fofn']:
            self.data_type = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.data_type = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            self.data_type = 'fasta'
            self.enable_masking = False
            self.enable_consensus = False
        else:
            raise TypeError('Sequence file must be a bas.h5, fastq, or '
                            'fasta file, or a fofn of multiple such files')
        # If Clustering was disabled, also disable the consensus process
        if not self.enable_clustering:
            self.enable_consensus = False
        # If Consensus is enabled, initialize the appropriate tool
        if self.enable_consensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')
        # Search for the Mothur executable and set the Mothur process counter
        self.mothur = validate_executable( self.mothur )
        self.processCount = 0

    def initialize_output(self):
        # Create the Output directory
        create_directory( self.output_dir )
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename( self.input_file )
        symlinkPath = os.path.join( self.output_dir, baseName )
        if not os.path.exists( symlinkPath ):
            absPath = os.path.abspath( self.input_file )
            os.symlink( absPath, symlinkPath )
        self.sequenceFile = baseName
        # Move into the Output directory and create Log directory and files
        os.chdir( self.output_dir )
        create_directory( 'log' )
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.log_file = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner( self.mothur, 
                                     self.nproc, 
                                     stdoutLog, 
                                     stderrLog)

    def getProcessLogFile(self, process, isMothurProcess=False):
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount, 
                                                         process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)
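    # Example: with processCount=3, getProcessLogFile('align.seqs', True)
    # returns 'log/process03.mothur.align.seqs.logfile' (POSIX path join).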

    def process_setup(self, inputFile, processName, suffix=None, suffixList=None):
        """ 
        Return a tuple containing the output file and a boolean flag describing
        whether the output file already exists
        """
        log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            outputFile = get_output_name(inputFile, suffix)
            return outputFile
        elif suffixList:
            outputFiles = []
            for suffix in suffixList:
                outputFile = get_output_name( inputFile, suffix )
                outputFiles.append( outputFile )
            return outputFiles

    def output_files_exist( self, outputFile=None, outputList=None ):
        if outputFile:
            if file_exists( outputFile ):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False
        elif outputList:
            if all_files_exist( outputList ):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False

    def checkOutputFile( self, outputFile ):
        if file_exists( outputFile ):
            log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            log.info( msg )
            raise IOError( msg )

    def process_cleanup(self, outputFile=None, outputList=None):
        """
        Log whether the process successfully created its output, and raise
        an error if not
        """
        if outputFile:
            self.checkOutputFile( outputFile )
        elif outputList:
            for outputFile in outputList:
                self.checkOutputFile( outputFile )
        log.info('All expected output files found - process successful!\n')

    def write_dummy_file(self, dummyFile):
        with open(dummyFile, 'w') as handle:
            handle.write('DONE')
        return dummyFile

    def extract_raw_ccs(self, inputFile):
        outputFile = self.process_setup( inputFile, 
                                         'extractCcsFromBasH5',
                                         suffix='fastq' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        elif file_has_ccs( inputFile ):
            extract_ccs(inputFile, outputFile, self.raw_data)
        else:
            msg = 'Raw data file has no CCS data!'
            log.error( msg )
            raise ValueError( msg )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def filter_fastq(self, fastqFile):
        outputFile = self.process_setup( fastqFile, 
                                         'FilterQuality',
                                         suffix='filter.fastq' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        quality_filter( fastqFile, outputFile )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def separate_fastq(self, fastqFile):
        outputList = self.process_setup( fastqFile, 
                                        'Fastq.Info', 
                                        suffixList=['fasta', 'qual'] )
        if self.output_files_exist( outputList=outputList ):
            return outputList
        mothurArgs = {'fastq':fastqFile, 'fasta':'T', 'qfile':'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.process_cleanup( outputList=outputList )
        return outputList

    def align_sequences(self, fastaFile):
        outputFile = self.process_setup( fastaFile, 
                                        'Align.Seqs', 
                                        suffix='align' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':fastaFile,
                      'reference':self.alignment_reference,
                      'flip':'t'}
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def screen_sequences(self, alignFile, start=None, end=None, min_length=None):
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        outputFile = self.process_setup( alignFile, 
                                         'Screen.Seqs', 
                                         suffix=outputExt )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'start':start,
                      'end':end,
                      'minlength':min_length}
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def summarize_sequences(self, fastaFile):
        outputFile = self.process_setup( fastaFile, 
                                        'Summary.Seqs', 
                                        suffix='summary' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def parse_summary_file(self, summaryFile):
        log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        log.info('Full-length start is NAST Alignment position %s' % start)
        log.info('Full-length end is NAST Alignment position %s' % end)
        log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        log.info('Maximum allowed start is NAST Alignment position %s' % maxStart)
        log.info('Minimum allowed end is NAST Alignment position %s\n' % minEnd)
        return maxStart, minEnd

    def find_chimeras(self, alignFile):
        outputFile = self.process_setup( alignFile, 
                                        'UCHIME', 
                                        suffix='uchime.accnos' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'reference':self.chimera_reference}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def remove_sequences(self, alignFile, idFile):
        outputFile = self.process_setup( alignFile, 
                                        'Remove.Seqs', 
                                        suffix='pick.align' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'accnos':idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile
  
    def filter_sequences(self, alignFile):
        outputFile = self.process_setup( alignFile, 
                                        'Filter.Seqs', 
                                        suffix='filter.fasta' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'vertical':'T'}
        logFile = self.getProcessLogFile( 'filter.seqs', True )
        self.factory.runJob( 'filter.seqs', mothurArgs, logFile )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def add_quality_to_alignment(self, fastqFile, alignFile):
        outputFile = self.process_setup( alignFile, 
                                        'QualityAligner', 
                                        suffix='fastq' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        aligner = QualityAligner( fastqFile, alignFile, outputFile )
        aligner.run()
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def mask_fastq_sequences(self, fastqFile):
        outputFile = self.process_setup( fastqFile, 
                                        'QualityMasker', 
                                        suffix='masked.fastq' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def unique_sequences( self, alignFile ):
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        outputList = self.process_setup( alignFile,
                                        'Unique.Seqs',
                                        suffixList=outputSuffixes )
        if self.output_files_exist( outputList=outputList ):
            return outputList
        mothurArgs = {'fasta':alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.process_cleanup( outputList=outputList )
        return outputList

    def precluster_sequences( self, alignFile, nameFile ):
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        outputList = self.process_setup( alignFile,
                                        'Pre.Cluster',
                                        suffixList=outputSuffixes )
        if self.output_files_exist( outputList=outputList ):
            return outputList
        mothurArgs = { 'fasta':alignFile,
                       'name': nameFile,
                       'diffs':self.precluster_diffs }
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.process_cleanup( outputList=outputList )
        return outputList

    def calculate_distance_matrix( self, alignFile ):
        outputFile = self.process_setup( alignFile, 
                                        'Dist.Seqs', 
                                        suffix='phylip.dist' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = { 'fasta':alignFile,
                       'calc':'onegap',
                       'countends':'F',
                       'output':'lt' }
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def cluster_sequences(self, distanceMatrix):
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        outputFile = self.process_setup( distanceMatrix, 
                                        'Cluster', 
                                        suffix=outputSuffix )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'phylip':distanceMatrix,
                      'method':self.clusteringMethod}
        logFile = self.getProcessLogFile( 'cluster', True )
        self.factory.runJob( 'cluster', mothurArgs, logFile )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def separate_cluster_sequences(self, listFile, sequenceFile):
        outputFile = self.process_setup( listFile, 
                                        'ClusterSeparator', 
                                        suffix='list.clusters')
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        separator = ClusterSeparator( listFile, 
                                      sequenceFile,
                                      outputFile,
                                      self.distance, 
                                      self.min_cluster_size )
        separator()
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def generate_consensus_sequences(self, clusterListFile):
        outputFile = self.process_setup( clusterListFile, 
                                        'ClusterResequencer', 
                                        suffix='consensus')
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        consensusFiles = []
        with open( clusterListFile ) as handle:
            for line in handle:
                sequenceFile, referenceFile, count = line.strip().split()
                if referenceFile.endswith('None'):
                    consensusFiles.append( (sequenceFile, 'None') )
                else:
                    consensus = self.consensusTool( sequenceFile, 
                                                    referenceFile )
                    consensusFiles.append( (referenceFile, consensus) )
        with open( outputFile, 'w' ) as handle:
            for filenamePair in consensusFiles:
                handle.write('%s\t%s\n' % filenamePair)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def cleanup_consensus_folder( self, consensusFile ):
        outputFile = self.process_setup( consensusFile, 
                                        'ConsensusCleanup', 
                                        suffix='consensus.cleanup' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        reseqPath = os.path.join( os.getcwd(), 'reseq' )
        for filename in os.listdir( reseqPath ):
            filePath = os.path.join( reseqPath, filename )
            if filePath.endswith('_input.fa'):
                os.remove( filePath )
            elif filePath.endswith('_input.fa.aln'):
                os.remove( filePath )
            elif filePath.endswith('_input.fa.aln_unsorted'):
                os.remove( filePath )
        self.write_dummy_file( outputFile )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def select_final_sequences( self, consensusFile ):
        outputFile = self.process_setup( consensusFile, 
                                        'SequenceSelector', 
                                        suffix='consensus.selected' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        selectedFiles = []
        with open( consensusFile ) as handle:
            for line in handle:
                referenceFile, consensusFile = line.strip().split()
                if consensusFile.endswith('None'):
                    pass
                elif fasta_count( consensusFile ) == 1:
                    selectedFiles.append( consensusFile )
                else:
                    selectedFiles.append( referenceFile )
        with open( outputFile, 'w' ) as handle:
            for filename in selectedFiles:
                handle.write(filename + '\n')
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def output_final_sequences( self, finalSequenceList ):
        outputFile = self.process_setup( finalSequenceList, 
                                        'SequenceWriter',
                                        suffix='fasta' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        with FastaWriter( outputFile ) as writer:
            with open( finalSequenceList ) as handle:
                for line in handle:
                    sequenceFile = line.strip()
                    copy_fasta_sequences( sequenceFile, writer )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def run(self):
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs( self.sequenceFile )
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile
        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq( fastqFile )
            fastaFile, qualFile = self.separate_fastq( filteredFastq )
        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences( fastaFile )
        summaryFile = self.summarize_sequences( alignedFile )
        maxStart, minEnd = self.parse_summary_file( summaryFile )
        screenedFile = self.screen_sequences(alignedFile, 
                                            start=maxStart,
                                            end=minEnd)
        # Identify and remove chimeric reads
        chimeraIds = self.find_chimeras( screenedFile )
        noChimeraFile = self.remove_sequences( screenedFile, chimeraIds )
        # Filter out un-used columns to speed up re-alignment and clustering
        filteredFile = self.filter_sequences( noChimeraFile )
        # If masking is enabled, create an aligned FASTQ, mask the 
        # low-quality bases and remove over-masked reads
        if self.enable_masking:
            alignedFastqFile = self.add_quality_to_alignment( fastqFile, filteredFile )
            maskedFastq = self.mask_fastq_sequences( alignedFastqFile )
            maskedFasta = self.convert_fastq_to_fasta( maskedFastq )
            screenedFasta = self.screen_sequences( maskedFasta,
                                                  min_length=self.min_length)
            fileForClustering = screenedFasta
        # Otherwise if masking is disabled, we'll use unique-ify and 
        #    pre-cluster our sequences
        else:
            uniqueFile, nameFile = self.unique_sequences( filteredFile )
            preclusteredFile, nameFile = self.precluster_sequences( uniqueFile, nameFile )
            fileForClustering = preclusteredFile
        # If enabled, calculate sequence distances and cluster
        if self.enable_clustering:
            distanceMatrix = self.calculate_distance_matrix( fileForClustering )
            listFile = self.cluster_sequences( distanceMatrix )
        # If enabled, generate a consensus for each cluster from above
        if self.enable_consensus:
            clusterListFile = self.separate_cluster_sequences( listFile, fastqFile )
            consensusFile = self.generate_consensus_sequences( clusterListFile )
            self.cleanup_consensus_folder( consensusFile )
            selectedFile = self.select_final_sequences( consensusFile )
            finalFile = self.output_final_sequences( selectedFile )
Example #4
class rDnaPipeline(object):
    """
    A tool for running a community analysis pipeline on PacBio data
    """
    def __init__(self):
        parse_args()
        self.__dict__.update(vars(args))
        self.validate_settings()
        self.initialize_output()
        initialize_logger(log, log_file=self.log_file, debug=self.debug)

    def validate_settings(self):
        # Validate the input file
        root, ext = split_root_from_ext(self.input_file)
        if ext in ['.bas.h5', '.fofn']:
            self.data_type = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.data_type = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            self.data_type = 'fasta'
        else:
            raise TypeError('Sequence file must be a bas.h5, fastq, or '
                            'fasta file, or a fofn of multiple such files')

        self.step_list = self.calculate_steps()

        if self.enable_consensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')

        # Search for the Mothur executable and set the Mothur process counter
        self.mothur = validate_executable(self.mothur)
        self.processCount = 0

    def initialize_output(self):
        # Create the Output directory
        create_directory(self.output_dir)
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename(self.input_file)
        symlinkPath = os.path.join(self.output_dir, baseName)
        if not os.path.exists(symlinkPath):
            absPath = os.path.abspath(self.input_file)
            os.symlink(absPath, symlinkPath)
        self.sequenceFile = baseName
        # Move into the Output directory and create Log directory and files
        os.chdir(self.output_dir)
        create_directory('log')
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.log_file = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner(self.mothur, self.nproc, stdoutLog,
                                    stderrLog)

    def calculate_steps(self):
        if self.enable_iteration:
            count = int(self.distance / self.step) - 1
            step_list = [i * self.step for i in range(1, count + 1)]
        else:
            step_list = []
        return step_list + [self.distance]
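
    # Worked example (hypothetical settings): with enable_iteration=True,
    # distance=4 and step=1, count = int(4 / 1) - 1 = 3, so step_list is
    # [1, 2, 3] and the method returns [1, 2, 3, 4]. Beware float settings:
    # 0.03 / 0.01 evaluates to ~2.9999999999999996, so int() truncates it
    # to 2 and distance=0.03, step=0.01 yields [0.01, 0.03] rather than
    # [0.01, 0.02, 0.03].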

    def getProcessLogFile(self, process, isMothurProcess=False):
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount,
                                                         process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)

    def process_setup(self,
                      inputFile,
                      processName,
                      suffix=None,
                      suffixList=None):
        """ 
        Return a tuple containing the output file and a boolean flag describing
        whether the output file already exists
        """
        log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            outputFile = get_output_name(inputFile, suffix)
            return outputFile
        elif suffixList:
            outputFiles = []
            for suffix in suffixList:
                outputFile = get_output_name(inputFile, suffix)
                outputFiles.append(outputFile)
            return outputFiles

    def output_files_exist(self, output_file=None, output_list=None):
        if output_file:
            if file_exists(output_file):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False
        elif output_list:
            if all_files_exist(output_list):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False

    def check_output_file(self, outputFile):
        if os.path.exists(outputFile):
            log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            log.error(msg)
            raise IOError(msg)

    def process_cleanup(self, output_file=None, output_list=None):
        """
        Log whether the process successfully created its output, and raise
        an error if not
        """
        if output_file:
            self.check_output_file(output_file)
        elif output_list:
            for output_file in output_list:
                self.check_output_file(output_file)
        log.info('All expected output files found - process successful!\n')

    def extract_raw_ccs(self, inputFile):
        outputFile = self.process_setup(inputFile,
                                        'extractCcsFromBasH5',
                                        suffix='fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        elif file_has_ccs(inputFile):
            extract_ccs(inputFile, outputFile, self.raw_data)
        else:
            msg = 'Raw data file has no CCS data!'
            log.error(msg)
            raise ValueError(msg)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_fastq(self, fastqFile):
        outputFile = self.process_setup(fastqFile,
                                        'FilterQuality',
                                        suffix='filter.fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        quality_filter(fastqFile, outputFile, min_accuracy=self.min_accuracy)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_fastq(self, fastqFile):
        outputList = self.process_setup(fastqFile,
                                        'Fastq.Info',
                                        suffixList=['fasta', 'qual'])
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fastq': fastqFile, 'fasta': 'T', 'qfile': 'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def align_sequences(self, fastaFile):
        outputFile = self.process_setup(fastaFile,
                                        'Align.Seqs',
                                        suffix='align')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': fastaFile,
            'reference': self.alignment_reference,
            'flip': 't'
        }
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def screen_sequences(self,
                         alignFile,
                         start=None,
                         end=None,
                         min_length=None):
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        outputFile = self.process_setup(alignFile,
                                        'Screen.Seqs',
                                        suffix=outputExt)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': alignFile,
            'start': start,
            'end': end,
            'minlength': min_length
        }
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def summarize_sequences(self, fastaFile):
        outputFile = self.process_setup(fastaFile,
                                        'Summary.Seqs',
                                        suffix='summary')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def parse_summary_file(self, summaryFile):
        log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        log.info('Full-length start is NAST Alignment position %s' % start)
        log.info('Full-length end is NAST Alignment position %s' % end)
        log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        log.info('Maximum allowed start is NAST Alignment position %s' %
                 maxStart)
        log.info('Minimum allowed end is NAST Alignment position %s\n' %
                 minEnd)
        return maxStart, minEnd

    def find_chimeras(self, alignFile):
        outputFile = self.process_setup(alignFile,
                                        'UCHIME',
                                        suffix='uchime.accnos')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'reference': self.chimera_reference}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def remove_sequences(self, alignFile, idFile):
        outputFile = self.process_setup(alignFile,
                                        'Remove.Seqs',
                                        suffix='pick.align')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'accnos': idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_sequences(self, alignFile, trump=None):
        outputFile = self.process_setup(alignFile,
                                        'Filter.Seqs',
                                        suffix='filter.fasta')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'vertical': 'T', 'trump': trump}
        logFile = self.getProcessLogFile('filter.seqs', True)
        self.factory.runJob('filter.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def add_quality_to_alignment(self, fastqFile, alignFile):
        outputFile = self.process_setup(alignFile,
                                        'QualityAligner',
                                        suffix='fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        aligner = QualityAligner(fastqFile, alignFile, outputFile)
        aligner.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def mask_fastq_sequences(self, fastqFile):
        outputFile = self.process_setup(fastqFile,
                                        'QualityMasker',
                                        suffix='masked.fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def unique_sequences(self, alignFile):
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        outputList = self.process_setup(alignFile,
                                        'Unique.Seqs',
                                        suffixList=outputSuffixes)
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fasta': alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def precluster_sequences(self, alignFile, nameFile):
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        outputList = self.process_setup(alignFile,
                                        'Pre.Cluster',
                                        suffixList=outputSuffixes)
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {
            'fasta': alignFile,
            'name': nameFile,
            'diffs': self.precluster_diffs
        }
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def calculate_distance_matrix(self, alignFile):
        outputFile = self.process_setup(alignFile,
                                        'Dist.Seqs',
                                        suffix='phylip.dist')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': alignFile,
            'calc': 'onegap',
            'countends': 'F',
            'output': 'lt'
        }
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cluster_sequences(self, distanceMatrix, nameFile):
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        outputFile = self.process_setup(distanceMatrix,
                                        'Cluster',
                                        suffix=outputSuffix)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'phylip': distanceMatrix,
            'name': nameFile,
            'method': self.clusteringMethod
        }
        logFile = self.getProcessLogFile('cluster', True)
        self.factory.runJob('cluster', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_cluster_sequences(self, listFile, sequenceFile, distance,
                                   min_cluster_size):
        outputFile = self.process_setup(listFile,
                                        'ClusterSeparator',
                                        suffix='clusters')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        outputDir = 'Dist_%s' % distance
        separator = ClusterSeparator(listFile, sequenceFile, outputFile,
                                     outputDir, distance, min_cluster_size)
        separator()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def generate_consensus_sequences(self, cluster_list_file, distance):
        output_file = self.process_setup(cluster_list_file,
                                         'ClusterResequencer',
                                         suffix='consensus')
        if self.output_files_exist(output_file=output_file):
            return output_file
        generate_consensus_files(cluster_list_file, self.consensusTool,
                                 output_file)
        self.process_cleanup(output_file=output_file)
        return output_file

    def generate_ref_sequences(self, cluster_list_file, distance):
        output_file = self.process_setup(cluster_list_file,
                                         'ClusterResequencer',
                                         suffix='consensus')
        if self.output_files_exist(output_file=output_file):
            return output_file
        generate_reference_files(cluster_list_file, output_file)
        self.process_cleanup(output_file=output_file)
        return output_file

    def cleanup_uchime_output(self, screenedFile):
        outputFile = self.process_setup(screenedFile,
                                        'UchimeCleanup',
                                        suffix='uchime.cleanup')
        uchimePath = os.getcwd()
        for filename in os.listdir(uchimePath):
            if filename.endswith('_formatted'):
                file_path = os.path.join(uchimePath, filename)
                os.remove(file_path)
        write_dummy_file(outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cleanup_consensus_folder(self, consensusFile, distance):
        outputFile = self.process_setup(consensusFile,
                                        'ConsensusCleanup',
                                        suffix='consensus.cleanup')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        reseqPath = os.path.join(os.getcwd(), 'Dist_%s' % distance)
        clean_consensus_outputs(reseqPath, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_sequences(self, consensusFile):
        outputFile = self.process_setup(consensusFile,
                                        'SequenceSelector',
                                        suffix='consensus.selected')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        select_consensus_files(consensusFile, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_ref_sequences(self, consensusFile):
        outputFile = self.process_setup(consensusFile,
                                        'SequenceSelector',
                                        suffix='consensus.selected')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        select_reference_files(consensusFile, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def output_selected_sequences(self, selectedSequences):
        outputFile = self.process_setup(selectedSequences,
                                        'SequenceWriter',
                                        suffix='fasta')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        copy_fasta_list(selectedSequences, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def write_name_file(self, consensusFile, selectedFile, outputRoot=None):
        if outputRoot is None:
            outputRoot = 'Final_Output.fasta'
        outputFile = self.process_setup(outputRoot,
                                        'CreateNameFile',
                                        suffix='names')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        create_name_file(consensusFile, selectedFile, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def run(self):
        log.info('Clustering steps: %s' % self.step_list)
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs(self.sequenceFile)
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile

        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq(fastqFile)
            fastaFile, qualFile = self.separate_fastq(filteredFastq)

        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences(fastaFile)
        summaryFile = self.summarize_sequences(alignedFile)
        maxStart, minEnd = self.parse_summary_file(summaryFile)
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)

        # Identify and remove chimeric reads
        chimera_ids = self.find_chimeras(screenedFile)
        self.cleanup_uchime_output(screenedFile)
        if file_exists(chimera_ids):
            no_chimera_file = self.remove_sequences(screenedFile, chimera_ids)
        else:
            no_chimera_file = screenedFile

        filteredFile = self.filter_sequences(no_chimera_file, trump='.')
        uniqueFile, nameFile = self.unique_sequences(filteredFile)
        preclusteredFile, nameFile = self.precluster_sequences(
            uniqueFile, nameFile)
        fileToCluster = preclusteredFile

        clusterFileRoot = '.'.join(fileToCluster.split('.')[:-1])
        for i, step in enumerate(self.step_list):
            log.info("Beginning iteration #%s - %s" % (i + 1, step))
            iterationInput = clusterFileRoot + '.%s.fasta' % step
            shutil.copyfile(fileToCluster, iterationInput)
            distanceMatrix = self.calculate_distance_matrix(iterationInput)
            listFile = self.cluster_sequences(distanceMatrix, nameFile)

            # Intermediate iterations keep all clusters (minimum size 1);
            # only the final iteration enforces min_cluster_size
            if step == self.distance:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, self.min_cluster_size)
            else:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, 1)

            # Generate the consensus sequences for the next round
            if step == self.distance and self.enable_consensus:
                # If consensus is enabled and this is the last round, generate a GCON consensus
                log.info(
                    "Generating consensus sequences for iteration #%s - %s" %
                    (i + 1, step))
                consensusFile = self.generate_consensus_sequences(
                    clusterListFile, step)
                self.cleanup_consensus_folder(consensusFile, step)
                selectedFile = self.select_sequences(consensusFile)
                selectedSequenceFile = self.output_selected_sequences(
                    selectedFile)
            else:
                # Otherwise generate reference sequences by picking high-QV reads
                log.info(
                    "Selecting reference sequences for iteration #%s - %s" %
                    (i + 1, step))
                consensusFile = self.generate_ref_sequences(
                    clusterListFile, step)
                selectedFile = self.select_ref_sequences(consensusFile)
                selectedSequenceFile = self.output_selected_sequences(
                    selectedFile)

            # Whichever method was used, we need to update the nameFile accordingly
            nameFile = self.write_name_file(consensusFile, selectedFile,
                                            selectedSequenceFile)

            # If this isn't the last round, we must re-align and re-filter the new consensus sequences
            if step != self.distance:
                log.info(
                    "Iterative clustering not finished, preparing sequences for next iteration"
                )
                alignedFile = self.align_sequences(selectedSequenceFile)
                fileToCluster = self.filter_sequences(alignedFile, trump='.')
            log.info("Finished iteration #%s - %s" % (i + 1, step))

        # Link the final outputs; ignore failures from pre-existing links
        try:
            os.symlink(selectedSequenceFile, "Final_Output.fasta")
        except OSError:
            pass

        try:
            os.symlink(nameFile, "Final_Output.names")
        except OSError:
            pass
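
# Minimal usage sketch (an assumption; the entry point is not shown in the
# original source). __init__ parses the command-line arguments itself via
# parse_args(), so the pipeline is constructed without arguments:
if __name__ == '__main__':
    pipeline = rDnaPipeline()
    pipeline.run()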
Example #5
class rDnaPipeline( object ):
    """
    A tool for running a community analysis pipeline on PacBio data
    """

    def __init__(self):
        parse_args()
        self.__dict__.update( vars(args) )
        self.validate_settings()
        self.initialize_output()
        initialize_logger( log, log_file=self.log_file, debug=self.debug )

    def validate_settings(self):
        # Validate the input file
        root, ext = split_root_from_ext( self.input_file )
        if ext in ['.bas.h5', '.fofn']:
            self.data_type = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.data_type = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            self.data_type = 'fasta'
        else:
            raise TypeError('Sequence file must be a bas.h5, fastq, or '
                            'fasta file, or a fofn of multiple such files')

        self.step_list = self.calculate_steps()

        if self.enable_consensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')

        # Search for the Mothur executable and set the Mothur process counter
        self.mothur = validate_executable( self.mothur )
        self.processCount = 0

    def initialize_output(self):
        # Create the Output directory
        create_directory( self.output_dir )
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename( self.input_file )
        symlinkPath = os.path.join( self.output_dir, baseName )
        if not os.path.exists( symlinkPath ):
            absPath = os.path.abspath( self.input_file )
            os.symlink( absPath, symlinkPath )
        self.sequenceFile = baseName
        # Move into the Output directory and create Log directory and files
        os.chdir( self.output_dir )
        create_directory( 'log' )
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.log_file = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner( self.mothur, 
                                     self.nproc, 
                                     stdoutLog, 
                                     stderrLog)

    def calculate_steps(self):
        if self.enable_iteration:
            count = int(self.distance / self.step) - 1
            step_list = [i * self.step for i in range(1, count+1)]
        else:
            step_list = []
        return step_list + [self.distance]

    def getProcessLogFile(self, process, isMothurProcess=False):
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount, 
                                                         process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)

    def process_setup(self, inputFile, processName, suffix=None, suffixList=None):
        """ 
        Return a tuple containing the output file and a boolean flag describing
        whether the output file already exists
        """
        log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            outputFile = get_output_name(inputFile, suffix)
            return outputFile
        elif suffixList:
            outputFiles = []
            for suffix in suffixList:
                outputFile = get_output_name( inputFile, suffix )
                outputFiles.append( outputFile )
            return outputFiles

    def output_files_exist(self, output_file=None, output_list=None):
        if output_file:
            if file_exists( output_file ):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False
        elif output_list:
            if all_files_exist( output_list ):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False

    def check_output_file( self, outputFile ):
        if os.path.exists( outputFile ):
            log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            log.error( msg )
            raise IOError( msg )

    def process_cleanup(self, output_file=None, output_list=None):
        """
        Log whether the process successfully created its output, and raise
        an error if not
        """
        if output_file:
            self.check_output_file( output_file )
        elif output_list:
            for output_file in output_list:
                self.check_output_file( output_file )
        log.info('All expected output files found - process successful!\n')

    def extract_raw_ccs(self, inputFile):
        outputFile = self.process_setup( inputFile, 
                                         'extractCcsFromBasH5',
                                         suffix='fastq' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        elif file_has_ccs( inputFile ):
            extract_ccs(inputFile, outputFile, self.raw_data)
        else:
            msg = 'Raw data file has no CCS data!'
            log.error( msg )
            raise ValueError( msg )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_fastq(self, fastqFile):
        outputFile = self.process_setup( fastqFile, 
                                         'FilterQuality',
                                         suffix='filter.fastq' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        quality_filter( fastqFile, outputFile, min_accuracy=self.min_accuracy )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_fastq(self, fastqFile):
        outputList = self.process_setup( fastqFile, 
                                        'Fastq.Info', 
                                        suffixList=['fasta', 'qual'] )
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fastq':fastqFile, 'fasta':'T', 'qfile':'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def align_sequences(self, fastaFile):
        outputFile = self.process_setup( fastaFile, 
                                        'Align.Seqs', 
                                        suffix='align' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':fastaFile,
                      'reference':self.alignment_reference,
                      'flip':'t'}
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def screen_sequences(self, alignFile, start=None, end=None, min_length=None):
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        outputFile = self.process_setup( alignFile, 
                                         'Screen.Seqs', 
                                         suffix=outputExt )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'start':start,
                      'end':end,
                      'minlength':min_length}
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def summarize_sequences(self, fastaFile):
        outputFile = self.process_setup( fastaFile, 
                                        'Summary.Seqs', 
                                        suffix='summary' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def parse_summary_file(self, summaryFile):
        log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        log.info('Full-length start is NAST Alignment position %s' % start)
        log.info('Full-length end is NAST Alignment position %s' % end)
        log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        log.info('Maximum allowed start is NAST Alignment position %s' % maxStart)
        log.info('Minimum allowed end is NAST Alignment position %s\n' % minEnd)
        return maxStart, minEnd

    def find_chimeras(self, alignFile):
        outputFile = self.process_setup( alignFile, 
                                        'UCHIME', 
                                        suffix='uchime.accnos' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'reference':self.chimera_reference}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def remove_sequences(self, alignFile, idFile):
        outputFile = self.process_setup( alignFile, 
                                        'Remove.Seqs', 
                                        suffix='pick.align' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'accnos':idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile
  
    def filter_sequences(self, alignFile, trump=None ):
        outputFile = self.process_setup( alignFile, 
                                        'Filter.Seqs', 
                                        suffix='filter.fasta' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile,
                      'vertical': 'T',
                      'trump': trump}
        logFile = self.getProcessLogFile( 'filter.seqs', True )
        self.factory.runJob( 'filter.seqs', mothurArgs, logFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def add_quality_to_alignment(self, fastqFile, alignFile):
        outputFile = self.process_setup( alignFile, 
                                        'QualityAligner', 
                                        suffix='fastq' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        aligner = QualityAligner( fastqFile, alignFile, outputFile )
        aligner.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def mask_fastq_sequences(self, fastqFile):
        outputFile = self.process_setup( fastqFile, 
                                        'QualityMasker', 
                                        suffix='masked.fastq' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def unique_sequences( self, alignFile ):
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        outputList = self.process_setup( alignFile,
                                        'Unique.Seqs',
                                        suffixList=outputSuffixes )
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fasta':alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def precluster_sequences( self, alignFile, nameFile ):
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        outputList = self.process_setup( alignFile,
                                        'Pre.Cluster',
                                        suffixList=outputSuffixes )
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = { 'fasta':alignFile,
                       'name': nameFile,
                       'diffs':self.precluster_diffs }
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def calculate_distance_matrix( self, alignFile ):
        outputFile = self.process_setup( alignFile,
                                        'Dist.Seqs', 
                                        suffix='phylip.dist')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = { 'fasta':alignFile,
                       'calc':'onegap',
                       'countends':'F',
                       'output':'lt' }
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cluster_sequences(self, distanceMatrix, nameFile ):
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        outputFile = self.process_setup( distanceMatrix,
                                        'Cluster', 
                                        suffix=outputSuffix )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'phylip':distanceMatrix,
                      'name':nameFile,
                      'method':self.clusteringMethod}
        logFile = self.getProcessLogFile( 'cluster', True )
        self.factory.runJob( 'cluster', mothurArgs, logFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_cluster_sequences(self, listFile, sequenceFile, distance, min_cluster_size):
        outputFile = self.process_setup( listFile,
                                        'ClusterSeparator', 
                                        suffix='clusters')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        outputDir = 'Dist_%s' % distance
        separator = ClusterSeparator( listFile,
                                      sequenceFile,
                                      outputFile,
                                      outputDir,
                                      distance,
                                      min_cluster_size )
        separator()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def generate_consensus_sequences(self, cluster_list_file, distance):
        output_file = self.process_setup( cluster_list_file,
                                        'ClusterResequencer', 
                                        suffix='consensus')
        if self.output_files_exist(output_file=output_file):
            return output_file
        generate_consensus_files( cluster_list_file, self.consensusTool, output_file )
        self.process_cleanup(output_file=output_file)
        return output_file

    def generate_ref_sequences(self, cluster_list_file, distance):
        output_file = self.process_setup( cluster_list_file,
                                        'ClusterResequencer',
                                        suffix='consensus')
        if self.output_files_exist(output_file=output_file):
            return output_file
        generate_reference_files( cluster_list_file, output_file )
        self.process_cleanup(output_file=output_file)
        return output_file

    def cleanup_uchime_output( self, screenedFile ):
        outputFile = self.process_setup( screenedFile,
                                         'UchimeCleanup',
                                         suffix='uchime.cleanup' )
        uchimePath = os.getcwd()
        for filename in os.listdir( uchimePath ):
            if filename.endswith('_formatted'):
                file_path = os.path.join( uchimePath, filename )
                os.remove( file_path )
        write_dummy_file( outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cleanup_consensus_folder( self, consensusFile, distance ):
        outputFile = self.process_setup( consensusFile, 
                                        'ConsensusCleanup', 
                                        suffix='consensus.cleanup' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        reseqPath = os.path.join( os.getcwd(), 'Dist_%s' % distance )
        clean_consensus_outputs( reseqPath, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_sequences( self, consensusFile ):
        outputFile = self.process_setup( consensusFile,
                                        'SequenceSelector', 
                                        suffix='consensus.selected' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        select_consensus_files( consensusFile, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_ref_sequences( self, consensusFile ):
        outputFile = self.process_setup( consensusFile,
                                        'SequenceSelector',
                                        suffix='consensus.selected' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        select_reference_files( consensusFile, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def output_selected_sequences( self, selectedSequences ):
        outputFile = self.process_setup( selectedSequences,
                                        'SequenceWriter',
                                        suffix='fasta' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        copy_fasta_list( selectedSequences, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def write_name_file(self, consensusFile, selectedFile, outputRoot=None ):
        if outputRoot is None:
            outputRoot = 'Final_Output.fasta'
        outputFile = self.process_setup( outputRoot,
                                        'CreateNameFile',
                                        suffix='names' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        create_name_file( consensusFile, selectedFile, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def run(self):
        log.info("Steps to run: %s" % self.step_list)
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs( self.sequenceFile )
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile

        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq( fastqFile )
            fastaFile, qualFile = self.separate_fastq( filteredFastq )

        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences( fastaFile )
        summaryFile = self.summarize_sequences( alignedFile )
        maxStart, minEnd = self.parse_summary_file( summaryFile )
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)

        # Identify and remove chimeric reads
        chimera_ids = self.find_chimeras( screenedFile )
        self.cleanup_uchime_output( screenedFile )
        if file_exists( chimera_ids ):
            no_chimera_file = self.remove_sequences( screenedFile, chimera_ids )
        else:
            no_chimera_file = screenedFile

        filteredFile = self.filter_sequences( no_chimera_file, trump='.' )
        uniqueFile, nameFile = self.unique_sequences( filteredFile )
        preclusteredFile, nameFile = self.precluster_sequences( uniqueFile, nameFile )
        fileToCluster = preclusteredFile

        clusterFileRoot = '.'.join( fileToCluster.split('.')[:-1] )
        for i, step in enumerate( self.step_list ):
            log.info("Beginning iteration #%s - %s" % (i+1, step))
            iterationInput = clusterFileRoot + '.%s.fasta' % step
            shutil.copyfile( fileToCluster, iterationInput )
            distanceMatrix = self.calculate_distance_matrix( iterationInput )
            listFile = self.cluster_sequences( distanceMatrix, nameFile )

            # Intermediate iterations keep all clusters (min size 1); only the final distance applies min_cluster_size
            if step == self.distance:
                clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                                   step, self.min_cluster_size )
            else:
                clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                                   step, 1 )

            # Generate the consensus sequences for the next round
            if step == self.distance and self.enable_consensus:
                # If consensus is enabled and this is the last round, generate a GCON consensus
                log.info("Generating consensus sequences for iteration #%s - %s" % (i+1, step))
                consensusFile = self.generate_consensus_sequences( clusterListFile, step )
                self.cleanup_consensus_folder( consensusFile, step )
                selectedFile = self.select_sequences( consensusFile )
                selectedSequenceFile = self.output_selected_sequences( selectedFile )
            else:
                # Otherwise generate reference sequences by picking high-QV reads
                log.info("Selecting reference sequences for iteration #%s - %s" % (i+1, step))
                consensusFile = self.generate_ref_sequences( clusterListFile, step )
                selectedFile = self.select_ref_sequences( consensusFile )
                selectedSequenceFile = self.output_selected_sequences( selectedFile )

            # Whichever method was used, we need to update the nameFile accordingly
            nameFile = self.write_name_file( consensusFile, selectedFile, selectedSequenceFile )

            # If this isn't the last round, we must re-align and re-filter the new consensus sequences
            if step != self.distance:
                log.info("Iterative clustering not finished, preparing sequences for next iteration")
                alignedFile = self.align_sequences( selectedSequenceFile )
                fileToCluster = self.filter_sequences( alignedFile, trump='.' )
            log.info("Finished iteration #%s - %s" % (i+1, step))

        try:
            os.symlink( selectedSequenceFile, "Final_Output.fasta")
        except OSError:
            # The symlink may already exist from a previous run
            pass

        try:
            os.symlink( nameFile, "Final_Output.names")
        except OSError:
            pass
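
A minimal driver sketch (not part of the original example): because __init__
parses its own command-line arguments via parse_args(), the pipeline can be
launched directly from a __main__ guard:

    if __name__ == '__main__':
        pipeline = rDnaPipeline()
        pipeline.run()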
Example #6
class rDnaPipeline( object ):
    """
    A tool for running a community analysis pipeline on PacBioData
    """

    ##########################
    # Initialization Methods #
    ##########################

    def __init__(self, sequenceFile=None):
        if sequenceFile is None:
            self.initializeFromArgs()
        else:
            self.initializeFromCall(sequenceFile)
        self.validateSettings()
        self.initializeOutput()
        self.initializeLogger()

    def initializeFromArgs(self):
        import argparse
        desc = 'A pipeline tool for analyzing rRNA amplicons'
        parser = argparse.ArgumentParser(description=desc)
        parser.add_argument('sequenceFile', metavar='FILE',
                            help="File of rRNA sequencing data to use")
        parser.add_argument('-a', '--minimum_accuracy', type=float, metavar='FLOAT',
                            dest='minAccuracy', default=MIN_ACCURACY,
                            help='Minimum predicted sequence accuracy')
        parser.add_argument('-d', '--distance', metavar='FLOAT', 
                            type=float, default=0.03,
                            help="Distance at which to cluster sequences")
        parser.add_argument('-n', '--num_processes', metavar='INT',
                            default=1, dest='numProc', type=int,
                            help="Number of processors to use")
        parser.add_argument('-f', '--fraction', metavar='FLOAT', 
                            type=float, default=DEFAULT_FRAC,
                            help='Fraction of full-length to require of each read')
        parser.add_argument('-c', '--clustering_method', metavar='METHOD',
                            dest='clusteringMethod', default=DEFAULT_METHOD,
                            choices=CLUSTER_METHODS,
                            help="Distance algorithm to use in clustering")
        parser.add_argument('-o', '--output', dest='outputDir', metavar='DIR',
                            default='rna_pipeline_run',
                            help="Specify the output folder")
        parser.add_argument('-q', '--minimum_qv', type=int, metavar='INT', 
                            dest='minQv', default=MIN_QV,
                            help='Minimum QV to allow after sequence masking')
        parser.add_argument('-l', '--minimum_length', type=int, metavar='INT', 
                            dest='minLength', default=MIN_LENGTH,
                            help='Minimum length sequence to allow after masking')
        parser.add_argument('--precluster_diffs', type=int, metavar='INT',
                            dest='preclusterDiffs', default=PRECLUSTER_DIFFS,
                            help='Maximum number of differences to allow in pre-clustering')
        parser.add_argument('-r', '--minimum_ratio', type=float, metavar='FLOAT',
                            dest='minRatio', default=MIN_RATIO,
                            help='Minimum ratio of retained bases to allow after masking')
        parser.add_argument('-A', '--alignment_reference', metavar='REF',
                            default='silva.both.align', dest='alignmentRef',
                            help="Reference MSA for aligning query sequences")
        parser.add_argument('-C', '--chimera_reference', metavar='REF',
                            default='silva.gold.align', dest='chimeraRef',
                            help="Reference MSA for Chimera detection")
        parser.add_argument('--enable_masking', action='store_true',
                            dest='enableMasking',
                            help="Turn off the low-quality Masking step")
        parser.add_argument('--disable_clustering', action='store_false',
                            dest='enableClustering',
                            help="Turn off the Clustering and Resequencing steps")
        parser.add_argument('--disable_consensus', action='store_false',
                            dest='enableConsensus',
                            help="Turn off the Consensus step")
        parser.add_argument('--blasr', metavar='BLASR_PATH', 
                            help="Specify the path to the Blasr executable")
        parser.add_argument('--mothur', metavar='MOTHUR_PATH', default='mothur',
                            help="Specify the path to the Mothur executable")
        parser.add_argument('--debug', action='store_true',
                            help="Turn on DEBUG message logging")
        args = parser.parse_args()
        self.__dict__.update( vars(args) )

    def validateSettings(self):
        # Validate the input file
        root, ext = self.splitRootFromExt( self.sequenceFile )
        if ext in ['.bas.h5', '.fofn']:
            self.dataType = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.dataType = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            self.dataType = 'fasta'
            self.enableMasking = False
            self.enableConsensus = False
        else:
            raise TypeError('Sequence file must be a bas.h5 file, a ' + \
                            'fasta file, or a fofn of multiple such files')
        # If Clustering was disabled, also disable the consensus process
        if not self.enableClustering:
            self.enableConsensus = False
        # If Consensus is enabled, initialize the appropriate tool
        if self.enableConsensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')
        # Searching for Mothur executable, and set the Mothur Process counter
        self.mothur = validateExecutable( self.mothur )
        self.processCount = 0
        # Validate numerical parameters
        validateInt( self.numProc, minValue=0 )
        validateFloat( self.distance, minValue=MIN_DIST, maxValue=MAX_DIST )

    def initializeOutput(self):
        # Create the Output directory
        createDirectory( self.outputDir )
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename( self.sequenceFile )
        symlinkPath = os.path.join( self.outputDir, baseName )
        if os.path.exists( symlinkPath ):
            pass
        else:
            absPath = os.path.abspath( self.sequenceFile )
            os.symlink( absPath, symlinkPath )
        self.sequenceFile = baseName
        # Move into the Output directory and create Log directory and files
        os.chdir( self.outputDir )
        createDirectory( 'log' )
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.logFile = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner( self.mothur, 
                                     self.numProc, 
                                     stdoutLog, 
                                     stderrLog)

    def initializeLogger(self):
        dateFormat = "%Y-%m-%d %I:%M:%S"
        self.log = logging.getLogger()
        if self.debug:
            self.log.setLevel( logging.DEBUG )
        else:
            self.log.setLevel( logging.INFO )
        # Initialize the LogHandler for the master log file
        logHandler = logging.FileHandler( self.logFile )
        lineFormat = "%(asctime)s %(levelname)s %(processName)s " + \
                     "%(funcName)s %(lineno)d %(message)s"
        logFormatter = logging.Formatter(fmt=lineFormat, datefmt=dateFormat)
        logHandler.setFormatter( logFormatter )
        self.log.addHandler( logHandler )
        # Initialize a LogHandler for STDOUT
        outHandler = logging.StreamHandler( stream=sys.stdout )
        outLineFormat = "%(asctime)s %(message)s"
        outFormatter = logging.Formatter(fmt=outLineFormat, datefmt=dateFormat)
        outHandler.setFormatter( outFormatter )
        self.log.addHandler( outHandler )
        # Record the initialization of the pipeline
        self.log.info("INFO logger initialized")
        self.log.debug("DEBUG logger initialized")
        self.log.info("Initializing RnaPipeline v%s" % __version__)
        self.log.debug("Using the following parameters:")
        for param, value in self.__dict__.iteritems():
            self.log.debug("\t%s = %s" % (param, value))
        self.log.info("Initialization of RnaPipeline completed\n")

    def getProcessLogFile(self, process, isMothurProcess=False):
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount, 
                                                         process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)

    def processSetup(self, inputFile, processName, suffix=None, suffixList=None):
        """ 
        Return a tuple containing the output file and a boolean flag describing
        whether the output file already exists
        """
        self.log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            outputFile = self.predictOutputFile(inputFile, suffix)
            return outputFile
        elif suffixList:
            outputFiles = []
            for suffix in suffixList:
                outputFile = self.predictOutputFile( inputFile, suffix )
                outputFiles.append( outputFile )
            return outputFiles

    def outputFilesExist( self, outputFile=None, outputList=None ):
        if outputFile:
            if fileExists( outputFile ):
                self.log.info('Output files detected, skipping process...\n')
                return True
            else:
                self.log.info('Output files not found, running process...')
                return False
        elif outputList:
            if allFilesExist( outputList ):
                self.log.info('Output files detected, skipping process...\n')
                return True
            else:
                self.log.info('Output files not found, running process...')
                return False

    def checkOutputFile( self, outputFile ):
        if fileExists( outputFile ):
            self.log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            self.log.error( msg )
            raise IOError( msg )

    def processCleanup(self, outputFile=None, outputList=None):
        """
        Log whether the process successfully created its output, and raise an
        error if not
        """
        if outputFile:
            self.checkOutputFile( outputFile )
        elif outputList:
            for outputFile in outputList:
                self.checkOutputFile( outputFile )
        self.log.info('All expected output files found - process successful!\n')

    def writeDummyFile(self, dummyFile):
        with open(dummyFile, 'w') as handle:
            handle.write('DONE')
        return dummyFile

    def extractCcsFromBasH5(self, inputFile):
        outputFile = self.processSetup( inputFile, 
                                        'extractCcsFromBasH5', 
                                        suffix='fastq' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        extractor = BasH5Extractor( inputFile, outputFile )
        extractor.outputCcsFastq()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def filterFastqFile(self, fastqFile):
        outputFile = self.processSetup( fastqFile, 
                                        'FilterQuality', 
                                        suffix='filter.fastq' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        # Pass the minimum-accuracy threshold parsed above as --minimum_accuracy
        qualityFilter = QualityFilter( fastqFile, outputFile, self.minAccuracy )
        qualityFilter()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def separateFastqFile(self, fastqFile):
        outputList = self.processSetup( fastqFile, 
                                        'Fastq.Info', 
                                        suffixList=['fasta', 'qual'] )
        if self.outputFilesExist( outputList=outputList ):
            return outputList
        mothurArgs = {'fastq':fastqFile, 'fasta':'T', 'qfile':'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.processCleanup( outputList=outputList )
        return outputList

    def alignSequences(self, fastaFile):
        outputFile = self.processSetup( fastaFile, 
                                        'Align.Seqs', 
                                        suffix='align' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':fastaFile,
                      'reference':self.alignmentRef,
                      'flip':'t'}
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def screenSequences(self, alignFile, start=None, end=None, minLength=None):
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        outputFile = self.processSetup( alignFile, 
                                         'Screen.Seqs', 
                                         suffix=outputExt )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'start':start,
                      'end':end,
                      'minlength':minLength}
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def summarizeSequences(self, fastaFile):
        outputFile = self.processSetup( fastaFile, 
                                        'Summary.Seqs', 
                                        suffix='summary' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def parseSummaryFile(self, summaryFile):
        self.log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        self.log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        self.log.info('Full-length start is NAST Alignment position %s' % start)
        self.log.info('Full-length end is NAST Alignment position %s' % end)
        self.log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        self.log.info('Maximum allowed start is NAST Alignment position %s' % maxStart)
        self.log.info('Minimum allowed end is NAST Alignment position %s\n' % minEnd)
        return maxStart, minEnd

    def findChimeras(self, alignFile):
        outputFile = self.processSetup( alignFile, 
                                        'UCHIME', 
                                        suffix='uchime.accnos' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'reference':self.chimeraRef}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def removeSequences(self, alignFile, idFile):
        outputFile = self.processSetup( alignFile, 
                                        'Remove.Seqs', 
                                        suffix='pick.align' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'accnos':idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile
  
    def filterSequences(self, alignFile):
        outputFile = self.processSetup( alignFile, 
                                        'Filter.Seqs', 
                                        suffix='filter.fasta' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'vertical':'T'}
        logFile = self.getProcessLogFile( 'filter.seqs', True )
        self.factory.runJob( 'filter.seqs', mothurArgs, logFile )
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def addQualityToAlignment(self, fastqFile, alignFile):
        outputFile = self.processSetup( alignFile, 
                                        'QualityAligner', 
                                        suffix='fastq' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        aligner = QualityAligner( fastqFile, alignFile, outputFile )
        aligner.run()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def maskFastqSequences(self, fastqFile):
        outputFile = self.processSetup( fastqFile, 
                                        'QualityMasker', 
                                        suffix='masked.fastq' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def uniqueSequences( self, alignFile ):
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        outputList = self.processSetup( alignFile,
                                        'Unique.Seqs',
                                        suffixList=outputSuffixes )
        if self.outputFilesExist( outputList=outputList ):
            return outputList
        mothurArgs = {'fasta':alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.processCleanup( outputList=outputList )
        return outputList

    def preclusterSequences( self, alignFile, nameFile ):
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        outputList = self.processSetup( alignFile,
                                        'Pre.Cluster',
                                        suffixList=outputSuffixes )
        if self.outputFilesExist( outputList=outputList ):
            return outputList
        mothurArgs = { 'fasta':alignFile,
                       'name': nameFile,
                       'diffs':self.preclusterDiffs }
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.processCleanup( outputList=outputList )
        return outputList

    def calculateDistanceMatrix( self, alignFile ):
        outputFile = self.processSetup( alignFile, 
                                        'Dist.Seqs', 
                                        suffix='phylip.dist' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = { 'fasta':alignFile,
                       'calc':'nogaps',
                       'output':'lt' }
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def clusterSequences(self, distanceMatrix):
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        outputFile = self.processSetup( distanceMatrix, 
                                        'Cluster', 
                                        suffix=outputSuffix )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'phylip':distanceMatrix,
                      'method':self.clusteringMethod}
        logFile = self.getProcessLogFile( 'cluster', True )
        self.factory.runJob( 'cluster', mothurArgs, logFile )
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def separateClusterSequences(self, listFile, sequenceFile):
        outputFile = self.processSetup( listFile, 
                                        'ClusterSeparator', 
                                        suffix='list.clusters')
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        separator = ClusterSeparator( listFile, 
                                      sequenceFile, 
                                      self.distance, 
                                      outputFile )
        separator()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def generateConsensusSequences(self, clusterListFile):
        outputFile = self.processSetup( clusterListFile, 
                                        'ClusterResequencer', 
                                        suffix='consensus')
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        consensusFiles = []
        with open( clusterListFile ) as handle:
            for line in handle:
                sequenceFile, referenceFile = line.strip().split()
                if referenceFile.endswith('None'):
                    consensusFiles.append( (sequenceFile, 'None') )
                else:
                    consensus = self.consensusTool( sequenceFile, referenceFile )
                    consensusFiles.append( (referenceFile, consensus) )
        with open( outputFile, 'w' ) as handle:
            for filenamePair in consensusFiles:
                handle.write('%s\t%s\n' % filenamePair)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def cleanupConsensusFolder( self, consensusFile ):
        outputFile = self.processSetup( consensusFile, 
                                        'ConsensusCleanup', 
                                        suffix='consensus.cleanup' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        reseqPath = os.path.join( os.getcwd(), 'reseq' )
        for filename in os.listdir( reseqPath ):
            filePath = os.path.join( reseqPath, filename )
            if filePath.endswith('_input.fa'):
                os.remove( filePath )
            elif filePath.endswith('_input.fa.aln'):
                os.remove( filePath )
            elif filePath.endswith('_input.fa.aln_unsorted'):
                os.remove( filePath )
        self.writeDummyFile( outputFile )
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def selectFinalSequences( self, consensusFile ):
        outputFile = self.processSetup( consensusFile, 
                                        'SequenceSelector', 
                                        suffix='consensus.selected' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        selectedFiles = []
        with open( consensusFile ) as handle:
            for line in handle:
                referenceFile, consensusFile = line.strip().split()
                if consensusFile.endswith('None'):
                    selectedFiles.append( referenceFile )
                elif fasta_count( consensusFile ) == 1:
                    selectedFiles.append( consensusFile )
                else:
                    selectedFiles.append( referenceFile )
        with open( outputFile, 'w' ) as handle:
            for filename in selectedFiles:
                handle.write(filename + '\n')
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def __call__(self):
        if self.dataType == 'bash5':
            fastqFile = self.extractCcsFromBasH5( self.sequenceFile )
        elif self.dataType == 'fastq':
            fastqFile = self.sequenceFile
        elif self.dataType == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile
        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filterFastqFile( fastqFile )
            fastaFile, qualFile = self.separateFastqFile( filteredFastq )
Example #7
class rDnaPipeline(object):
    """
    A tool for running a community analysis pipeline on PacBioData
    """
    def __init__(self):
        parse_args()
        self.__dict__.update(vars(args))
        self.validate_settings()
        self.initialize_output()
        initialize_logger(log, log_file=self.log_file, debug=self.debug)

    def validate_settings(self):
        # Validate the input file
        root, ext = split_root_from_ext(self.input_file)
        if ext in ['.bas.h5', '.fofn']:
            self.data_type = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.data_type = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            self.data_type = 'fasta'
            self.enable_masking = False
            self.enable_consensus = False
        else:
            raise TypeError('Sequence file must be a bas.h5 file, a ' + \
                            'fasta file, or a fofn of multiple such files')
        # If Clustering was disabled, also disable the consensus process
        if not self.enable_clustering:
            self.enable_consensus = False
        # If Consensus is enabled, initialize the appropriate tool
        if self.enable_consensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')
        # Searching for Mothur executable, and set the Mothur Process counter
        self.mothur = validate_executable(self.mothur)
        self.processCount = 0

    def initialize_output(self):
        # Create the Output directory
        create_directory(self.output_dir)
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename(self.input_file)
        symlinkPath = os.path.join(self.output_dir, baseName)
        if os.path.exists(symlinkPath):
            pass
        else:
            absPath = os.path.abspath(self.input_file)
            os.symlink(absPath, symlinkPath)
        self.sequenceFile = baseName
        # Move into the Output directory and create Log directory and files
        os.chdir(self.output_dir)
        create_directory('log')
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.log_file = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner(self.mothur, self.nproc, stdoutLog,
                                    stderrLog)

    def getProcessLogFile(self, process, isMothurProcess=False):
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount,
                                                         process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)

    def process_setup(self,
                      inputFile,
                      processName,
                      suffix=None,
                      suffixList=None):
        """ 
        Return a tuple containing the output file and a boolean flag describing
        whether the output file already exists
        """
        log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            outputFile = get_output_name(inputFile, suffix)
            return outputFile
        elif suffixList:
            outputFiles = []
            for suffix in suffixList:
                outputFile = get_output_name(inputFile, suffix)
                outputFiles.append(outputFile)
            return outputFiles
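
The get_output_name helper is not shown in this example. A plausible minimal
sketch, assuming it swaps the input's final extension for the given suffix
(so 'reads.fastq' with suffix 'filter.fastq' becomes 'reads.filter.fastq',
matching Mothur's file-naming convention):

    import os

    def get_output_name(input_file, suffix):
        # Hypothetical helper: drop the final extension, append the new suffix
        root = '.'.join(os.path.basename(input_file).split('.')[:-1])
        return '%s.%s' % (root, suffix)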

    def output_files_exist(self, output_file=None, output_list=None):
        if output_file:
            if file_exists(output_file):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False
        elif output_list:
            if all_files_exist(output_list):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False

    def check_output_file(self, outputFile):
        if os.path.exists(outputFile):
            log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            log.error(msg)
            raise IOError(msg)

    def process_cleanup(self, output_file=None, output_list=None):
        """
        Log whether the process successfully created its output, and raise an
        error if not
        """
        if output_file:
            self.check_output_file(output_file)
        elif output_list:
            for output_file in output_list:
                self.check_output_file(output_file)
        log.info('All expected output files found - process successful!\n')

    def write_dummy_file(self, dummyFile):
        with open(dummyFile, 'w') as handle:
            handle.write('DONE')
        return dummyFile

    def extract_raw_ccs(self, inputFile):
        outputFile = self.process_setup(inputFile,
                                        'extractCcsFromBasH5',
                                        suffix='fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        elif file_has_ccs(inputFile):
            extract_ccs(inputFile, outputFile, self.raw_data)
        else:
            msg = 'Raw data file has no CCS data!'
            log.error(msg)
            raise ValueError(msg)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_fastq(self, fastqFile):
        outputFile = self.process_setup(fastqFile,
                                        'FilterQuality',
                                        suffix='filter.fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        quality_filter(fastqFile, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_fastq(self, fastqFile):
        outputList = self.process_setup(fastqFile,
                                        'Fastq.Info',
                                        suffixList=['fasta', 'qual'])
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fastq': fastqFile, 'fasta': 'T', 'qfile': 'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def align_sequences(self, fastaFile):
        outputFile = self.process_setup(fastaFile,
                                        'Align.Seqs',
                                        suffix='align')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': fastaFile,
            'reference': self.alignment_reference,
            'flip': 't'
        }
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def screen_sequences(self,
                         alignFile,
                         start=None,
                         end=None,
                         min_length=None):
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        outputFile = self.process_setup(alignFile,
                                        'Screen.Seqs',
                                        suffix=outputExt)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': alignFile,
            'start': start,
            'end': end,
            'minlength': min_length
        }
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def summarize_sequences(self, fastaFile):
        outputFile = self.process_setup(fastaFile,
                                        'Summary.Seqs',
                                        suffix='summary')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def parse_summary_file(self, summaryFile):
        log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        log.info('Full-length start is NAST Alignment position %s' % start)
        log.info('Full-length end is NAST Alignment position %s' % end)
        log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        log.info('Maximum allowed start is NAST Alignment position %s' %
                 maxStart)
        log.info('Minimum allowed end is NAST Alignment position %s\n' %
                 minEnd)
        return maxStart, minEnd

    def find_chimeras(self, alignFile):
        outputFile = self.process_setup(alignFile,
                                        'UCHIME',
                                        suffix='uchime.accnos')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'reference': self.chimera_reference}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def find_chimeras_denovo(self, alignFile, nameFile):
        outputFile = self.process_setup(alignFile,
                                        'UCHIME',
                                        suffix='uchime.accnos')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'name': nameFile}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def remove_sequences(self, alignFile, idFile):
        inputSuffix = alignFile.split('.')[-1]
        outputSuffix = 'pick.%s' % inputSuffix
        outputFile = self.process_setup(alignFile,
                                        'Remove.Seqs',
                                        suffix=outputSuffix)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'accnos': idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_sequences(self, alignFile, trump=None):
        outputFile = self.process_setup(alignFile,
                                        'Filter.Seqs',
                                        suffix='filter.fasta')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'vertical': 'T', 'trump': trump}
        logFile = self.getProcessLogFile('filter.seqs', True)
        self.factory.runJob('filter.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def add_quality_to_alignment(self, fastqFile, alignFile):
        outputFile = self.process_setup(alignFile,
                                        'QualityAligner',
                                        suffix='fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        aligner = QualityAligner(fastqFile, alignFile, outputFile)
        aligner.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def mask_fastq_sequences(self, fastqFile):
        outputFile = self.process_setup(fastqFile,
                                        'QualityMasker',
                                        suffix='masked.fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def unique_sequences(self, alignFile):
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        outputList = self.process_setup(alignFile,
                                        'Unique.Seqs',
                                        suffixList=outputSuffixes)
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fasta': alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def precluster_sequences(self, alignFile, nameFile):
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        else:
            raise ValueError('Input file must end in ".align" or ".fasta"')
        outputList = self.process_setup(alignFile,
                                        'Pre.Cluster',
                                        suffixList=outputSuffixes)
        if self.output_files_exist(output_list=outputList):
            return outputList
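        # pre.cluster merges rare sequences into more abundant neighbors
        # within "diffs" mismatches, pruning likely sequencing errors before
        # the more expensive distance and clustering steps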
        mothurArgs = {
            'fasta': alignFile,
            'name': nameFile,
            'diffs': self.precluster_diffs
        }
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def calculate_distance_matrix(self, alignFile):
        outputFile = self.process_setup(alignFile,
                                        'Dist.Seqs',
                                        suffix='phylip.dist')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
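        # dist.seqs computes pairwise distances; calc=onegap counts a run of
        # gaps as a single difference, countends=F ignores terminal gaps, and
        # output=lt writes a PHYLIP-formatted lower-triangle matrix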
        mothurArgs = {
            'fasta': alignFile,
            'calc': 'onegap',
            'countends': 'F',
            'output': 'lt'
        }
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cluster_sequences(self, distanceMatrix, nameFile):
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        else:
            raise ValueError('Unrecognized clustering method "%s"'
                             % self.clusteringMethod)
        outputFile = self.process_setup(distanceMatrix,
                                        'Cluster',
                                        suffix=outputSuffix)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
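        # mothur's cluster command bins sequences into OTUs from the distance
        # matrix; the list-file suffix reflects the method (nn=nearest,
        # an=average, fn=furthest neighbor)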
        mothurArgs = {
            'phylip': distanceMatrix,
            'name': nameFile,
            'method': self.clusteringMethod
        }
        logFile = self.getProcessLogFile('cluster', True)
        self.factory.runJob('cluster', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_cluster_sequences(self, listFile, sequenceFile):
        outputFile = self.process_setup(listFile,
                                        'ClusterSeparator',
                                        suffix='list.clusters')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        separator = ClusterSeparator(listFile, sequenceFile, outputFile,
                                     self.distance, self.min_cluster_size)
        separator()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def generate_consensus_sequences(self, clusterListFile):
        outputFile = self.process_setup(clusterListFile,
                                        'ClusterResequencer',
                                        suffix='consensus')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        consensusFiles = []
        with open(clusterListFile) as handle:
            for line in handle:
                # Each line: <cluster sequence file> <reference file> <count>
                sequenceFile, referenceFile, count = line.strip().split()
                if referenceFile.endswith('None'):
                    consensusFiles.append((sequenceFile, 'None'))
                else:
                    consensus = self.consensusTool(sequenceFile, referenceFile)
                    consensusFiles.append((referenceFile, consensus))
        with open(outputFile, 'w') as handle:
            for filenamePair in consensusFiles:
                handle.write('%s\t%s\n' % filenamePair)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cleanup_uchime_output(self, screenedFile):
        outputFile = self.process_setup(screenedFile,
                                        'UchimeCleanup',
                                        suffix='uchime.cleanup')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        # Remove intermediate "_formatted" files left behind by UCHIME
        uchimePath = os.getcwd()
        for filename in os.listdir(uchimePath):
            if filename.endswith('_formatted'):
                file_path = os.path.join(uchimePath, filename)
                os.remove(file_path)
        self.write_dummy_file(outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cleanup_consensus_folder(self, consensusFile):
        outputFile = self.process_setup(consensusFile,
                                        'ConsensusCleanup',
                                        suffix='consensus.cleanup')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        reseqPath = os.path.join(os.getcwd(), 'reseq')
        # Remove the per-cluster intermediate files left by resequencing
        for filename in os.listdir(reseqPath):
            filePath = os.path.join(reseqPath, filename)
            if filePath.endswith(('_input.fa', '_input.fa.aln',
                                  '_input.fa.aln_unsorted')):
                os.remove(filePath)
        self.write_dummy_file(outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_final_sequences(self, consensusFile):
        outputFile = self.process_setup(consensusFile,
                                        'SequenceSelector',
                                        suffix='consensus.selected')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        selectedFiles = []
        with open(consensusFile) as handle:
            for line in handle:
                # Local names chosen so the consensusFile argument above is
                # not shadowed inside the loop
                referenceFasta, consensusFasta = line.strip().split()
                if consensusFasta.endswith('None'):
                    continue
                # Keep the consensus when resequencing produced a single
                # contig; otherwise fall back to the original reference
                elif fasta_count(consensusFasta) == 1:
                    selectedFiles.append(consensusFasta)
                else:
                    selectedFiles.append(referenceFasta)
        with open(outputFile, 'w') as handle:
            for filename in selectedFiles:
                handle.write(filename + '\n')
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def output_final_sequences(self, finalSequenceList):
        outputFile = self.process_setup(finalSequenceList,
                                        'SequenceWriter',
                                        suffix='fasta')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        with FastaWriter(outputFile) as writer:
            with open(finalSequenceList) as handle:
                for line in handle:
                    sequenceFile = line.strip()
                    copy_fasta_sequences(sequenceFile, writer)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def run(self):
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs(self.sequenceFile)
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile

        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq(fastqFile)
            fastaFile, qualFile = self.separate_fastq(filteredFastq)

        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences(fastaFile)
        summaryFile = self.summarize_sequences(alignedFile)
        maxStart, minEnd = self.parse_summary_file(summaryFile)
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)
        #filteredFile = self.filter_sequences(screenedFile, trump='.')
        filteredFile = self.filter_sequences(screenedFile)

        # If masking is enabled, create an aligned FASTQ, mask the
        # low-quality bases and remove over-masked reads
        if self.enable_masking:
            alignedFastqFile = self.add_quality_to_alignment(
                fastqFile, filteredFile)
            maskedFastq = self.mask_fastq_sequences(alignedFastqFile)
            maskedFasta = self.convert_fastq_to_fasta(maskedFastq)
            screenedFile = self.screen_sequences(maskedFasta,
                                                 min_length=self.min_length)
            # The masking path produces no name file; set it to None so the
            # clustering step below has something to pass (None arguments are
            # assumed to be dropped, like the optional "trump" above)
            nameFile = None
        # Otherwise, if masking is disabled, unique-ify and pre-cluster
        # the sequences
        else:
            uniqueFile, nameFile = self.unique_sequences(filteredFile)
            screenedFile, nameFile = self.precluster_sequences(
                uniqueFile, nameFile)

        # Identify and remove chimeric reads
        #chimera_ids = self.find_chimeras_denovo(screenedFile, nameFile)
        chimera_ids = self.find_chimeras(screenedFile)

        self.cleanup_uchime_output(screenedFile)
        if file_exists(chimera_ids):
            fileForClustering = self.remove_sequences(screenedFile,
                                                      chimera_ids)
        else:
            fileForClustering = screenedFile

        # If enabled, calculate sequence distances and cluster
        if self.enable_clustering:
            distanceMatrix = self.calculate_distance_matrix(fileForClustering)
            listFile = self.cluster_sequences(distanceMatrix, nameFile)

        # If enabled, generate a consensus for each cluster from above
        if self.enable_consensus:
            clusterListFile = self.separate_cluster_sequences(
                listFile, fastqFile)
            consensusFile = self.generate_consensus_sequences(clusterListFile)
            self.cleanup_consensus_folder(consensusFile)
            selectedFile = self.select_final_sequences(consensusFile)
            finalFile = self.output_final_sequences(selectedFile)
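

# A minimal usage sketch, not part of the original listing: assuming the
# module is executed as a script, the constructor already parses the command
# line itself, so launching the pipeline is a one-liner
if __name__ == '__main__':
    rDnaPipeline().run()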