def checkOutputFile( self, outputFile ):
    """Verify that an expected output file exists.

    Logs a confirmation when the file is present; otherwise logs the
    failure and raises IOError so the pipeline halts at this step.

    Args:
        outputFile: path of the file that is expected to exist.

    Raises:
        IOError: if outputFile does not exist.
    """
    if file_exists( outputFile ):
        log.info('Expected output "%s" found' % outputFile)
    else:
        msg = 'Expected output "%s" not found!' % outputFile
        # A missing expected output aborts the run, so record it at
        # ERROR level (it was previously logged at INFO).
        log.error( msg )
        raise IOError( msg )
def output_files_exist(self, output_file=None, output_list=None):
    """Report whether a step's expected outputs already exist.

    Used to decide whether a pipeline stage can be skipped.

    Args:
        output_file: a single expected output path, or None.
        output_list: an iterable of expected output paths, or None.

    Returns:
        True if the specified output(s) all exist, otherwise False.
        Also returns False when neither argument is supplied.
    """
    if output_file:
        if file_exists(output_file):
            log.info('Output files detected, skipping process...\n')
            return True
        log.info('Output files not found, running process...')
        return False
    if output_list:
        if all_files_exist(output_list):
            log.info('Output files detected, skipping process...\n')
            return True
        log.info('Output files not found, running process...')
        return False
    # Previously fell through and returned None implicitly when no
    # outputs were specified; make the falsy result an explicit bool.
    return False
def output_files_exist( self, outputFile=None, outputList=None ):
    """Report whether a step's expected outputs already exist.

    Used to decide whether a pipeline stage can be skipped.

    Args:
        outputFile: a single expected output path, or None.
        outputList: an iterable of expected output paths, or None.

    Returns:
        True if the specified output(s) all exist, otherwise False.
        Also returns False when neither argument is supplied.
    """
    if outputFile:
        if file_exists( outputFile ):
            log.info('Output files detected, skipping process...\n')
            return True
        log.info('Output files not found, running process...')
        return False
    if outputList:
        if all_files_exist( outputList ):
            log.info('Output files detected, skipping process...\n')
            return True
        log.info('Output files not found, running process...')
        return False
    # Previously fell through and returned None implicitly when no
    # outputs were specified; make the falsy result an explicit bool.
    return False
def run(self):
    """Run the single-pass clustering pipeline on self.sequenceFile.

    Normalizes the input to FASTA (extracting and quality-filtering
    CCS reads when needed), aligns and screens the sequences, removes
    chimeric reads, then clusters and writes consensus sequences.

    Raises:
        ValueError: if self.data_type is not one of
            'bash5', 'fastq', or 'fasta'.
    """
    # Normalize the input into a Fastq file (or directly a Fasta)
    if self.data_type == 'bash5':
        fastqFile = self.extract_raw_ccs(self.sequenceFile)
    elif self.data_type == 'fastq':
        fastqFile = self.sequenceFile
    elif self.data_type == 'fasta':
        fastqFile = None
        fastaFile = self.sequenceFile
    else:
        # Fail fast with a clear message; previously an unknown type
        # fell through and raised NameError on fastqFile below.
        raise ValueError('Unsupported data type "%s"' % self.data_type)
    # If we have a Fastq, filter low-quality reads and convert to FASTA
    if fastqFile:
        filteredFastq = self.filter_fastq(fastqFile)
        fastaFile, qualFile = self.separate_fastq(filteredFastq)
    # Align the Fasta sequences and remove partial reads
    alignedFile = self.align_sequences(fastaFile)
    summaryFile = self.summarize_sequences(alignedFile)
    maxStart, minEnd = self.parse_summary_file(summaryFile)
    screenedFile = self.screen_sequences(alignedFile,
                                         start=maxStart,
                                         end=minEnd)
    # Identify and remove chimeric reads
    chimera_ids = self.find_chimeras(screenedFile)
    self.cleanup_uchime_output(screenedFile)
    if file_exists(chimera_ids):
        no_chimera_file = self.remove_sequences(screenedFile, chimera_ids)
    else:
        no_chimera_file = screenedFile
    # Filter out un-used columns to speed up re-alignment and clustering
    filteredFile = self.filter_sequences(no_chimera_file, trump='.')
    uniqueFile, nameFile = self.unique_sequences(filteredFile)
    preclusteredFile, nameFile = self.precluster_sequences(
        uniqueFile, nameFile)
    fileForClustering = preclusteredFile
    # Cluster the sequences and emit one consensus per cluster
    distanceMatrix = self.calculate_distance_matrix(fileForClustering)
    listFile = self.cluster_sequences(distanceMatrix, nameFile)
    clusterListFile = self.separate_cluster_sequences(listFile, fastqFile)
    consensusFile = self.generate_consensus_sequences(clusterListFile)
    self.cleanup_consensus_folder(consensusFile)
    selectedFile = self.select_final_sequences(consensusFile)
    finalFile = self.output_final_sequences(selectedFile)
def run(self):
    """Run the single-pass clustering pipeline on self.sequenceFile.

    Normalizes the input to FASTA (extracting and quality-filtering
    CCS reads when needed), aligns and screens the sequences, removes
    chimeric reads, then clusters and writes consensus sequences.

    Raises:
        ValueError: if self.data_type is not one of
            'bash5', 'fastq', or 'fasta'.
    """
    # Normalize the input into a Fastq file (or directly a Fasta)
    if self.data_type == 'bash5':
        fastqFile = self.extract_raw_ccs( self.sequenceFile )
    elif self.data_type == 'fastq':
        fastqFile = self.sequenceFile
    elif self.data_type == 'fasta':
        fastqFile = None
        fastaFile = self.sequenceFile
    else:
        # Fail fast with a clear message; previously an unknown type
        # fell through and raised NameError on fastqFile below.
        raise ValueError('Unsupported data type "%s"' % self.data_type)
    # If we have a Fastq, filter low-quality reads and convert to FASTA
    if fastqFile:
        filteredFastq = self.filter_fastq( fastqFile )
        fastaFile, qualFile = self.separate_fastq( filteredFastq )
    # Align the Fasta sequences and remove partial reads
    alignedFile = self.align_sequences( fastaFile )
    summaryFile = self.summarize_sequences( alignedFile )
    maxStart, minEnd = self.parse_summary_file( summaryFile )
    screenedFile = self.screen_sequences(alignedFile,
                                         start=maxStart,
                                         end=minEnd)
    # Identify and remove chimeric reads
    chimera_ids = self.find_chimeras( screenedFile )
    self.cleanup_uchime_output( screenedFile )
    if file_exists( chimera_ids ):
        no_chimera_file = self.remove_sequences( screenedFile, chimera_ids )
    else:
        no_chimera_file = screenedFile
    # Filter out un-used columns to speed up re-alignment and clustering
    filteredFile = self.filter_sequences( no_chimera_file, trump='.' )
    uniqueFile, nameFile = self.unique_sequences( filteredFile )
    preclusteredFile, nameFile = self.precluster_sequences( uniqueFile,
                                                            nameFile )
    fileForClustering = preclusteredFile
    # Cluster the sequences and emit one consensus per cluster
    distanceMatrix = self.calculate_distance_matrix( fileForClustering )
    listFile = self.cluster_sequences( distanceMatrix, nameFile )
    clusterListFile = self.separate_cluster_sequences( listFile, fastqFile )
    consensusFile = self.generate_consensus_sequences( clusterListFile )
    self.cleanup_consensus_folder( consensusFile )
    selectedFile = self.select_final_sequences( consensusFile )
    finalFile = self.output_final_sequences( selectedFile )
def run(self):
    """Run the iterative clustering pipeline over self.step_list.

    Each iteration clusters at one distance cutoff; intermediate
    iterations keep every cluster (min size 1) and pick high-QV
    reference reads, while the final iteration (step == self.distance)
    applies self.min_cluster_size and, if self.enable_consensus is
    set, builds GCON consensus sequences. Final results are exposed
    via "Final_Output.fasta" / "Final_Output.names" symlinks.

    Raises:
        ValueError: if self.data_type is not one of
            'bash5', 'fastq', or 'fasta'.
    """
    # Debug print statement replaced with a proper log call
    log.debug("Clustering steps: %s" % self.step_list)
    # Normalize the input into a Fastq file (or directly a Fasta)
    if self.data_type == 'bash5':
        fastqFile = self.extract_raw_ccs(self.sequenceFile)
    elif self.data_type == 'fastq':
        fastqFile = self.sequenceFile
    elif self.data_type == 'fasta':
        fastqFile = None
        fastaFile = self.sequenceFile
    else:
        # Fail fast; previously an unknown type fell through and
        # raised NameError on fastqFile below.
        raise ValueError('Unsupported data type "%s"' % self.data_type)
    # If we have a Fastq, filter low-quality reads and convert to FASTA
    if fastqFile:
        filteredFastq = self.filter_fastq(fastqFile)
        fastaFile, qualFile = self.separate_fastq(filteredFastq)
    # Align the Fasta sequences and remove partial reads
    alignedFile = self.align_sequences(fastaFile)
    summaryFile = self.summarize_sequences(alignedFile)
    maxStart, minEnd = self.parse_summary_file(summaryFile)
    screenedFile = self.screen_sequences(alignedFile,
                                         start=maxStart,
                                         end=minEnd)
    # Identify and remove chimeric reads
    chimera_ids = self.find_chimeras(screenedFile)
    self.cleanup_uchime_output(screenedFile)
    if file_exists(chimera_ids):
        no_chimera_file = self.remove_sequences(screenedFile, chimera_ids)
    else:
        no_chimera_file = screenedFile
    filteredFile = self.filter_sequences(no_chimera_file, trump='.')
    uniqueFile, nameFile = self.unique_sequences(filteredFile)
    preclusteredFile, nameFile = self.precluster_sequences(
        uniqueFile, nameFile)
    fileToCluster = preclusteredFile
    clusterFileRoot = '.'.join(fileToCluster.split('.')[:-1])
    for i, step in enumerate(self.step_list):
        log.info("Beginning iteration #%s - %s" % (i + 1, step))
        iterationInput = clusterFileRoot + '.%s.fasta' % step
        shutil.copyfile(fileToCluster, iterationInput)
        distanceMatrix = self.calculate_distance_matrix(iterationInput)
        listFile = self.cluster_sequences(distanceMatrix, nameFile)
        # Include all clusters during intermediate stages, others use min_cluster_size
        if step == self.distance:
            clusterListFile = self.separate_cluster_sequences(
                listFile, fastqFile, step, self.min_cluster_size)
        else:
            clusterListFile = self.separate_cluster_sequences(
                listFile, fastqFile, step, 1)
        # Generate the consensus sequences for the next round
        if step == self.distance and self.enable_consensus:
            # If consensus is enabled and this is the last round, generate a GCON consensus
            log.info("Generating consensus sequences for iteration #%s - %s"
                     % (i + 1, step))
            consensusFile = self.generate_consensus_sequences(
                clusterListFile, step)
            self.cleanup_consensus_folder(consensusFile, step)
            selectedFile = self.select_sequences(consensusFile)
            selectedSequenceFile = self.output_selected_sequences(
                selectedFile)
        else:
            # Otherwise generate reference sequences by picking high-QV reads
            log.info("Selecting reference sequences for iteration #%s - %s"
                     % (i + 1, step))
            consensusFile = self.generate_ref_sequences(
                clusterListFile, step)
            selectedFile = self.select_ref_sequences(consensusFile)
            selectedSequenceFile = self.output_selected_sequences(
                selectedFile)
        # Whichever method was used, we need to update the nameFile accordingly
        nameFile = self.write_name_file(consensusFile, selectedFile,
                                        selectedSequenceFile)
        # If this isn't the last round, we must re-align and re-filter the new consensus sequences
        if step != self.distance:
            log.info("Iterative clustering not finished, preparing sequences for next iteration")
            alignedFile = self.align_sequences(selectedSequenceFile)
            fileToCluster = self.filter_sequences(alignedFile, trump='.')
        log.info("Finished iteration #%s - %s" % (i + 1, step))
    # Publish results; narrow the former bare "except: pass" so only
    # symlink failures (e.g. link already exists) are tolerated.
    try:
        os.symlink(selectedSequenceFile, "Final_Output.fasta")
    except OSError:
        pass
    try:
        os.symlink(nameFile, "Final_Output.names")
    except OSError:
        pass
def run(self):
    """Run the iterative clustering pipeline over self.step_list.

    Each iteration clusters at one distance cutoff; intermediate
    iterations keep every cluster (min size 1) and pick high-QV
    reference reads, while the final iteration (step == self.distance)
    applies self.min_cluster_size and, if self.enable_consensus is
    set, builds GCON consensus sequences. Final results are exposed
    via "Final_Output.fasta" / "Final_Output.names" symlinks.

    Raises:
        ValueError: if self.data_type is not one of
            'bash5', 'fastq', or 'fasta'.
    """
    # Debug print statement replaced with a proper log call
    log.debug("Clustering steps: %s" % self.step_list)
    # Normalize the input into a Fastq file (or directly a Fasta)
    if self.data_type == 'bash5':
        fastqFile = self.extract_raw_ccs( self.sequenceFile )
    elif self.data_type == 'fastq':
        fastqFile = self.sequenceFile
    elif self.data_type == 'fasta':
        fastqFile = None
        fastaFile = self.sequenceFile
    else:
        # Fail fast; previously an unknown type fell through and
        # raised NameError on fastqFile below.
        raise ValueError('Unsupported data type "%s"' % self.data_type)
    # If we have a Fastq, filter low-quality reads and convert to FASTA
    if fastqFile:
        filteredFastq = self.filter_fastq( fastqFile )
        fastaFile, qualFile = self.separate_fastq( filteredFastq )
    # Align the Fasta sequences and remove partial reads
    alignedFile = self.align_sequences( fastaFile )
    summaryFile = self.summarize_sequences( alignedFile )
    maxStart, minEnd = self.parse_summary_file( summaryFile )
    screenedFile = self.screen_sequences(alignedFile,
                                         start=maxStart,
                                         end=minEnd)
    # Identify and remove chimeric reads
    chimera_ids = self.find_chimeras( screenedFile )
    self.cleanup_uchime_output( screenedFile )
    if file_exists( chimera_ids ):
        no_chimera_file = self.remove_sequences( screenedFile, chimera_ids )
    else:
        no_chimera_file = screenedFile
    filteredFile = self.filter_sequences( no_chimera_file, trump='.' )
    uniqueFile, nameFile = self.unique_sequences( filteredFile )
    preclusteredFile, nameFile = self.precluster_sequences( uniqueFile,
                                                            nameFile )
    fileToCluster = preclusteredFile
    clusterFileRoot = '.'.join( fileToCluster.split('.')[:-1] )
    for i, step in enumerate( self.step_list ):
        log.info("Beginning iteration #%s - %s" % (i+1, step))
        iterationInput = clusterFileRoot + '.%s.fasta' % step
        shutil.copyfile( fileToCluster, iterationInput )
        distanceMatrix = self.calculate_distance_matrix( iterationInput )
        listFile = self.cluster_sequences( distanceMatrix, nameFile )
        # Include all clusters during intermediate stages, others use min_cluster_size
        if step == self.distance:
            clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                               step, self.min_cluster_size )
        else:
            clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                               step, 1 )
        # Generate the consensus sequences for the next round
        if step == self.distance and self.enable_consensus:
            # If consensus is enabled and this is the last round, generate a GCON consensus
            log.info("Generating consensus sequences for iteration #%s - %s" % (i+1, step))
            consensusFile = self.generate_consensus_sequences( clusterListFile, step )
            self.cleanup_consensus_folder( consensusFile, step )
            selectedFile = self.select_sequences( consensusFile )
            selectedSequenceFile = self.output_selected_sequences( selectedFile )
        else:
            # Otherwise generate reference sequences by picking high-QV reads
            log.info("Selecting reference sequences for iteration #%s - %s" % (i+1, step))
            consensusFile = self.generate_ref_sequences( clusterListFile, step )
            selectedFile = self.select_ref_sequences( consensusFile )
            selectedSequenceFile = self.output_selected_sequences( selectedFile )
        # Whichever method was used, we need to update the nameFile accordingly
        nameFile = self.write_name_file( consensusFile, selectedFile,
                                         selectedSequenceFile )
        # If this isn't the last round, we must re-align and re-filter the new consensus sequences
        if step != self.distance:
            log.info("Iterative clustering not finished, preparing sequences for next iteration")
            alignedFile = self.align_sequences( selectedSequenceFile )
            fileToCluster = self.filter_sequences( alignedFile, trump='.' )
        log.info("Finished iteration #%s - %s" % (i+1, step))
    # Publish results; narrow the former bare "except: pass" so only
    # symlink failures (e.g. link already exists) are tolerated.
    try:
        os.symlink( selectedSequenceFile, "Final_Output.fasta")
    except OSError:
        pass
    try:
        os.symlink( nameFile, "Final_Output.names")
    except OSError:
        pass
def run(self):
    """Run the two-round clustering pipeline (cutoffs 0.01 then 0.03).

    Each round clusters at one distance cutoff and builds consensus
    sequences; intermediate rounds keep every cluster (min size 1)
    and re-align/re-filter the consensus for the next round, while
    the final round (step == self.distance) applies
    self.min_cluster_size. The result is exposed via a
    "Final_Output.fasta" symlink.

    Raises:
        ValueError: if self.data_type is not one of
            'bash5', 'fastq', or 'fasta'.
    """
    # Debug print statement replaced with a proper log call
    log.debug("Clustering steps: %s" % self.step_list)
    # Normalize the input into a Fastq file (or directly a Fasta)
    if self.data_type == 'bash5':
        fastqFile = self.extract_raw_ccs( self.sequenceFile )
    elif self.data_type == 'fastq':
        fastqFile = self.sequenceFile
    elif self.data_type == 'fasta':
        fastqFile = None
        fastaFile = self.sequenceFile
    else:
        # Fail fast; previously an unknown type fell through and
        # raised NameError on fastqFile below.
        raise ValueError('Unsupported data type "%s"' % self.data_type)
    # If we have a Fastq, filter low-quality reads and convert to FASTA
    if fastqFile:
        filteredFastq = self.filter_fastq( fastqFile )
        fastaFile, qualFile = self.separate_fastq( filteredFastq )
    # Align the Fasta sequences and remove partial reads
    alignedFile = self.align_sequences( fastaFile )
    summaryFile = self.summarize_sequences( alignedFile )
    maxStart, minEnd = self.parse_summary_file( summaryFile )
    screenedFile = self.screen_sequences(alignedFile,
                                         start=maxStart,
                                         end=minEnd)
    # Identify and remove chimeric reads
    chimera_ids = self.find_chimeras( screenedFile )
    self.cleanup_uchime_output( screenedFile )
    if file_exists( chimera_ids ):
        no_chimera_file = self.remove_sequences( screenedFile, chimera_ids )
    else:
        no_chimera_file = screenedFile
    filteredFile = self.filter_sequences( no_chimera_file, trump='.' )
    uniqueFile, nameFile = self.unique_sequences( filteredFile )
    preclusteredFile, nameFile = self.precluster_sequences( uniqueFile,
                                                            nameFile )
    fileToCluster = preclusteredFile
    clusterFileRoot = '.'.join( fileToCluster.split('.')[:-1] )
    # NOTE(review): the cutoffs are hard-coded here even though
    # self.step_list exists -- confirm whether this was intentional.
    for i, step in enumerate([0.01, 0.03]):
        log.info("Beginning iteration #%s - %s" % (i+1, step))
        iterationInput = clusterFileRoot + '.%s.fasta' % step
        shutil.copyfile( fileToCluster, iterationInput )
        distanceMatrix = self.calculate_distance_matrix( iterationInput )
        listFile = self.cluster_sequences( distanceMatrix, nameFile )
        # Include all clusters during intermediate stages, others use min_cluster_size
        if step == self.distance:
            clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                               step, self.min_cluster_size )
        else:
            clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                               step, 1 )
        # Generate and combine cluster sequences from the cluster-specific files
        consensusFile = self.generate_consensus_sequences( clusterListFile, step )
        self.cleanup_consensus_folder( consensusFile, step )
        selectedFile = self.select_sequences( consensusFile )
        selectedSequenceFile = self.output_selected_sequences( selectedFile )
        log.info("Finished iteration #%s - %s" % (i+1, step))
        # If this isn't the last iteration, prepare the selected sequences for the next one:
        if step == self.distance:
            log.info("Iterative clustering finished")
        else:
            log.info("Iterative clustering not finished, preparing sequences for next iteration")
            alignedFile = self.align_sequences( selectedSequenceFile )
            fileToCluster = self.filter_sequences( alignedFile, trump='.' )
            nameFile = self.write_name_file( consensusFile, selectedFile,
                                             selectedSequenceFile )
    # Publish the result; narrow the former bare "except: pass" so only
    # symlink failures (e.g. link already exists) are tolerated.
    try:
        os.symlink( selectedSequenceFile, "Final_Output.fasta")
    except OSError:
        pass
    # NOTE(review): called with two arguments here but with three in the
    # loop above -- verify write_name_file's signature allows both.
    self.write_name_file( consensusFile, selectedFile )
def run(self):
    """Run the two-round clustering pipeline (cutoffs 0.01 then 0.03).

    Each round clusters at one distance cutoff and builds consensus
    sequences; intermediate rounds keep every cluster (min size 1)
    and re-align/re-filter the consensus for the next round, while
    the final round (step == self.distance) applies
    self.min_cluster_size. The result is exposed via a
    "Final_Output.fasta" symlink.

    Raises:
        ValueError: if self.data_type is not one of
            'bash5', 'fastq', or 'fasta'.
    """
    # Debug print statement replaced with a proper log call
    log.debug("Clustering steps: %s" % self.step_list)
    # Normalize the input into a Fastq file (or directly a Fasta)
    if self.data_type == 'bash5':
        fastqFile = self.extract_raw_ccs(self.sequenceFile)
    elif self.data_type == 'fastq':
        fastqFile = self.sequenceFile
    elif self.data_type == 'fasta':
        fastqFile = None
        fastaFile = self.sequenceFile
    else:
        # Fail fast; previously an unknown type fell through and
        # raised NameError on fastqFile below.
        raise ValueError('Unsupported data type "%s"' % self.data_type)
    # If we have a Fastq, filter low-quality reads and convert to FASTA
    if fastqFile:
        filteredFastq = self.filter_fastq(fastqFile)
        fastaFile, qualFile = self.separate_fastq(filteredFastq)
    # Align the Fasta sequences and remove partial reads
    alignedFile = self.align_sequences(fastaFile)
    summaryFile = self.summarize_sequences(alignedFile)
    maxStart, minEnd = self.parse_summary_file(summaryFile)
    screenedFile = self.screen_sequences(alignedFile,
                                         start=maxStart,
                                         end=minEnd)
    # Identify and remove chimeric reads
    chimera_ids = self.find_chimeras(screenedFile)
    self.cleanup_uchime_output(screenedFile)
    if file_exists(chimera_ids):
        no_chimera_file = self.remove_sequences(screenedFile, chimera_ids)
    else:
        no_chimera_file = screenedFile
    filteredFile = self.filter_sequences(no_chimera_file, trump='.')
    uniqueFile, nameFile = self.unique_sequences(filteredFile)
    preclusteredFile, nameFile = self.precluster_sequences(
        uniqueFile, nameFile)
    fileToCluster = preclusteredFile
    clusterFileRoot = '.'.join(fileToCluster.split('.')[:-1])
    # NOTE(review): the cutoffs are hard-coded here even though
    # self.step_list exists -- confirm whether this was intentional.
    for i, step in enumerate([0.01, 0.03]):
        log.info("Beginning iteration #%s - %s" % (i + 1, step))
        iterationInput = clusterFileRoot + '.%s.fasta' % step
        shutil.copyfile(fileToCluster, iterationInput)
        distanceMatrix = self.calculate_distance_matrix(iterationInput)
        listFile = self.cluster_sequences(distanceMatrix, nameFile)
        # Include all clusters during intermediate stages, others use min_cluster_size
        if step == self.distance:
            clusterListFile = self.separate_cluster_sequences(
                listFile, fastqFile, step, self.min_cluster_size)
        else:
            clusterListFile = self.separate_cluster_sequences(
                listFile, fastqFile, step, 1)
        # Generate and combine cluster sequences from the cluster-specific files
        consensusFile = self.generate_consensus_sequences(
            clusterListFile, step)
        self.cleanup_consensus_folder(consensusFile, step)
        selectedFile = self.select_sequences(consensusFile)
        selectedSequenceFile = self.output_selected_sequences(selectedFile)
        log.info("Finished iteration #%s - %s" % (i + 1, step))
        # If this isn't the last iteration, prepare the selected sequences for the next one:
        if step == self.distance:
            log.info("Iterative clustering finished")
        else:
            log.info("Iterative clustering not finished, preparing sequences for next iteration")
            alignedFile = self.align_sequences(selectedSequenceFile)
            fileToCluster = self.filter_sequences(alignedFile, trump='.')
            nameFile = self.write_name_file(consensusFile, selectedFile,
                                            selectedSequenceFile)
    # Publish the result; narrow the former bare "except: pass" so only
    # symlink failures (e.g. link already exists) are tolerated.
    try:
        os.symlink(selectedSequenceFile, "Final_Output.fasta")
    except OSError:
        pass
    # NOTE(review): called with two arguments here but with three in the
    # loop above -- verify write_name_file's signature allows both.
    self.write_name_file(consensusFile, selectedFile)
def run(self):
    """Run the flag-driven clustering pipeline on self.sequenceFile.

    Behavior is controlled by self.enable_masking (quality-mask the
    alignment instead of unique-ing/pre-clustering), then
    self.enable_clustering and self.enable_consensus for the final
    clustering and consensus stages.

    NOTE(review): nameFile is only assigned on the non-masking branch,
    so enable_masking + enable_clustering would hit a NameError at
    cluster_sequences() -- confirm these flags are mutually exclusive.
    NOTE(review): an unrecognized self.data_type falls through and
    raises NameError on fastqFile below -- consider an explicit guard.
    """
    # Normalize the input into a Fastq file (or directly a Fasta)
    if self.data_type == 'bash5':
        fastqFile = self.extract_raw_ccs(self.sequenceFile)
    elif self.data_type == 'fastq':
        fastqFile = self.sequenceFile
    elif self.data_type == 'fasta':
        fastqFile = None
        fastaFile = self.sequenceFile
    # If we have a Fastq, filter low-quality reads and convert to FASTA
    if fastqFile:
        filteredFastq = self.filter_fastq(fastqFile)
        fastaFile, qualFile = self.separate_fastq(filteredFastq)
    # Align the Fasta sequences and remove partial reads
    alignedFile = self.align_sequences(fastaFile)
    summaryFile = self.summarize_sequences(alignedFile)
    maxStart, minEnd = self.parse_summary_file(summaryFile)
    screenedFile = self.screen_sequences(alignedFile,
                                         start=maxStart,
                                         end=minEnd)
    #filteredFile = self.filter_sequences( screenedFile, trump='.' )
    filteredFile = self.filter_sequences(screenedFile)
    # If masking is enabled, create an aligned FASTQ, mask the
    # low-quality bases and remove over-masked reads
    if self.enable_masking:
        alignedFastqFile = self.add_quality_to_alignment(fastqFile,
                                                         filteredFile)
        maskedFastq = self.mask_fastq_sequences(alignedFastqFile)
        maskedFasta = self.convert_fastq_to_fasta(maskedFastq)
        screenedFile = self.screen_sequences(maskedFasta,
                                             min_length=self.min_length)
    # Otherwise if masking is disabled, we'll use unique-ify and
    # pre-cluster our sequences
    else:
        uniqueFile, nameFile = self.unique_sequences(filteredFile)
        screenedFile, nameFile = self.precluster_sequences(uniqueFile,
                                                           nameFile)
    # Identify and remove chimeric reads
    #chimera_ids = self.find_chimeras_denovo( screenedFile, nameFile )
    chimera_ids = self.find_chimeras(screenedFile)
    self.cleanup_uchime_output(screenedFile)
    if file_exists(chimera_ids):
        fileForClustering = self.remove_sequences(screenedFile,
                                                  chimera_ids)
    else:
        fileForClustering = screenedFile
    # If enabled, calculate sequence distances and cluster
    if self.enable_clustering:
        distanceMatrix = self.calculate_distance_matrix(fileForClustering)
        listFile = self.cluster_sequences(distanceMatrix, nameFile)
        # If enabled, generate a consensus for each cluster from above
        if self.enable_consensus:
            clusterListFile = self.separate_cluster_sequences(listFile,
                                                              fastqFile)
            consensusFile = self.generate_consensus_sequences(clusterListFile)
            self.cleanup_consensus_folder(consensusFile)
            selectedFile = self.select_final_sequences(consensusFile)
            # Result of the final output step; value is currently unused
            finalFile = self.output_final_sequences(selectedFile)