def do_resequencing(self, data_file, amp_analysis_records, min_length, whitelist_file):
    """Resequence the supplied Amplicon Analysis records in a single pass.

    The records are written to ``reference.fasta`` inside the
    ``amp_analysis_resequencing`` output folder and handed to
    ``self.resequencer`` together with a BaxH5 fofn built from ``data_file``.

    Args:
        data_file: raw data input used to build ``baxh5.fofn`` under
            ``self.output``.
        amp_analysis_records: sized collection of consensus records that
            becomes the resequencing reference.
        min_length: forwarded unchanged to ``self.resequencer``.
        whitelist_file: forwarded unchanged to ``self.resequencer``.
    """
    log.info('Resequencing supplied Amplicon Analysis sequences')
    workdir = self.get_output_folder("amp_analysis_resequencing")

    # Build the BaxH5 fofn consumed by the resequencer.
    # NOTE(review): the reader is never used again in this method; it is kept
    # because opening it may validate data_file -- confirm before removing.
    bash5_reader = get_bash5_reader(data_file)
    fofn_path = os.path.join(self.output, 'baxh5.fofn')
    create_baxh5_fofn(data_file, fofn_path)

    # Every supplied record is used as a reference sequence.
    record_count = len(amp_analysis_records)
    log.info('Identified {0} consensus sequences to resequence'.format(record_count))
    reference_path = os.path.join(workdir, 'reference.fasta')
    write_records(amp_analysis_records, reference_path)

    # Run the resequencer against the caller-provided ZMW whitelist.
    self.resequencer(
        fofn_path,
        whitelist_file,
        reference_path,
        output=workdir,
        min_length=min_length,
    )
    log.info("Finished resequencing supplied Amplicon Analysis sequences\n")
def __call__(self, amp_analysis, data_file, barcode_file, barcode_string=None,
             min_snr=None, min_length=None):
    """Run the Amplicon Analysis resequencing workflow, one barcode at a time.

    For every selected barcode the matching consensus records are filtered,
    written to ``reference.fasta``, paired with a ZMW whitelist, and passed
    to ``self.resequencer``.

    Args:
        amp_analysis: Amplicon Analysis input, resolved to a single file via
            ``get_input_file``.
        data_file: raw data input used to build ``baxh5.fofn`` and the
            bash5 reader.
        barcode_file: input for ``get_barcode_reader``.
        barcode_string: optional barcode selection passed to
            ``get_barcodes``.
        min_snr: forwarded to ``filter_zmw_list``.
        min_length: forwarded to ``self.resequencer``.
    """
    log.info(
        "Beginning Amplicon Analysis resequencing workflow for {0}".format(
            amp_analysis))

    # Pick or create a single file from AA and read it
    amp_analysis_file = get_input_file(amp_analysis)
    amp_analysis_records = list(AmpliconAnalysisReader(amp_analysis_file))

    # Convert the raw data file into a BaxH5 fofn for use downstream
    # and create appropriate reader for local access
    bash5 = get_bash5_reader(data_file)
    baxh5_file = os.path.join(self.output, 'baxh5.fofn')
    create_baxh5_fofn(data_file, baxh5_file)

    # Create a Reader for the Barcode data and find the overlap with any
    # barcodes specified by the user
    bc_reader = get_barcode_reader(barcode_file)
    bc_list = get_barcodes(bc_reader, barcode_string)
    bc_total = len(bc_list)

    for i, bc in enumerate(bc_list):
        log.info('Resequencing Barcode {0} (#{1} of {2})'.format(
            bc, i + 1, bc_total))
        output_dir = self.get_output_folder(bc)

        # Extract any consensus sequences associated with this barcode
        record_list = [r for r in amp_analysis_records if r.barcode == bc]
        log.info(
            'Identified {0} consensus sequences for Barcode {1}'.format(
                len(record_list), bc))

        # Keep only consensus sequences backed by at least 20 reads, then
        # de-duplicate them
        filtered_records = [r for r in record_list if r.num_reads >= 20]
        unique_records = get_unique_records(filtered_records)

        # A barcode can have no consensus sequences at all; report 0% in
        # that case instead of dividing by zero. Scaling before rounding
        # keeps floating-point noise out of the logged percentage.
        if record_list:
            fraction = round(
                100.0 * len(unique_records) / len(record_list), 1)
        else:
            fraction = 0.0
        log.info(
            '{0} of {1} ({2}%) consensus sequences passed all filters'.format(
                len(unique_records), len(record_list), fraction))
        reference_file = os.path.join(output_dir, 'reference.fasta')
        write_records(unique_records, reference_file)

        # Identify all high-quality, barcode-specific ZMWs and write them to file
        zmw_list = get_barcode_zmws(bc_reader, bc)
        zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
        whitelist_file = os.path.join(output_dir, 'whitelist.txt')
        write_zmw_whitelist(bash5, zmw_list, whitelist_file)

        # Resequence the selected consensus sequences with the selected ZMWs
        self.resequencer(baxh5_file, whitelist_file, reference_file,
                         output=output_dir, min_length=min_length)
        log.info("Finished resequencing Barcode {0}\n".format(bc))
def __call__(self, amp_analysis, data_file, barcode_file, barcode_string=None,
             min_snr=None, min_length=None):
    """Resequence Amplicon Analysis consensus sequences for each barcode.

    Reads the Amplicon Analysis records, builds the BaxH5 fofn, and then,
    per selected barcode, writes a filtered reference and a ZMW whitelist
    before calling ``self.resequencer``.

    Args:
        amp_analysis: Amplicon Analysis input, resolved to a single file via
            ``get_input_file``.
        data_file: raw data input used to build ``baxh5.fofn`` and the
            bash5 reader.
        barcode_file: input for ``get_barcode_reader``.
        barcode_string: optional barcode selection passed to
            ``get_barcodes``.
        min_snr: forwarded to ``filter_zmw_list``.
        min_length: forwarded to ``self.resequencer``.
    """
    log.info("Beginning Amplicon Analysis resequencing workflow for {0}".format(amp_analysis))

    # Pick or create a single file from AA and read it
    amp_analysis_file = get_input_file(amp_analysis)
    amp_analysis_records = list(AmpliconAnalysisReader(amp_analysis_file))

    # Convert the raw data file into a BaxH5 fofn for use downstream
    # and create appropriate reader for local access
    bash5 = get_bash5_reader(data_file)
    baxh5_file = os.path.join(self.output, 'baxh5.fofn')
    create_baxh5_fofn(data_file, baxh5_file)

    # Create a Reader for the Barcode data and find the overlap with any
    # barcodes specified by the user
    bc_reader = get_barcode_reader(barcode_file)
    bc_list = get_barcodes(bc_reader, barcode_string)
    num_barcodes = len(bc_list)

    for i, bc in enumerate(bc_list):
        log.info('Resequencing Barcode {0} (#{1} of {2})'.format(bc, i + 1, num_barcodes))
        output_dir = self.get_output_folder(bc)

        # Extract any consensus sequences associated with this barcode
        record_list = [r for r in amp_analysis_records if r.barcode == bc]
        num_records = len(record_list)
        log.info('Identified {0} consensus sequences for Barcode {1}'.format(num_records, bc))

        # Only consensus sequences with at least 20 reads are kept, and
        # duplicates are collapsed
        filtered_records = [r for r in record_list if r.num_reads >= 20]
        unique_records = get_unique_records(filtered_records)
        num_unique = len(unique_records)

        # Avoid ZeroDivisionError for a barcode without consensus sequences,
        # and multiply before rounding so the logged percentage carries no
        # floating-point noise.
        fraction = round(100.0 * num_unique / num_records, 1) if num_records else 0.0
        log.info('{0} of {1} ({2}%) consensus sequences passed all filters'.format(
            num_unique, num_records, fraction))
        reference_file = os.path.join(output_dir, 'reference.fasta')
        write_records(unique_records, reference_file)

        # Identify all high-quality, barcode-specific ZMWs and write them to file
        zmw_list = get_barcode_zmws(bc_reader, bc)
        zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
        whitelist_file = os.path.join(output_dir, 'whitelist.txt')
        write_zmw_whitelist(bash5, zmw_list, whitelist_file)

        # Resequence the selected consensus sequences with the selected ZMWs
        self.resequencer(baxh5_file, whitelist_file, reference_file,
                         output=output_dir, min_length=min_length)
        log.info("Finished resequencing Barcode {0}\n".format(bc))
def do_resequencing(self, data_file, amp_analysis_records, min_length, whitelist_file):
    """Resequence all supplied Amplicon Analysis records against one whitelist.

    Writes ``amp_analysis_records`` to ``reference.fasta`` in the
    ``amp_analysis_resequencing`` output folder, builds ``baxh5.fofn`` under
    ``self.output`` from ``data_file``, and invokes ``self.resequencer``.

    Args:
        data_file: raw data input for the BaxH5 fofn.
        amp_analysis_records: sized collection of consensus records to
            resequence.
        min_length: passed through to ``self.resequencer``.
        whitelist_file: passed through to ``self.resequencer``.
    """
    log.info("Resequencing supplied Amplicon Analysis sequences")
    target_dir = self.get_output_folder("amp_analysis_resequencing")

    # Prepare the BaxH5 fofn used downstream.
    # NOTE(review): this reader is not referenced again in the method; the
    # call is retained in case it validates data_file -- confirm.
    reader = get_bash5_reader(data_file)
    fofn = os.path.join(self.output, "baxh5.fofn")
    create_baxh5_fofn(data_file, fofn)

    # All supplied records form the reference.
    summary = "Identified {0} consensus sequences to resequence".format(
        len(amp_analysis_records)
    )
    log.info(summary)
    reference = os.path.join(target_dir, "reference.fasta")
    write_records(amp_analysis_records, reference)

    # Resequence the reference using the supplied ZMW whitelist.
    resequencer_options = {"output": target_dir, "min_length": min_length}
    self.resequencer(fofn, whitelist_file, reference, **resequencer_options)
    log.info("Finished resequencing supplied Amplicon Analysis sequences\n")
def cdna_from_file(input_file, hmm_fofn, output=None, reference=None, alignment=None):
    """
    Write the cDNA records derived from a mixed Fasta or Fastq file

    The destination is ``output`` when given, otherwise the path returned
    by ``get_output_file(input_file, 'cDNA')``. Returns the destination path.
    """
    # Resolve the alignment up front; the helper receives the optional
    # reference and pre-computed alignment and decides what to use
    aligned_path = get_alignment_file(input_file, reference, alignment)
    if output:
        destination = output
    else:
        destination = get_output_file(input_file, 'cDNA')

    # Load the three inputs needed to compose the cDNA records
    sequences = _parse_input_records(input_file)
    locus_hmms = parse_locus_dict(hmm_fofn)
    locus_map = _parse_loci(aligned_path)

    # Materialise the records before writing them out
    write_records(list(cdna_from_records(sequences, locus_map, locus_hmms)), destination)
    return destination
def do_barcoded_resequencing(
    self, data_file, amp_analysis_records, barcode_file, barcode_string, min_snr, min_length
):
    """Resequence already-loaded Amplicon Analysis records barcode by barcode.

    For every selected barcode the matching records are filtered, written
    to ``reference.fasta``, paired with a ZMW whitelist, and passed to
    ``self.resequencer``.

    Args:
        data_file: raw data input used to build ``baxh5.fofn`` and the
            bash5 reader.
        amp_analysis_records: iterable of consensus records exposing
            ``barcode`` and ``num_reads``; it is re-scanned once per barcode.
        barcode_file: input for ``get_barcode_reader``.
        barcode_string: barcode selection passed to ``get_barcodes``.
        min_snr: forwarded to ``filter_zmw_list``.
        min_length: forwarded to ``self.resequencer``.
    """
    # Convert the raw data file into a BaxH5 fofn for use downstream
    # and create appropriate reader for local access
    bash5 = get_bash5_reader(data_file)
    baxh5_file = os.path.join(self.output, "baxh5.fofn")
    create_baxh5_fofn(data_file, baxh5_file)

    # Create a Reader for the Barcode data and find the overlap with any
    # barcodes specified by the user
    bc_reader = get_barcode_reader(barcode_file)
    bc_list = get_barcodes(bc_reader, barcode_string)
    bc_total = len(bc_list)

    for i, bc in enumerate(bc_list):
        log.info("Resequencing Barcode {0} (#{1} of {2})".format(bc, i + 1, bc_total))
        output_dir = self.get_output_folder(bc)

        # Extract any consensus sequences associated with this barcode
        record_list = [r for r in amp_analysis_records if r.barcode == bc]
        log.info("Identified {0} consensus sequences for Barcode {1}".format(len(record_list), bc))

        # Keep only consensus sequences backed by at least 20 reads, then
        # de-duplicate them
        filtered_records = [r for r in record_list if r.num_reads >= 20]
        unique_records = get_unique_records(filtered_records)

        # A barcode can have no consensus sequences at all; report 0% in
        # that case instead of dividing by zero. Scaling before rounding
        # keeps floating-point noise out of the logged percentage.
        if record_list:
            fraction = round(100.0 * len(unique_records) / len(record_list), 1)
        else:
            fraction = 0.0
        log.info(
            "{0} of {1} ({2}%) consensus sequences passed all filters".format(
                len(unique_records), len(record_list), fraction
            )
        )
        reference_file = os.path.join(output_dir, "reference.fasta")
        write_records(unique_records, reference_file)

        # Identify all high-quality, barcode-specific ZMWs and write them to file
        zmw_list = get_barcode_zmws(bc_reader, bc)
        zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
        whitelist_file = os.path.join(output_dir, "whitelist.txt")
        write_zmw_whitelist(bash5, zmw_list, whitelist_file)

        # Resequence the selected consensus sequences with the selected ZMWs
        self.resequencer(baxh5_file, whitelist_file, reference_file, output=output_dir, min_length=min_length)
        log.info("Finished resequencing Barcode {0}\n".format(bc))
def cdna_from_file(input_file, hmm_fofn, output=None, reference=None, alignment=None):
    """
    Produce a file of cDNA records from a mixed Fasta or Fastq input

    Returns the path written: ``output`` when supplied, otherwise the
    result of ``get_output_file(input_file, 'cDNA')``.
    """
    # Settle on the alignment and destination before any parsing happens
    alignment_path = get_alignment_file(input_file, reference, alignment)
    result_path = output if output else get_output_file(input_file, 'cDNA')

    # Gather the parsed records, the locus HMM lookup and the loci
    parsed_records = _parse_input_records(input_file)
    hmm_lookup = parse_locus_dict(hmm_fofn)
    locus_assignments = _parse_loci(alignment_path)

    # Build the full cDNA record list, then write it in one go
    extracted = cdna_from_records(parsed_records, locus_assignments, hmm_lookup)
    extracted = list(extracted)
    write_records(extracted, result_path)
    return result_path