Пример #1
0
    def do_resequencing(self, data_file, amp_analysis_records, min_length,
                        whitelist_file):
        """
        Resequence a caller-supplied set of Amplicon Analysis records.

        The records are written to 'reference.fasta' inside a dedicated
        output folder, a BaxH5 fofn is created from data_file under
        self.output, and both are handed to self.resequencer together with
        whitelist_file and min_length.  Returns None.
        """
        log.info('Resequencing supplied Amplicon Analysis sequences')
        work_dir = self.get_output_folder("amp_analysis_resequencing")

        # Build the BaxH5 fofn that is passed to the resequencer.
        # NOTE(review): the reader below is never used in this method; the
        # call is kept in case it performs checks on data_file -- confirm.
        unused_reader = get_bash5_reader(data_file)
        fofn_path = os.path.join(self.output, 'baxh5.fofn')
        create_baxh5_fofn(data_file, fofn_path)

        # The supplied records are written out unfiltered as the reference
        record_count = len(amp_analysis_records)
        log.info(
            'Identified {0} consensus sequences to resequence'.format(
                record_count))
        reference_path = os.path.join(work_dir, 'reference.fasta')
        write_records(amp_analysis_records, reference_path)

        # Run the resequencer on the fofn, whitelist and reference
        resequencer_options = {'output': work_dir, 'min_length': min_length}
        self.resequencer(fofn_path, whitelist_file, reference_path,
                         **resequencer_options)
        log.info(
            "Finished resequencing supplied Amplicon Analysis sequences\n")
Пример #2
0
    def __call__(self,
                 amp_analysis,
                 data_file,
                 barcode_file,
                 barcode_string=None,
                 min_snr=None,
                 min_length=None,
                 min_reads=20):
        """
        Resequence Amplicon Analysis consensus sequences barcode by barcode.

        For every barcode returned by get_barcodes, the consensus records
        carrying that barcode are filtered (num_reads >= min_reads, then
        get_unique_records), written to 'reference.fasta' in that barcode's
        output folder, and passed to self.resequencer together with a ZMW
        whitelist written for the same barcode.

        Args:
            amp_analysis: Amplicon Analysis input; resolved to a single file
                by get_input_file.
            data_file: raw data input used to create the BaxH5 fofn and the
                local reader.
            barcode_file: input handed to get_barcode_reader.
            barcode_string: optional selection passed to get_barcodes.
            min_snr: passed through to filter_zmw_list.
            min_length: passed through to self.resequencer.
            min_reads: minimum num_reads for a consensus record to be kept.
                Defaults to 20, the previously hard-coded threshold.

        Returns:
            None.
        """
        log.info(
            "Beginning Amplicon Analysis resequencing workflow for {0}".format(
                amp_analysis))

        # Pick or create a single file from AA and read it
        amp_analysis_file = get_input_file(amp_analysis)
        amp_analysis_records = list(AmpliconAnalysisReader(amp_analysis_file))

        # Convert the raw data file into a BaxH5 fofn for use downstream
        # and create appropriate reader for local access
        bash5 = get_bash5_reader(data_file)
        baxh5_file = os.path.join(self.output, 'baxh5.fofn')
        create_baxh5_fofn(data_file, baxh5_file)

        # Create a Reader for the Barcode data and find the overlap with any
        # barcodes specified by the user
        bc_reader = get_barcode_reader(barcode_file)
        bc_list = get_barcodes(bc_reader, barcode_string)
        bc_count = len(bc_list)

        for i, bc in enumerate(bc_list):
            log.info('Resequencing Barcode {0} (#{1} of {2})'.format(
                bc, i + 1, bc_count))
            output_dir = self.get_output_folder(bc)

            # Extract any consensus sequences associated with this barcode
            record_list = [r for r in amp_analysis_records if r.barcode == bc]
            total = len(record_list)
            log.info(
                'Identified {0} consensus sequences for Barcode {1}'.format(
                    total, bc))
            filtered_records = [
                r for r in record_list if r.num_reads >= min_reads
            ]
            unique_records = get_unique_records(filtered_records)
            passed = len(unique_records)

            # A barcode without any consensus sequences would otherwise raise
            # ZeroDivisionError here, so report 0% instead.  Scaling to a
            # percentage before rounding avoids float artifacts such as
            # 33.300000000000004 in the log message.
            if total:
                fraction = round(100.0 * passed / total, 1)
            else:
                fraction = 0.0
            log.info(
                '{0} of {1} ({2}%) consensus sequences passed all filters'.
                format(passed, total, fraction))
            reference_file = os.path.join(output_dir, 'reference.fasta')
            write_records(unique_records, reference_file)

            # Identify all high-quality, barcode-specific ZMWs and write them to file
            zmw_list = get_barcode_zmws(bc_reader, bc)
            zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
            whitelist_file = os.path.join(output_dir, 'whitelist.txt')
            write_zmw_whitelist(bash5, zmw_list, whitelist_file)

            # Resequence the selected consensus sequences with the selected ZMWs
            self.resequencer(baxh5_file,
                             whitelist_file,
                             reference_file,
                             output=output_dir,
                             min_length=min_length)

            log.info("Finished resequencing Barcode {0}\n".format(bc))
Пример #3
0
    def __call__(self, amp_analysis, data_file, barcode_file, barcode_string=None, min_snr=None, min_length=None,
                 min_reads=20):
        """
        Resequence Amplicon Analysis consensus sequences barcode by barcode.

        For every barcode returned by get_barcodes, the consensus records
        carrying that barcode are filtered (num_reads >= min_reads, then
        get_unique_records), written to 'reference.fasta' in that barcode's
        output folder, and passed to self.resequencer together with a ZMW
        whitelist written for the same barcode.

        amp_analysis is resolved to a single file by get_input_file;
        data_file is used to create the BaxH5 fofn and the local reader;
        barcode_file and barcode_string are handed to get_barcode_reader and
        get_barcodes; min_snr is passed to filter_zmw_list and min_length to
        self.resequencer.  min_reads is the minimum num_reads a consensus
        record needs to be kept (default 20, the previously hard-coded
        threshold).  Returns None.
        """
        log.info("Beginning Amplicon Analysis resequencing workflow for {0}".format(amp_analysis))

        # Pick or create a single file from AA and read it
        amp_analysis_file = get_input_file( amp_analysis )
        amp_analysis_records = list(AmpliconAnalysisReader(amp_analysis_file))

        # Convert the raw data file into a BaxH5 fofn for use downstream
        # and create appropriate reader for local access
        bash5 = get_bash5_reader( data_file )
        baxh5_file = os.path.join( self.output, 'baxh5.fofn')
        create_baxh5_fofn( data_file, baxh5_file )

        # Create a Reader for the Barcode data and find the overlap with any
        # barcodes specified by the user
        bc_reader = get_barcode_reader( barcode_file )
        bc_list = get_barcodes( bc_reader, barcode_string )
        bc_count = len( bc_list )

        for i, bc in enumerate( bc_list ):
            log.info('Resequencing Barcode {0} (#{1} of {2})'.format(bc, i+1, bc_count))
            output_dir = self.get_output_folder( bc )

            # Extract any consensus sequences associated with this barcode
            record_list = [r for r in amp_analysis_records if r.barcode == bc]
            total = len( record_list )
            log.info('Identified {0} consensus sequences for Barcode {1}'.format(total, bc))
            filtered_records = [r for r in record_list if r.num_reads >= min_reads]
            unique_records = get_unique_records( filtered_records )
            passed = len( unique_records )

            # A barcode without any consensus sequences would otherwise raise
            # ZeroDivisionError here, so report 0% instead.  Scaling to a
            # percentage before rounding avoids float artifacts such as
            # 33.300000000000004 in the log message.
            if total:
                fraction = round(100.0 * passed / total, 1)
            else:
                fraction = 0.0
            log.info('{0} of {1} ({2}%) consensus sequences passed all filters'.format(passed,
                                                                                       total,
                                                                                       fraction))
            reference_file = os.path.join( output_dir, 'reference.fasta' )
            write_records( unique_records, reference_file )

            # Identify all high-quality, barcode-specific ZMWs and write them to file
            zmw_list = get_barcode_zmws( bc_reader, bc )
            zmw_list = filter_zmw_list( bash5, zmw_list, min_snr=min_snr )
            whitelist_file = os.path.join( output_dir, 'whitelist.txt' )
            write_zmw_whitelist( bash5, zmw_list, whitelist_file )

            # Resequence the selected consensus sequences with the selected ZMWs
            self.resequencer( baxh5_file,
                              whitelist_file,
                              reference_file,
                              output=output_dir,
                              min_length=min_length )

            log.info("Finished resequencing Barcode {0}\n".format( bc ))
Пример #4
0
    def do_resequencing(self, data_file, amp_analysis_records, min_length, whitelist_file):
        """
        Resequence a caller-supplied set of Amplicon Analysis records.

        Writes the records to "reference.fasta" in a dedicated output folder,
        creates a BaxH5 fofn from data_file under self.output, and calls
        self.resequencer with the fofn, whitelist_file, the reference and
        min_length.  Returns None.
        """
        start_message = "Resequencing supplied Amplicon Analysis sequences"
        done_message = "Finished resequencing supplied Amplicon Analysis sequences\n"

        log.info(start_message)
        target_dir = self.get_output_folder("amp_analysis_resequencing")

        # Create the BaxH5 fofn handed to the resequencer.
        # NOTE(review): this reader is not used anywhere in the method; the
        # call is retained in case it checks data_file -- confirm.
        reader = get_bash5_reader(data_file)
        fofn = os.path.join(self.output, "baxh5.fofn")
        create_baxh5_fofn(data_file, fofn)

        # All supplied records go into the reference, without filtering
        n_records = len(amp_analysis_records)
        log.info("Identified {0} consensus sequences to resequence".format(n_records))
        reference = os.path.join(target_dir, "reference.fasta")
        write_records(amp_analysis_records, reference)

        # Run the resequencer on the fofn, whitelist and reference
        positional = (fofn, whitelist_file, reference)
        self.resequencer(*positional, output=target_dir, min_length=min_length)
        log.info(done_message)
Пример #5
0
def cdna_from_file( input_file, hmm_fofn, output=None,
                                          reference=None,
                                          alignment=None ):
    """
    Extract the cDNA sequences from a mixed Fasta or Fastq file.

    The alignment file is obtained from get_alignment_file, the input
    records, HMMs and loci are parsed, and the records produced by
    cdna_from_records are written out.  Returns the output file: the
    `output` argument when it is truthy, otherwise the value of
    get_output_file( input_file, 'cDNA' ).
    """
    # Resolve the alignment first, then decide where the results are written
    aligned = get_alignment_file( input_file, reference, alignment )
    if output:
        destination = output
    else:
        destination = get_output_file( input_file, 'cDNA' )

    # Parse the three inputs needed to compose the cDNA records
    input_records = _parse_input_records( input_file )
    locus_hmms = parse_locus_dict( hmm_fofn )
    locus_calls = _parse_loci( aligned )

    # Materialise every cDNA record before writing them out
    extracted = cdna_from_records( input_records, locus_calls, locus_hmms )
    write_records( list( extracted ), destination )
    return destination
Пример #6
0
    def do_barcoded_resequencing(
        self, data_file, amp_analysis_records, barcode_file, barcode_string, min_snr, min_length, min_reads=20
    ):
        """
        Resequence the supplied consensus records barcode by barcode.

        For every barcode returned by get_barcodes, the records carrying that
        barcode are filtered (num_reads >= min_reads, then
        get_unique_records), written to "reference.fasta" in that barcode's
        output folder, and passed to self.resequencer together with a ZMW
        whitelist written for the same barcode.

        Args:
            data_file: raw data input used to create the BaxH5 fofn and the
                local reader.
            amp_analysis_records: consensus records; each must expose
                `barcode` and `num_reads`.
            barcode_file: input handed to get_barcode_reader.
            barcode_string: selection passed to get_barcodes.
            min_snr: passed through to filter_zmw_list.
            min_length: passed through to self.resequencer.
            min_reads: minimum num_reads for a consensus record to be kept.
                Defaults to 20, the previously hard-coded threshold.

        Returns:
            None.
        """
        # Convert the raw data file into a BaxH5 fofn for use downstream
        # and create appropriate reader for local access
        bash5 = get_bash5_reader(data_file)
        baxh5_file = os.path.join(self.output, "baxh5.fofn")
        create_baxh5_fofn(data_file, baxh5_file)

        # Create a Reader for the Barcode data and find the overlap with any
        # barcodes specified by the user
        bc_reader = get_barcode_reader(barcode_file)
        bc_list = get_barcodes(bc_reader, barcode_string)
        bc_count = len(bc_list)

        for i, bc in enumerate(bc_list):
            log.info("Resequencing Barcode {0} (#{1} of {2})".format(bc, i + 1, bc_count))
            output_dir = self.get_output_folder(bc)

            # Extract any consensus sequences associated with this barcode
            record_list = [r for r in amp_analysis_records if r.barcode == bc]
            total = len(record_list)
            log.info("Identified {0} consensus sequences for Barcode {1}".format(total, bc))
            filtered_records = [r for r in record_list if r.num_reads >= min_reads]
            unique_records = get_unique_records(filtered_records)
            passed = len(unique_records)

            # A barcode without any consensus sequences would otherwise raise
            # ZeroDivisionError here, so report 0% instead.  Scaling to a
            # percentage before rounding avoids float artifacts such as
            # 33.300000000000004 in the log message.
            if total:
                fraction = round(100.0 * passed / total, 1)
            else:
                fraction = 0.0
            log.info(
                "{0} of {1} ({2}%) consensus sequences passed all filters".format(
                    passed, total, fraction
                )
            )
            reference_file = os.path.join(output_dir, "reference.fasta")
            write_records(unique_records, reference_file)

            # Identify all high-quality, barcode-specific ZMWs and write them to file
            zmw_list = get_barcode_zmws(bc_reader, bc)
            zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
            whitelist_file = os.path.join(output_dir, "whitelist.txt")
            write_zmw_whitelist(bash5, zmw_list, whitelist_file)

            # Resequence the selected consensus sequences with the selected ZMWs
            self.resequencer(baxh5_file, whitelist_file, reference_file, output=output_dir, min_length=min_length)

            log.info("Finished resequencing Barcode {0}\n".format(bc))
Пример #7
0
def cdna_from_file(input_file,
                   hmm_fofn,
                   output=None,
                   reference=None,
                   alignment=None):
    """
    Extract the cDNA sequences from a mixed Fasta or Fastq file.

    The alignment file is obtained from get_alignment_file, the input
    records, HMMs and loci are parsed, and the records produced by
    cdna_from_records are written out.  Returns the output file: the
    `output` argument when it is truthy, otherwise the value of
    get_output_file(input_file, 'cDNA').
    """
    # Resolve the alignment first, then decide where the results are written
    aligned_path = get_alignment_file(input_file, reference, alignment)
    if output:
        result_path = output
    else:
        result_path = get_output_file(input_file, 'cDNA')

    # Parse the three inputs needed to compose the cDNA records
    source_records = _parse_input_records(input_file)
    hmm_by_locus = parse_locus_dict(hmm_fofn)
    locus_table = _parse_loci(aligned_path)

    # Materialise every cDNA record before writing them out
    composed = cdna_from_records(source_records, locus_table, hmm_by_locus)
    write_records(list(composed), result_path)
    return result_path