예제 #1
0
    def __call__(self,
                 amp_analysis,
                 data_file,
                 barcode_file,
                 barcode_string=None,
                 min_snr=None,
                 min_length=None):
        """Resequence Amplicon Analysis consensus sequences barcode by barcode.

        For each selected barcode, the consensus records carrying that barcode
        are filtered and written to ``reference.fasta``, the barcode's ZMWs
        are filtered and written to ``whitelist.txt``, and
        ``self.resequencer`` is invoked on both files.

        Args:
            amp_analysis: Amplicon Analysis input; resolved to a single file
                by ``get_input_file``.
            data_file: Raw data input; used to open a reader and to build the
                ``baxh5.fofn`` file under ``self.output``.
            barcode_file: Barcode input handed to ``get_barcode_reader``.
            barcode_string: Optional user barcode selection, forwarded to
                ``get_barcodes``.
            min_snr: Forwarded to ``filter_zmw_list`` as ``min_snr``.
            min_length: Forwarded to ``self.resequencer`` as ``min_length``.

        Returns:
            None. Results are written into the per-barcode output folders.
        """
        log.info(
            "Beginning Amplicon Analysis resequencing workflow for {0}".format(
                amp_analysis))

        # Pick or create a single file from AA and read it
        amp_analysis_file = get_input_file(amp_analysis)
        amp_analysis_records = list(AmpliconAnalysisReader(amp_analysis_file))

        # Convert the raw data file into a BaxH5 fofn for use downstream
        # and create appropriate reader for local access
        bash5 = get_bash5_reader(data_file)
        baxh5_file = os.path.join(self.output, 'baxh5.fofn')
        create_baxh5_fofn(data_file, baxh5_file)

        # Create a Reader for the Barcode data and find the overlap with any
        # barcodes specified by the user
        bc_reader = get_barcode_reader(barcode_file)
        bc_list = get_barcodes(bc_reader, barcode_string)
        bc_count = len(bc_list)

        for i, bc in enumerate(bc_list):
            log.info('Resequencing Barcode {0} (#{1} of {2})'.format(
                bc, i + 1, bc_count))
            output_dir = self.get_output_folder(bc)

            # Extract any consensus sequences associated with this barcode
            record_list = [r for r in amp_analysis_records if r.barcode == bc]
            total = len(record_list)
            log.info(
                'Identified {0} consensus sequences for Barcode {1}'.format(
                    total, bc))

            # Keep only records with at least 20 reads, then de-duplicate
            filtered_records = [r for r in record_list if r.num_reads >= 20]
            unique_records = get_unique_records(filtered_records)
            passed = len(unique_records)

            # A barcode may have no consensus sequences at all: report 0%
            # rather than dividing by zero. Scaling before rounding keeps
            # float noise (e.g. 33.300000000000004) out of the log line.
            if total:
                fraction = round(100.0 * passed / total, 1)
            else:
                fraction = 0.0
            log.info(
                '{0} of {1} ({2}%) consensus sequences passed all filters'.
                format(passed, total, fraction))
            reference_file = os.path.join(output_dir, 'reference.fasta')
            write_records(unique_records, reference_file)

            # Identify all high-quality, barcode-specific ZMWs and write them to file
            zmw_list = get_barcode_zmws(bc_reader, bc)
            zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
            whitelist_file = os.path.join(output_dir, 'whitelist.txt')
            write_zmw_whitelist(bash5, zmw_list, whitelist_file)

            # Resequence the selected consensus sequences with the selected ZMWs
            self.resequencer(baxh5_file,
                             whitelist_file,
                             reference_file,
                             output=output_dir,
                             min_length=min_length)

            log.info("Finished resequencing Barcode {0}\n".format(bc))
예제 #2
0
def type_sequences(input, grouping=GROUPING,
                   exon_fofn=None,
                   genomic_reference=None,
                   cDNA_reference=None,
                   loci=None):
    """
    Run the sequence-typing pipeline on *input* and return its summary.

    The sequences are renamed, aligned against the genomic reference,
    re-oriented, reduced to selected alleles, re-aligned, converted to cDNA
    and aligned against the cDNA reference. The value returned is whatever
    ``summarize_typing`` produces from the final genomic and cDNA alignments.

    Any of ``grouping``, ``exon_fofn``, ``genomic_reference`` and
    ``cDNA_reference`` that is falsy is replaced by the module default
    (``GROUPING`` / ``get_exon_reference()`` / ``get_genomic_reference()`` /
    ``get_cDNA_reference()``). ``loci`` is forwarded to ``extract_alleles``
    unchanged.
    """
    # Logging goes to a file derived from the input location
    initialize_logger(log, log_file=get_log_file(input))

    # Fill in every reference the caller did not supply
    if not grouping:
        grouping = GROUPING
    if not exon_fofn:
        exon_fofn = get_exon_reference()
    if not genomic_reference:
        genomic_reference = get_genomic_reference()
    if not cDNA_reference:
        cDNA_reference = get_cDNA_reference()

    # Resolve the input to a single sequence file (it may be a directory)
    sequences = get_input_file(input)

    # Genomic stage: rename, align, orient, then keep the selected alleles
    renamed = rename_sequences(sequences)
    first_pass = full_align_best_reference(renamed, genomic_reference)
    oriented = orient_sequences(renamed, alignment_file=first_pass)
    alleles = extract_alleles(oriented,
                              alignment_file=first_pass,
                              method=grouping,
                              loci=loci)
    genomic_hits = full_align_best_reference(alleles, genomic_reference)

    # cDNA stage: extract the cDNA sequences and align them by identity
    cdna_sequences = extract_cDNA(alleles, exon_fofn,
                                  alignment_file=genomic_hits)
    cdna_hits = align_by_identity(cdna_sequences, cDNA_reference)

    return summarize_typing(genomic_hits, cdna_hits)
예제 #3
0
    def __call__(self, amp_analysis, data_file, barcode_file, barcode_string=None, min_snr=None, min_length=None):
        """Run the barcoded Amplicon Analysis resequencing workflow.

        Each selected barcode gets its own output folder containing a
        ``reference.fasta`` (the filtered consensus records for that barcode)
        and a ``whitelist.txt`` (the filtered ZMWs for that barcode), which
        are then passed to ``self.resequencer``.

        Args:
            amp_analysis: Amplicon Analysis input; resolved to a single file
                by ``get_input_file``.
            data_file: Raw data input; used to open a reader and to build the
                ``baxh5.fofn`` file under ``self.output``.
            barcode_file: Barcode input handed to ``get_barcode_reader``.
            barcode_string: Optional user barcode selection, forwarded to
                ``get_barcodes``.
            min_snr: Forwarded to ``filter_zmw_list`` as ``min_snr``.
            min_length: Forwarded to ``self.resequencer`` as ``min_length``.

        Returns:
            None. Results are written into the per-barcode output folders.
        """
        log.info("Beginning Amplicon Analysis resequencing workflow for {0}".format(amp_analysis))

        # Pick or create a single file from AA and read it
        amp_analysis_file = get_input_file(amp_analysis)
        amp_analysis_records = list(AmpliconAnalysisReader(amp_analysis_file))

        # Convert the raw data file into a BaxH5 fofn for use downstream
        # and create appropriate reader for local access
        bash5 = get_bash5_reader(data_file)
        baxh5_file = os.path.join(self.output, 'baxh5.fofn')
        create_baxh5_fofn(data_file, baxh5_file)

        # Create a Reader for the Barcode data and find the overlap with any
        # barcodes specified by the user
        bc_reader = get_barcode_reader(barcode_file)
        bc_list = get_barcodes(bc_reader, barcode_string)
        num_barcodes = len(bc_list)

        for i, bc in enumerate(bc_list):
            log.info('Resequencing Barcode {0} (#{1} of {2})'.format(bc, i + 1, num_barcodes))
            output_dir = self.get_output_folder(bc)

            # Extract any consensus sequences associated with this barcode
            record_list = [r for r in amp_analysis_records if r.barcode == bc]
            num_records = len(record_list)
            log.info('Identified {0} consensus sequences for Barcode {1}'.format(num_records, bc))

            # Keep only records with at least 20 reads, then de-duplicate
            filtered_records = [r for r in record_list if r.num_reads >= 20]
            unique_records = get_unique_records(filtered_records)
            num_unique = len(unique_records)

            # Guard the percentage: a barcode with no consensus sequences
            # would otherwise raise ZeroDivisionError. Multiplying before
            # rounding avoids float noise such as 33.300000000000004.
            fraction = round(100.0 * num_unique / num_records, 1) if num_records else 0.0
            log.info('{0} of {1} ({2}%) consensus sequences passed all filters'.format(num_unique,
                                                                                       num_records,
                                                                                       fraction))
            reference_file = os.path.join(output_dir, 'reference.fasta')
            write_records(unique_records, reference_file)

            # Identify all high-quality, barcode-specific ZMWs and write them to file
            zmw_list = get_barcode_zmws(bc_reader, bc)
            zmw_list = filter_zmw_list(bash5, zmw_list, min_snr=min_snr)
            whitelist_file = os.path.join(output_dir, 'whitelist.txt')
            write_zmw_whitelist(bash5, zmw_list, whitelist_file)

            # Resequence the selected consensus sequences with the selected ZMWs
            self.resequencer(baxh5_file,
                             whitelist_file,
                             reference_file,
                             output=output_dir,
                             min_length=min_length)

            log.info("Finished resequencing Barcode {0}\n".format(bc))
예제 #4
0
    def __call__(
        self,
        amp_analysis,
        data_file,
        barcode_file=None,
        barcode_string=None,
        min_snr=None,
        min_length=None,
        whitelist_file=None,
    ):
        """Load the Amplicon Analysis records and dispatch to a resequencing workflow.

        When ``barcode_file`` is given, ``self.do_barcoded_resequencing`` is
        called with ``barcode_string``, ``min_snr`` and ``min_length``.
        Otherwise ``self.do_resequencing`` is called with ``min_length`` and
        ``whitelist_file``. Arguments not listed for a branch are not passed
        to it. Nothing is returned.
        """
        start_message = "Beginning Amplicon Analysis resequencing workflow for {0}".format(amp_analysis)
        log.info(start_message)

        # Resolve the AA input to a single file and read every record up front
        records = list(AmpliconAnalysisReader(get_input_file(amp_analysis)))

        # A barcode file selects the barcoded workflow
        if barcode_file is not None:
            self.do_barcoded_resequencing(
                data_file, records, barcode_file, barcode_string, min_snr, min_length
            )
            return

        # No barcode file: plain resequencing
        self.do_resequencing(data_file, records, min_length, whitelist_file)
예제 #5
0
    def __call__(self,
                 amp_analysis,
                 data_file,
                 barcode_file=None,
                 barcode_string=None,
                 min_snr=None,
                 min_length=None,
                 whitelist_file=None):
        """Entry point for the Amplicon Analysis resequencing workflow.

        Reads all records from the Amplicon Analysis input, then runs
        ``self.do_resequencing`` when no ``barcode_file`` is supplied, or
        ``self.do_barcoded_resequencing`` when one is. ``whitelist_file`` is
        only passed to the first; ``barcode_string`` and ``min_snr`` are only
        passed to the second. Returns None.
        """
        log.info(
            "Beginning Amplicon Analysis resequencing workflow for {0}".format(
                amp_analysis))

        # Locate the single AA file to use and materialise its records
        aa_path = get_input_file(amp_analysis)
        aa_reader = AmpliconAnalysisReader(aa_path)
        aa_records = list(aa_reader)

        # Without a barcode file there is nothing to demultiplex
        if barcode_file is None:
            self.do_resequencing(data_file, aa_records, min_length,
                                 whitelist_file)
            return

        self.do_barcoded_resequencing(data_file, aa_records,
                                      barcode_file, barcode_string,
                                      min_snr, min_length)