Пример #1
0
def generate_histogram(qual_fp, output_dir, score_min=25, verbose=True,
                       qual_parser=parse_qual_score):
    """ Main program function for generating quality score histogram

    qual_fp: quality score filepath; opened with gzip_open when the name
     ends in ".gz", otherwise with the builtin open in "U" mode
    output_dir: output directory, passed on to plot_qual_report and
     write_qual_report
    score_min: minimum score to be considered a reliable base call, used
     to generate dotted line on histogram for easy visualization of poor
     quality scores.
    verbose: when true, print the suggested truncation position
    qual_parser : function to apply to the opened file to extract quality
     scores
    """

    if qual_fp.endswith(".gz"):
        qual_lines = gzip_open(qual_fp)
    else:
        qual_lines = open(qual_fp, "U")

    # The handle stays open for the whole pipeline (a custom qual_parser may
    # read lazily) and is released even when a later step raises.
    try:
        qual_scores = qual_parser(qual_lines)

        # Sort bins according to base position
        qual_bins = bin_qual_scores(qual_scores)

        # Get average, std dev, and total nucleotide counts for each base
        # position
        ave_bins, std_dev_bins, total_bases_bins, suggested_trunc_pos = \
            get_qual_stats(qual_bins, score_min)

        plot_qual_report(ave_bins, std_dev_bins, total_bases_bins,
                         score_min, output_dir)

        # Save values to output text file
        write_qual_report(ave_bins, std_dev_bins, total_bases_bins,
                          output_dir, suggested_trunc_pos)
    finally:
        qual_lines.close()

    if verbose:
        # Parenthesised single expression: prints the same text whether
        # print is a statement or a function.
        print("Suggested nucleotide truncation position (None if quality "
              "score average did not fall below the minimum score "
              "parameter): %s\n" % suggested_trunc_pos)
Пример #2
0
def generate_histogram(qual_fp,
                       output_dir,
                       score_min=25,
                       verbose=True,
                       qual_parser=parse_qual_score):
    """ Main program function for generating quality score histogram

    qual_fp: quality score filepath (read through gzip_open when it ends
     with '.gz')
    output_dir: output directory
    score_min: minimum score to be considered a reliable base call, used
     to generate dotted line on histogram for easy visualization of poor
     quality scores.
    verbose: if true, the suggested truncation position is printed
    qual_parser : function to apply to extract quality scores
    """

    if qual_fp.endswith('.gz'):
        qual_lines = gzip_open(qual_fp)
    else:
        qual_lines = open(qual_fp, "U")

    try:
        qual_scores = qual_parser(qual_lines)

        # Sort bins according to base position
        qual_bins = bin_qual_scores(qual_scores)

        # Get average, std dev, and total nucleotide counts for each base
        # position
        stats = get_qual_stats(qual_bins, score_min)
        ave_bins, std_dev_bins, total_bases_bins, suggested_trunc_pos = stats

        plot_qual_report(ave_bins, std_dev_bins, total_bases_bins, score_min,
                         output_dir)

        # Save values to output text file
        write_qual_report(ave_bins, std_dev_bins, total_bases_bins,
                          output_dir, suggested_trunc_pos)
    finally:
        # Always release the input handle, including on failure; it was
        # previously left open for the lifetime of the process.
        qual_lines.close()

    if verbose:
        message = ("Suggested nucleotide truncation position (None if "
                   "quality score average did not fall below the minimum "
                   "score parameter): %s\n" % suggested_trunc_pos)
        # print(<single expression>) emits the same text under the print
        # statement and the print function.
        print(message)
Пример #3
0
def main():
    """Run fastq demultiplexing / quality filtering from the command line.

    Options come from parse_command_line_parameters(**script_info); invalid
    combinations are reported through option_parser.error.  Each
    (sequence reads, barcode reads, mapping file) triple is processed in
    turn and the yielded records are written to seqs.fna (plus seqs.qual
    and seqs.fastq when requested) inside opts.output_dir.  Those outputs
    are written under temporary ".incomplete" names and renamed once every
    input has been processed.  split_library_log.txt and histograms.txt are
    written to the same directory.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_id = opts.sample_id
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # (would otherwise come from opts.filter_bad_illumina_qual_digit)
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run, a sample id is required and there
    # are no barcode reads to pair with the sequence reads
    if barcode_type == 'not-barcoded':
        if sample_id is None:
            option_parser.error("If not providing barcode reads (because "
            "your data is not multiplexed), must provide a --sample_id.")
        barcode_read_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error(
                "Only valid phred offsets are: %s" %
                ' '.join(map(str, phred_to_ascii_fs.keys())))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
         'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
         '0 and 1 (inclusive). You passed %1.5f' % min_per_read_length_fraction)

    try:
        barcode_correction_fn = BARCODE_DECODER_LOOKUP[barcode_type]
    except KeyError:
        barcode_correction_fn = None

    # a single mapping file is reused for every sequence read file
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps), len(barcode_read_fps), len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode and mapping files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below
        def fastq_writer(h, s, q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass

    def md5_hexdigest(fp):
        # hash the file through a handle that is closed straight away
        f = open(fp)
        try:
            return safe_md5(f).hexdigest()
        finally:
            f.close()

    def open_reads(fp):
        # '.gz' inputs go through gzip_open, everything else is opened in
        # 'U' mode
        if fp.endswith('.gz'):
            return gzip_open(fp)
        return open(fp, 'U')

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for sequence_read_fp, barcode_read_fp, mapping_fp in \
            zip(sequence_read_fps, barcode_read_fps, mapping_fps):
        mapping_f = open(mapping_fp, 'U')
        # only the barcode -> sample id mapping is used from check_map
        _, _, barcode_to_sample_id, _, _, _, _ = check_map(
            mapping_f,
            disable_primer_check=True,
            has_barcodes=barcode_read_fp is not None)

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = dict(
                (DNA.rc(k), v) for k, v in barcode_to_sample_id.items())

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = \
                get_invalid_golay_barcodes(barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay codes. "
                    "Do they need to be reverse complimented? If these are not "
                    "golay barcodes pass --barcode_type 12 to disable barcode "
                    "error correction, or pass --barcode_type # if the barcodes "
                    "are not 12 base pairs, where # is the size of the barcodes. "
                    "  Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        log_f.write('Mapping filepath: %s (md5: %s)\n' %
                    (mapping_fp, md5_hexdigest(mapping_fp)))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp, md5_hexdigest(sequence_read_fp)))

        sequence_read_f = open_reads(sequence_read_fp)
        barcode_read_f = None

        # fallback so start_seq_id can still be advanced below when the
        # generator yields nothing
        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp, md5_hexdigest(barcode_read_fp)))

            barcode_read_f = open_reads(barcode_read_fp)
            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_to_ascii_f=phred_to_ascii_f)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_to_ascii_f=phred_to_ascii_f)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        # the inputs for this triple have been fully consumed
        sequence_read_f.close()
        if barcode_read_f is not None:
            barcode_read_f.close()
        mapping_f.close()

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    log_f.close()
    histogram_f.close()

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
Пример #4
0
def main():
    """Run fastq demultiplexing / quality filtering from the command line.

    Options come from parse_command_line_parameters(**script_info).  When
    opts.read_arguments_from_file is set, the sample ids and the three
    filepath options are expanded with parse_items first.  Invalid option
    combinations are reported through option_parser.error.

    Each sequence read file is processed together with its barcode read
    file and mapping file (both None for 'not-barcoded' runs, where the
    matching entry of --sample_ids is used instead).  Records are written
    to seqs.fna, and optionally seqs.qual and seqs.fastq, in
    opts.output_dir; those files are written under temporary ".incomplete"
    names and renamed once all inputs are done.  split_library_log.txt and
    histograms.txt are written to the same directory.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    read_arguments_from_file = opts.read_arguments_from_file

    # these arguments can optionally be read from a file, reasoning is to
    # allow arguments that would span over hundreds of samples and would be
    # prohibitive to execute as a command line call
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    # a single mapping file is reused for every sequence read file
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    input_counts = set([len(sequence_read_fps),
                        len(barcode_read_fps),
                        len(mapping_fps)])
    if len(input_counts) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:

        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:

        def fastq_writer(h, s, q):
            pass

    def md5_hexdigest(fp):
        # hash the file through a handle that is closed straight away
        f = open(fp)
        try:
            return safe_md5(f).hexdigest()
        finally:
            f.close()

    def open_reads(fp):
        # '.gz' inputs go through gzip_open, everything else is opened in
        # 'U' mode
        if fp.endswith('.gz'):
            return gzip_open(fp)
        return open(fp, 'U')

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    # the three lists were checked above to have the same length; the index
    # is still needed to pick the sample id for non-barcoded inputs
    for i, (sequence_read_fp, barcode_read_fp, mapping_fp) in enumerate(
            zip(sequence_read_fps, barcode_read_fps, mapping_fps)):
        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            # only the barcode -> sample id mapping is used from check_map
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {
                str(DNA(k).rc()): v
                for k, v in barcode_to_sample_id.iteritems()
            }

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, md5_hexdigest(mapping_fp)))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp, md5_hexdigest(sequence_read_fp)))

        sequence_read_f = open_reads(sequence_read_fp)
        barcode_read_f = None

        # fallback so start_seq_id can still be advanced below when the
        # generator yields nothing
        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write(
                'Barcode read filepath: %s (md5: %s)\n\n' %
                (barcode_read_fp, md5_hexdigest(barcode_read_fp)))

            barcode_read_f = open_reads(barcode_read_fp)

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f,
                barcode_read_f,
                barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_offset=phred_offset)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f,
                sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp,
                seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f,
                histogram_f=histogram_f,
                phred_offset=phred_offset)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        # the inputs for this iteration have been fully consumed
        sequence_read_f.close()
        if barcode_read_f is not None:
            barcode_read_f.close()
        if mapping_f is not None:
            mapping_f.close()

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    log_f.close()
    histogram_f.close()

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
Пример #5
0
def main():
    """Command-line entry point: demultiplex and quality-filter fastq reads.

    Parses options with parse_command_line_parameters(**script_info),
    optionally expanding the sample id and filepath options through
    parse_items when opts.read_arguments_from_file is set, and reports
    invalid combinations via option_parser.error.

    For every sequence read file (paired by position with a barcode read
    file and a mapping file, or with an entry of --sample_ids when
    --barcode_type is 'not-barcoded') the yielded records are appended to
    seqs.fna and, when requested, seqs.qual / seqs.fastq under
    opts.output_dir.  These are written under temporary ".incomplete"
    names and renamed at the end.  A log (split_library_log.txt) and
    histograms.txt are written next to them.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    read_arguments_from_file = opts.read_arguments_from_file

    # these arguments can optionally be read from a file, reasoning is to
    # allow arguments that would span over hundreds of samples and would be
    # prohibitive to execute as a command line call
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error("If not providing barcode reads (because "
                                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error("If providing --sample_ids (because "
                                "your data is not multiplexed), must provide the same number "
                                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")

    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
                            'Use -q instead (see option help text by passing -h)')

    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    # a lone mapping file applies to every sequence read file
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps), len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass

    def checksum_of(path):
        # md5 hex digest of the file, read through a handle that is closed
        # as soon as hashing finishes
        handle = open(path)
        try:
            return safe_md5(handle).hexdigest()
        finally:
            handle.close()

    def open_fastq(path):
        # gzip_open for '.gz' names, plain 'U'-mode open otherwise
        if path.endswith('.gz'):
            return gzip_open(path)
        return open(path, 'U')

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    for i, sequence_read_fp in enumerate(sequence_read_fps):
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        # handles opened for this input; closed once its reads are consumed
        opened_inputs = []

        if mapping_fp is not None:
            mapping_f = open(mapping_fp, 'U')
            opened_inputs.append(mapping_f)
            # only the barcode -> sample id mapping is used from check_map
            _, _, barcode_to_sample_id, _, _, _, _ = check_map(mapping_f,
                disable_primer_check=True,
                has_barcodes=barcode_read_fp is not None)
        else:
            mapping_f = None
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {str(DNA(k).rc()): v for k, v in
                                    barcode_to_sample_id.iteritems()}

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error("Some or all barcodes are not valid golay "
                                    "codes. Do they need to be reverse complemented? If these "
                                    "are not golay barcodes pass --barcode_type 12 to disable "
                                    "barcode error correction, or pass --barcode_type # if "
                                    "the barcodes are not 12 base pairs, where # is the size "
                                    "of the barcodes. Invalid codes:\n\t%s" %
                                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, checksum_of(mapping_fp)))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp, checksum_of(sequence_read_fp)))

        sequence_read_f = open_fastq(sequence_read_fp)
        opened_inputs.append(sequence_read_f)

        # keeps seq_id defined for the start_seq_id update below even when
        # the generator yields no records
        seq_id = start_seq_id

        if barcode_read_fp is not None:
            log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp, checksum_of(barcode_read_fp)))

            barcode_read_f = open_fastq(barcode_read_fp)
            opened_inputs.append(barcode_read_f)

            seq_generator = process_fastq_single_end_read_file(
                sequence_read_f, barcode_read_f, barcode_to_sample_id,
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp, rev_comp_barcode=rev_comp_barcode,
                seq_max_N=seq_max_N, start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f, histogram_f=histogram_f,
                barcode_correction_fn=barcode_correction_fn,
                max_barcode_errors=max_barcode_errors,
                phred_offset=phred_offset)
        else:
            seq_generator = process_fastq_single_end_read_file_no_barcode(
                sequence_read_f, sample_ids[i],
                store_unassigned=retain_unassigned_reads,
                max_bad_run_length=max_bad_run_length,
                phred_quality_threshold=phred_quality_threshold,
                min_per_read_length_fraction=min_per_read_length_fraction,
                rev_comp=rev_comp, seq_max_N=seq_max_N,
                start_seq_id=start_seq_id,
                filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                log_f=log_f, histogram_f=histogram_f,
                phred_offset=phred_offset)

        for fasta_header, sequence, quality, seq_id in seq_generator:
            output_f.write('>%s\n%s\n' % (fasta_header, sequence))
            qual_writer(fasta_header, quality)
            fastq_writer(fasta_header, sequence, quality)

        # the generator is exhausted, so its inputs can be released
        for handle in opened_inputs:
            handle.close()

        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    # nothing writes to the log or histogram files past this point
    log_f.close()
    histogram_f.close()

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
#!/usr/bin/env python
# this is from this GIST
# https://gist.github.com/7a1e8608b82c5ce580c9.git
from __future__ import division

from sys import argv

from qiime.util import parse_command_line_parameters, make_option, gzip_open

# Open the input file named by the first command-line argument, going
# through gzip_open when the path ends in '.gz'.
if argv[1].endswith('.gz'):
    f = gzip_open(argv[1])
else:
    f = open(argv[1],'U')

# The second command-line argument is opened for writing.
outf = open(argv[2], "w")

# The third command-line argument is the number of leading characters of a
# line that are taken as the barcode.
barcode_len = int(argv[3])

# Maps barcode string -> number of times it has been seen.
barcodes = {}

# NOTE(review): presumably the position of the current line within a
# multi-line record; the code that updates it is not visible in this
# excerpt -- confirm.
counter = 0

total_bcs = 0

for line in f:
    # Only lines seen while counter == 1 contribute a barcode.
    if counter == 1:
        curr_barcode = line.strip()[0:barcode_len]

        # Increment the tally for this barcode; a KeyError means it has not
        # been seen before (the handler body is cut off in this excerpt).
        try:
            barcodes[curr_barcode] += 1
        except KeyError:
Пример #7
0
def _open_possibly_gzipped(fp):
    """Open ``fp`` for reading, via ``gzip_open`` when it ends in '.gz'."""
    if fp.endswith('.gz'):
        return gzip_open(fp)
    return open(fp, 'U')


def _md5_hexdigest(fp):
    """Return the hex md5 digest of the file at ``fp``, closing the handle."""
    with open(fp) as f:
        return safe_md5(f).hexdigest()


def main():
    """Command-line entry point: quality filter (and demultiplex) fastq reads.

    Parses the options described by ``script_info``, validates them, and then
    processes every sequence read file together with its mapping file and,
    unless ``--barcode_type`` is 'not-barcoded', its barcode read file.

    Files written to ``opts.output_dir``:
      * seqs.fna -- always
      * seqs.qual -- only when ``opts.store_qual_scores`` is set
      * seqs.fastq -- only when ``opts.store_demultiplexed_fastq`` is set
      * split_library_log.txt and histograms.txt

    The sequence outputs are written under '*.incomplete' names and renamed
    to their final names only after all inputs were processed.

    Invalid option combinations are reported through ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    filter_bad_illumina_qual_digit = False  # opts.filter_bad_illumina_qual_digit
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run, every sequence file needs an
    # explicit sample id and there are no barcode read files
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error(
                "Only valid phred offsets are: %s" %
                ' '.join(str(k) for k in phred_to_ascii_fs))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
                            '0 and 1 (inclusive). You passed %1.5f' %
                            min_per_read_length_fraction)

    # barcode types without a registered decoder get no error correction
    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type)

    # a single mapping file is shared by all sequence read files
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    input_counts = {len(sequence_read_fps),
                    len(barcode_read_fps),
                    len(mapping_fps)}
    if len(input_counts) > 1:
        option_parser.error(
            "Same number of sequence, barcode and mapping files must be provided."
        )

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    # define the optional writers whether we're storing their output or not
    # so we don't have to check every time through the for loop below
    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:

        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        def fastq_writer(h, s, q):
            output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
    else:

        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    histogram_fp = '%s/histograms.txt' % output_dir

    # the context manager guarantees the log and histogram files are flushed
    # and closed, even when option_parser.error exits from inside the loop
    with open(log_fp, 'w') as log_f, open(histogram_fp, 'w') as histogram_f:
        inputs = zip(sequence_read_fps, barcode_read_fps, mapping_fps)
        for i, (sequence_read_fp, barcode_read_fp, mapping_fp) in \
                enumerate(inputs):
            with open(mapping_fp, 'U') as mapping_f:
                _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                    mapping_f,
                    disable_primer_check=True,
                    has_barcodes=barcode_read_fp is not None)

            if rev_comp_mapping_barcodes:
                barcode_to_sample_id = {
                    DNA.rc(k): v for k, v in barcode_to_sample_id.items()}

            if barcode_type == 'golay_12':
                invalid_golay_barcodes = \
                    get_invalid_golay_barcodes(barcode_to_sample_id.keys())
                if len(invalid_golay_barcodes) > 0:
                    option_parser.error(
                        "Some or all barcodes are not valid golay codes. "
                        "Do they need to be reverse complimented? If these are not "
                        "golay barcodes pass --barcode_type 12 to disable barcode "
                        "error correction, or pass --barcode_type # if the barcodes "
                        "are not 12 base pairs, where # is the size of the barcodes. "
                        "  Invalid codes:\n\t%s" %
                        ' '.join(invalid_golay_barcodes))

            log_f.write("Input file paths\n")
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, _md5_hexdigest(mapping_fp)))
            log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                        (sequence_read_fp, _md5_hexdigest(sequence_read_fp)))

            sequence_read_f = _open_possibly_gzipped(sequence_read_fp)
            barcode_read_f = None
            try:
                # if the generator yields nothing, seq_id keeps this value
                seq_id = start_seq_id

                if barcode_read_fp is not None:
                    log_f.write(
                        'Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp, _md5_hexdigest(barcode_read_fp)))

                    barcode_read_f = _open_possibly_gzipped(barcode_read_fp)
                    seq_generator = process_fastq_single_end_read_file(
                        sequence_read_f,
                        barcode_read_f,
                        barcode_to_sample_id,
                        store_unassigned=retain_unassigned_reads,
                        max_bad_run_length=max_bad_run_length,
                        phred_quality_threshold=phred_quality_threshold,
                        min_per_read_length_fraction=
                        min_per_read_length_fraction,
                        rev_comp=rev_comp,
                        rev_comp_barcode=rev_comp_barcode,
                        seq_max_N=seq_max_N,
                        start_seq_id=start_seq_id,
                        filter_bad_illumina_qual_digit=
                        filter_bad_illumina_qual_digit,
                        log_f=log_f,
                        histogram_f=histogram_f,
                        barcode_correction_fn=barcode_correction_fn,
                        max_barcode_errors=max_barcode_errors,
                        phred_to_ascii_f=phred_to_ascii_f)
                else:
                    seq_generator = \
                        process_fastq_single_end_read_file_no_barcode(
                            sequence_read_f,
                            sample_ids[i],
                            store_unassigned=retain_unassigned_reads,
                            max_bad_run_length=max_bad_run_length,
                            phred_quality_threshold=phred_quality_threshold,
                            min_per_read_length_fraction=
                            min_per_read_length_fraction,
                            rev_comp=rev_comp,
                            seq_max_N=seq_max_N,
                            start_seq_id=start_seq_id,
                            filter_bad_illumina_qual_digit=
                            filter_bad_illumina_qual_digit,
                            log_f=log_f,
                            histogram_f=histogram_f,
                            phred_to_ascii_f=phred_to_ascii_f)

                for fasta_header, sequence, quality, seq_id in seq_generator:
                    output_f.write('>%s\n%s\n' % (fasta_header, sequence))
                    qual_writer(fasta_header, quality)
                    fastq_writer(fasta_header, sequence, quality)
            finally:
                # the generator reads lazily, so the inputs can only be
                # closed once it has been consumed (or has failed)
                sequence_read_f.close()
                if barcode_read_f is not None:
                    barcode_read_f.close()

            # continue numbering where this input file left off
            start_seq_id = seq_id + 1
            log_f.write('\n---\n\n')

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)