def main():
    """Run the fastq demultiplexing / quality-filtering script.

    Parses the command line options described by ``script_info``, validates
    them (problems are reported through ``option_parser.error``), and then
    processes each sequence read file in turn, together with the matching
    barcode read file and mapping file when the data is barcoded.

    The following files are written to ``opts.output_dir``:

    * ``seqs.fna`` -- one fasta record per read yielded by the processor
    * ``seqs.qual`` -- only if ``opts.store_qual_scores`` is set
    * ``seqs.fastq`` -- only if ``opts.store_demultiplexed_fastq`` is set
    * ``split_library_log.txt`` and ``histograms.txt``

    The sequence outputs are written under a temporary ``.incomplete`` name
    and renamed only after every input has been processed. All file handles
    opened here are closed before returning, including on error paths.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error("If not providing barcode reads (because "
                                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error("If providing --sample_ids (because "
                                "your data is not multiplexed), must provide the same number "
                                "of sample ids as sequence read filepaths.")
        # placeholders so the per-file loop below can treat every run alike
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_to_ascii_f = phred_to_ascii_fs[phred_offset]
        except KeyError:
            # shouldn't be able to get here, but we'll stay on the
            # safe side
            option_parser.error("Only valid phred offsets are: %s" %
                                ' '.join(phred_to_ascii_fs.keys()))
    else:
        # let split_libraries_fastq.process_fastq_single_end_read_file
        # figure it out...
        phred_to_ascii_f = None

    if opts.last_bad_quality_char is not None:
        option_parser.error('--last_bad_quality_char is no longer supported. '
                            'Use -q instead (see option help text by passing -h)')

    if not (0 <= min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be between '
                            '0 and 1 (inclusive). You passed %1.5f' %
                            min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    # a single mapping file may be shared by all sequence read files
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps), len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir
    log_fp = '%s/split_library_log.txt' % output_dir
    histogram_fp = '%s/histograms.txt' % output_dir

    def md5_hexdigest(fp):
        # checksum a file without leaking the handle used to read it
        with open(fp) as f:
            return safe_md5(f).hexdigest()

    def open_reads(fp):
        # read files may be gzip-compressed; pick the opener by extension
        if fp.endswith('.gz'):
            return gzip_open(fp)
        return open(fp, 'U')

    output_f = qual_f = output_fastq_f = log_f = histogram_f = None
    try:
        output_f = open(output_fp_temp, 'w')

        if store_qual_scores:
            qual_f = open(qual_fp_temp, 'w')
            # define a qual writer whether we're storing
            # qual strings or not so we don't have to check
            # every time through the for loop below

            def qual_writer(h, q):
                qual_f.write('>%s\n%s\n' % (h, q))
        else:
            def qual_writer(h, q):
                pass

        if store_demultiplexed_fastq:
            output_fastq_f = open(output_fastq_fp_temp, 'w')
            # define a fastq writer whether we're storing
            # qual strings or not so we don't have to check
            # every time through the for loop below

            def fastq_writer(h, s, q):
                output_fastq_f.write('@%s\n%s\n+\n%s\n' % (h, s, q))
        else:
            def fastq_writer(h, s, q):
                pass

        log_f = open(log_fp, 'w')
        histogram_f = open(histogram_fp, 'w')

        # the three lists were verified above to have the same length
        for i, (sequence_read_fp, barcode_read_fp, mapping_fp) in enumerate(
                zip(sequence_read_fps, barcode_read_fps, mapping_fps)):
            mapping_f = sequence_read_f = barcode_read_f = None
            try:
                if mapping_fp is not None:
                    mapping_f = open(mapping_fp, 'U')
                    _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                        mapping_f,
                        disable_primer_check=True,
                        has_barcodes=barcode_read_fp is not None)
                else:
                    barcode_to_sample_id = {}

                if rev_comp_mapping_barcodes:
                    barcode_to_sample_id = {
                        str(DNA(k).rc()): v
                        for k, v in barcode_to_sample_id.iteritems()}

                if barcode_type == 'golay_12':
                    invalid_golay_barcodes = get_invalid_golay_barcodes(
                        barcode_to_sample_id.keys())
                    if len(invalid_golay_barcodes) > 0:
                        option_parser.error(
                            "Some or all barcodes are not valid golay "
                            "codes. Do they need to be reverse complemented? If these "
                            "are not golay barcodes pass --barcode_type 12 to disable "
                            "barcode error correction, or pass --barcode_type # if "
                            "the barcodes are not 12 base pairs, where # is the size "
                            "of the barcodes. Invalid codes:\n\t%s" %
                            ' '.join(invalid_golay_barcodes))

                log_f.write("Input file paths\n")
                if mapping_fp is not None:
                    log_f.write('Mapping filepath: %s (md5: %s)\n' %
                                (mapping_fp, md5_hexdigest(mapping_fp)))
                log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                            (sequence_read_fp,
                             md5_hexdigest(sequence_read_fp)))

                sequence_read_f = open_reads(sequence_read_fp)

                # fall-back so start_seq_id still advances when the
                # generator below yields nothing for this file
                seq_id = start_seq_id

                if barcode_read_fp is not None:
                    log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                                (barcode_read_fp,
                                 md5_hexdigest(barcode_read_fp)))
                    barcode_read_f = open_reads(barcode_read_fp)

                    seq_generator = process_fastq_single_end_read_file(
                        sequence_read_f,
                        barcode_read_f,
                        barcode_to_sample_id,
                        store_unassigned=retain_unassigned_reads,
                        max_bad_run_length=max_bad_run_length,
                        phred_quality_threshold=phred_quality_threshold,
                        min_per_read_length_fraction=min_per_read_length_fraction,
                        rev_comp=rev_comp,
                        rev_comp_barcode=rev_comp_barcode,
                        seq_max_N=seq_max_N,
                        start_seq_id=start_seq_id,
                        filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                        log_f=log_f,
                        histogram_f=histogram_f,
                        barcode_correction_fn=barcode_correction_fn,
                        max_barcode_errors=max_barcode_errors,
                        phred_to_ascii_f=phred_to_ascii_f)
                else:
                    seq_generator = process_fastq_single_end_read_file_no_barcode(
                        sequence_read_f,
                        sample_ids[i],
                        store_unassigned=retain_unassigned_reads,
                        max_bad_run_length=max_bad_run_length,
                        phred_quality_threshold=phred_quality_threshold,
                        min_per_read_length_fraction=min_per_read_length_fraction,
                        rev_comp=rev_comp,
                        seq_max_N=seq_max_N,
                        start_seq_id=start_seq_id,
                        filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                        log_f=log_f,
                        histogram_f=histogram_f,
                        phred_to_ascii_f=phred_to_ascii_f)

                for fasta_header, sequence, quality, seq_id in seq_generator:
                    output_f.write('>%s\n%s\n' % (fasta_header, sequence))
                    qual_writer(fasta_header, quality)
                    fastq_writer(fasta_header, sequence, quality)
            finally:
                # release this iteration's input handles; with hundreds of
                # samples the leaked descriptors would otherwise pile up
                for handle in (mapping_f, sequence_read_f, barcode_read_f):
                    if handle is not None:
                        handle.close()

            # continue numbering after the last id used for this file
            start_seq_id = seq_id + 1
            log_f.write('\n---\n\n')
    finally:
        # flush and close every output, even when a failure aborts the run
        for handle in (output_f, qual_f, output_fastq_f, log_f, histogram_f):
            if handle is not None:
                handle.close()

    # only reached on success: promote the temporary outputs
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        rename(output_fastq_fp_temp, output_fastq_fp)
def main():
    """Run the fastq demultiplexing / quality-filtering script.

    Parses the command line options described by ``script_info`` (optionally
    expanding file-based argument lists when
    ``opts.read_arguments_from_file`` is set), validates them (problems are
    reported through ``option_parser.error``), and then processes each
    sequence read file in turn, together with the matching barcode read file
    and mapping file when the data is barcoded.

    The following files are written to ``opts.output_dir``:

    * ``seqs.fna`` -- one fasta record per read yielded by the processor
    * ``seqs.qual`` -- only if ``opts.store_qual_scores`` is set
    * ``seqs.fastq`` -- only if ``opts.store_demultiplexed_fastq`` is set
    * ``split_library_log.txt`` and ``histograms.txt``

    The sequence outputs are written under a temporary ``.incomplete`` name
    and renamed only after every input has been processed. All file handles
    opened here are closed before returning, including on error paths.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    read_arguments_from_file = opts.read_arguments_from_file

    # these arguments can optionally be read from a file, reasoning is to
    # allow arguments that would span over hundreds of samples and would be
    # prohibitive to execute as a command line call
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        # placeholders so the per-file loop below can treat every run alike
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    # a single mapping file may be shared by all sequence read files
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps), len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir
    log_fp = '%s/split_library_log.txt' % output_dir
    histogram_fp = '%s/histograms.txt' % output_dir

    def md5_hexdigest(fp):
        # checksum a file without leaking the handle used to read it
        with open(fp) as f:
            return safe_md5(f).hexdigest()

    def open_reads(fp):
        # read files may be gzip-compressed; pick the opener by extension
        if fp.endswith('.gz'):
            return gzip_open(fp)
        return open(fp, 'U')

    output_f = qual_f = output_fastq_f = log_f = histogram_f = None
    try:
        output_f = open(output_fp_temp, 'w')

        if store_qual_scores:
            qual_f = open(qual_fp_temp, 'w')
            # define a qual writer whether we're storing
            # qual strings or not so we don't have to check
            # every time through the for loop below

            def qual_writer(h, q):
                qual_f.write('>%s\n%s\n' % (h, q))
        else:
            def qual_writer(h, q):
                pass

        if store_demultiplexed_fastq:
            output_fastq_f = open(output_fastq_fp_temp, 'w')
            # define a fastq writer whether we're storing
            # qual strings or not so we don't have to check
            # every time through the for loop below

            def fastq_writer(h, s, q):
                output_fastq_f.write(format_fastq_record(h, s, q))
        else:
            def fastq_writer(h, s, q):
                pass

        log_f = open(log_fp, 'w')
        histogram_f = open(histogram_fp, 'w')

        # the three lists were verified above to have the same length
        for i, (sequence_read_fp, barcode_read_fp, mapping_fp) in enumerate(
                zip(sequence_read_fps, barcode_read_fps, mapping_fps)):
            mapping_f = sequence_read_f = barcode_read_f = None
            try:
                if mapping_fp is not None:
                    mapping_f = open(mapping_fp, 'U')
                    _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                        mapping_f,
                        disable_primer_check=True,
                        has_barcodes=barcode_read_fp is not None)
                else:
                    barcode_to_sample_id = {}

                if rev_comp_mapping_barcodes:
                    barcode_to_sample_id = {
                        str(DNA(k).rc()): v
                        for k, v in barcode_to_sample_id.iteritems()
                    }

                if barcode_type == 'golay_12':
                    invalid_golay_barcodes = get_invalid_golay_barcodes(
                        barcode_to_sample_id.keys())
                    if len(invalid_golay_barcodes) > 0:
                        option_parser.error(
                            "Some or all barcodes are not valid golay "
                            "codes. Do they need to be reverse complemented? If these "
                            "are not golay barcodes pass --barcode_type 12 to disable "
                            "barcode error correction, or pass --barcode_type # if "
                            "the barcodes are not 12 base pairs, where # is the size "
                            "of the barcodes. Invalid codes:\n\t%s" %
                            ' '.join(invalid_golay_barcodes))

                log_f.write("Input file paths\n")
                if mapping_fp is not None:
                    log_f.write('Mapping filepath: %s (md5: %s)\n' %
                                (mapping_fp, md5_hexdigest(mapping_fp)))
                log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                            (sequence_read_fp,
                             md5_hexdigest(sequence_read_fp)))

                sequence_read_f = open_reads(sequence_read_fp)

                # fall-back so start_seq_id still advances when the
                # generator below yields nothing for this file
                seq_id = start_seq_id

                if barcode_read_fp is not None:
                    log_f.write(
                        'Barcode read filepath: %s (md5: %s)\n\n' %
                        (barcode_read_fp, md5_hexdigest(barcode_read_fp)))
                    barcode_read_f = open_reads(barcode_read_fp)

                    seq_generator = process_fastq_single_end_read_file(
                        sequence_read_f,
                        barcode_read_f,
                        barcode_to_sample_id,
                        store_unassigned=retain_unassigned_reads,
                        max_bad_run_length=max_bad_run_length,
                        phred_quality_threshold=phred_quality_threshold,
                        min_per_read_length_fraction=min_per_read_length_fraction,
                        rev_comp=rev_comp,
                        rev_comp_barcode=rev_comp_barcode,
                        seq_max_N=seq_max_N,
                        start_seq_id=start_seq_id,
                        filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                        log_f=log_f,
                        histogram_f=histogram_f,
                        barcode_correction_fn=barcode_correction_fn,
                        max_barcode_errors=max_barcode_errors,
                        phred_offset=phred_offset)
                else:
                    seq_generator = process_fastq_single_end_read_file_no_barcode(
                        sequence_read_f,
                        sample_ids[i],
                        store_unassigned=retain_unassigned_reads,
                        max_bad_run_length=max_bad_run_length,
                        phred_quality_threshold=phred_quality_threshold,
                        min_per_read_length_fraction=min_per_read_length_fraction,
                        rev_comp=rev_comp,
                        seq_max_N=seq_max_N,
                        start_seq_id=start_seq_id,
                        filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
                        log_f=log_f,
                        histogram_f=histogram_f,
                        phred_offset=phred_offset)

                for fasta_header, sequence, quality, seq_id in seq_generator:
                    output_f.write('>%s\n%s\n' % (fasta_header, sequence))
                    qual_writer(fasta_header, quality)
                    fastq_writer(fasta_header, sequence, quality)
            finally:
                # release this iteration's input handles; with hundreds of
                # samples the leaked descriptors would otherwise pile up
                for handle in (mapping_f, sequence_read_f, barcode_read_f):
                    if handle is not None:
                        handle.close()

            # continue numbering after the last id used for this file
            start_seq_id = seq_id + 1
            log_f.write('\n---\n\n')
    finally:
        # flush and close every output, even when a failure aborts the run
        for handle in (output_f, qual_f, output_fastq_f, log_f, histogram_f):
            if handle is not None:
                handle.close()

    # only reached on success: promote the temporary outputs
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        rename(output_fastq_fp_temp, output_fastq_fp)