def _iter_demultiplex_records(file_data):
    """Yield (fasta_label, fasta_seq, qual_seq) for every input sequence.

    When file_data['qual_files'] is non-empty, each fasta file is read in
    lockstep with the qual file at the same position and qual_seq is the
    parsed quality data; otherwise only the fasta files are read and
    qual_seq is None.
    """
    if file_data['qual_files']:
        # NOTE(review): zip() pairs fasta and qual files positionally and
        # stops at the shorter list, so surplus files are silently ignored.
        for curr_fasta, curr_qual in zip(file_data['fasta_files'],
                                         file_data['qual_files']):
            paired_records = izip(
                MinimalFastaParser(curr_fasta),
                MinimalQualParser(curr_qual, full_header=True))
            for (fasta_label, fasta_seq), (_, qual_seq) in paired_records:
                yield fasta_label, fasta_seq, qual_seq
    else:
        for curr_fasta in file_data['fasta_files']:
            for fasta_label, fasta_seq in MinimalFastaParser(curr_fasta):
                yield fasta_label, fasta_seq, None


def assign_seqs(file_data,
                ids_bcs_added_field,
                bc_lens,
                all_bcs,
                keep_barcode=False,
                barcode_type="golay_12",
                max_bc_errors=1.5,
                start_index=1,
                write_unassigned_reads=False,
                disable_bc_correction=False,
                added_demultiplex_field=None):
    """ Demultiplexes, writes seqs/qual files, returns log data

    file_data:  dict of open file objects, contains input fasta, qual, and
     mapping files, and output filepaths for partially demultiplexed fasta
     and qual files, and unassigned sequence output file.  Keys read here:
     'fasta_files', 'qual_files', 'demultiplexed_seqs_f',
     'demultiplexed_qual_f', 'unassigned_seqs_f', 'unassigned_qual_f'.
    ids_bcs_added_field: dict of (barcode,added_demultiplex): SampleID
    bc_lens: Lengths of all barcodes from largest to smallest.
    all_bcs: List of all barcode sequences.
    keep_barcode: If True, will not remove barcode from output files.
    barcode_type: Specified barcode, can be golay_12, hamming_8,
     variable_length, or an integer specifying length.
    max_bc_errors: Number of changes allowed for error correcting barcodes,
     for generic barcodes, specifies the number of mismatches allowed.
    start_index: Specifies the first number used to enumerate output
     sequences.
    write_unassigned_reads: If True, will write sequences that could not be
     demultiplexed into a separate output file.
    disable_bc_correction: Only tests for exact matches to barcodes.
    added_demultiplex_field: Uses data supplied in metadata mapping field
     and demultiplexes according to data in fasta labels.

    Returns a tuple (log_data, bc_freqs, seq_counts, corrected_bc_count):
     log_data: counts per log id, as initialized by initialize_log_data.
     bc_freqs: defaultdict mapping each observed barcode to its count.
     seq_counts: total number of sequences read.
     corrected_bc_count: two-item list, [number of 'corrected' results,
      number of 'not_corrected' results].
    """
    log_data = initialize_log_data(ids_bcs_added_field)
    bc_freqs = defaultdict(int)

    seq_counts = 0
    enum_val = start_index
    corrected_bc_count = [0, 0]

    for fasta_label, fasta_seq, qual_seq in \
            _iter_demultiplex_records(file_data):
        seq_counts += 1

        bc, corrected_bc, num_errors, added_field = get_demultiplex_data(
            ids_bcs_added_field, fasta_label, fasta_seq, bc_lens, all_bcs,
            barcode_type, max_bc_errors, disable_bc_correction,
            added_demultiplex_field)

        bc_freqs[bc] += 1

        sample_id, log_id, bc_corrected_result = get_output_ids(
            ids_bcs_added_field, corrected_bc, num_errors, added_field,
            max_bc_errors, enum_val)

        if bc_corrected_result == 'corrected':
            corrected_bc_count[0] += 1
        elif bc_corrected_result == 'not_corrected':
            corrected_bc_count[1] += 1

        label_line = get_label_line(sample_id, fasta_label, bc,
                                    corrected_bc, num_errors)

        # Pick the output destination.  Unassigned reads always keep their
        # barcode; they are dropped unless write_unassigned_reads is set.
        if not sample_id.startswith("Unassigned"):
            seqs_key = 'demultiplexed_seqs_f'
            qual_key = 'demultiplexed_qual_f'
            retain_barcode = keep_barcode
        elif write_unassigned_reads:
            seqs_key = 'unassigned_seqs_f'
            qual_key = 'unassigned_qual_f'
            retain_barcode = True
        else:
            seqs_key = None

        if seqs_key is not None:
            write_fasta_line(file_data[seqs_key], fasta_seq, label_line,
                             retain_barcode, len(bc))
            # The qual output is only touched when qual input was supplied.
            if qual_seq is not None:
                write_qual_line(file_data[qual_key], list(qual_seq),
                                label_line, retain_barcode, len(bc))

        if log_id:
            log_data[log_id] += 1

        # Every record consumes an enumeration value, assigned or not.
        enum_val += 1

    return log_data, bc_freqs, seq_counts, corrected_bc_count
def _fastq_quality_string(qual, ascii_increment):
    '''Encode numeric quality scores as a single FASTQ quality string.

    Each score is shifted by ascii_increment and converted with chr().
    Raises ValueError if a shifted score falls outside 32..126.'''
    encoded = []
    for raw_score in qual:
        ascii_code = raw_score + ascii_increment
        if ascii_code < 32 or ascii_code > 126:
            raise ValueError(
                "Cannot convert quality score to ASCII code between 32 and "
                "126: %s using ascii_increment = %s" %
                (raw_score, ascii_increment))
        encoded.append(chr(ascii_code))
    return ''.join(encoded)


def convert_fastq(fasta_file_path,
                  qual_file_path,
                  output_directory='.',
                  multiple_output_files=False,
                  ascii_increment=33,
                  full_fastq=False,
                  full_fasta_headers=False,
                  per_file_buffer_size=100000):
    '''Takes a FASTA and QUAL file, generates FASTQ file(s)

    fasta_file_path:  filepath of input FASTA file.
    qual_file_path:  filepath of input QUAL file (needed for making FASTQ
     files)
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.  The SampleID is the
     part of the sequence label before the first underscore.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking
     at first whitespace.
    per_file_buffer_size:  In multiple output mode, number of buffered
     characters per output file that triggers a write to disk.

    Output files are overwritten: the first write to a path during a call
    truncates it, later writes to the same path append.

    Raises KeyError if a QUAL header does not match the FASTA label or the
    sequence and quality lengths differ, and ValueError if a quality score
    cannot be represented as an ASCII code between 32 and 126.'''
    # Multiple-output mode buffers records per output path and opens each
    # file only while flushing, so the number of simultaneously open file
    # handles stays constant no matter how many samples there are.
    pending_records = defaultdict(list)
    pending_chars = defaultdict(int)
    started_paths = set()

    def flush(out_path):
        # Truncate on the first flush so stale data from an earlier run is
        # not kept, append afterwards.
        mode = 'a' if out_path in started_paths else 'w'
        started_paths.add(out_path)
        out_f = open(out_path, mode)
        try:
            out_f.write(''.join(pending_records[out_path]))
        finally:
            out_f.close()
        pending_records[out_path] = []
        pending_chars[out_path] = 0

    fasta_file = open(fasta_file_path, 'U')
    qual_file = None
    fastq_file = None
    try:
        qual_file = open(qual_file_path, 'U')

        # if we're not using multiple output files, we can open the one (and
        # only) output file right now
        if not multiple_output_files:
            fastq_file = open(
                get_filename_with_new_ext(fasta_file_path, '.fastq',
                                          output_directory), 'w')

        # iterate through the FASTA and QUAL files entry by entry (assume the
        # entries are synchronized)
        for fasta_data, qual_data in izip(MinimalFastaParser(fasta_file),
                                          MinimalQualParser(qual_file)):
            fasta_header, sequence = fasta_data[0], fasta_data[1]
            qual_header, qual = qual_data[0], qual_data[1]

            label = fasta_header.split()[0]
            sample_id = label.split('_')[0]

            # check whether the entries are actually (at least nominally)
            # synch'd
            if qual_header != label:
                raise KeyError("QUAL header (%s) does not match "
                               "FASTA header (%s)" % (qual_header, label))
            if len(sequence) != len(qual):
                raise KeyError("Sequence length does not match QUAL length "
                               "for label (%s)" % label)

            if full_fasta_headers:
                fastq_sequence_header = fasta_header
            else:
                fastq_sequence_header = label
            if full_fastq:
                fastq_quality_header = fastq_sequence_header
            else:
                fastq_quality_header = ''

            # Build the whole record before writing anything, so a bad
            # quality score never leaves a half-written record behind.
            record = '@%s\n%s\n+%s\n%s\n' % (
                fastq_sequence_header, sequence, fastq_quality_header,
                _fastq_quality_string(qual, ascii_increment))

            if multiple_output_files:
                output_file_path = get_filename_with_new_ext(
                    fasta_file_path, '_' + sample_id + '.fastq',
                    output_directory)
                pending_records[output_file_path].append(record)
                pending_chars[output_file_path] += len(record)
                if pending_chars[output_file_path] >= per_file_buffer_size:
                    flush(output_file_path)
            else:
                fastq_file.write(record)

        # write the last buffered seqs to their output files
        for output_file_path in list(pending_records):
            if pending_records[output_file_path]:
                flush(output_file_path)
    finally:
        for handle in (fastq_file, qual_file, fasta_file):
            if handle is not None:
                handle.close()
def convert_fastq(fasta_file_path, qual_file_path, output_directory='.',
                  multiple_output_files=False, ascii_increment=33,
                  full_fastq=False, full_fasta_headers=False):
    '''Takes a FASTA and QUAL file, generates FASTQ file(s)

    fasta_file_path:  filepath of input FASTA file.
    qual_file_path:  filepath of input QUAL file (needed for making FASTQ
     files)
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.  The SampleID is the
     part of the sequence label before the first underscore.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking
     at first whitespace.

    An output file is created (truncating any existing file) the first time
    a record is written to it and appended to afterwards; no output file is
    created when the input holds no records.

    Raises KeyError if the FASTA and QUAL headers do not match or the number
    of quality scores differs from the sequence length, ValueError if a
    quality score cannot be represented as an ASCII code between 32 and
    126, and IOError if an output file cannot be opened.'''
    output_base = path.splitext(path.split(fasta_file_path)[1])[0]

    # Need to open a file the first time as "w", thereafter open as "a"
    started_paths = set()
    # In single-output mode the one output file stays open for the whole
    # run instead of being reopened for every record.
    single_fastq_file = None

    fasta_file = open(fasta_file_path, 'U')
    qual_file = None
    try:
        qual_file = open(qual_file_path, 'U')

        for fasta_data, qual_data in izip(MinimalFastaParser(fasta_file),
                                          MinimalQualParser(qual_file)):
            fasta_header, sequence = fasta_data[0], fasta_data[1]
            qual_header, qual = qual_data[0], qual_data[1]

            label = fasta_header.split()[0]
            sample_id = label.split('_')[0]

            if qual_header != label:
                raise KeyError("Fasta(%s) and qual(%s) headers don't match" %
                               (label, qual_header))

            if len(qual) != len(sequence):
                raise KeyError(
                    "Number of quality scores (%d) does not match number of "
                    "positions (%d) for label: %s" %
                    (len(qual), len(sequence), label))

            if full_fasta_headers:
                fastq_sequence_header = fasta_header
            else:
                fastq_sequence_header = label
            if full_fastq:
                fastq_quality_header = fastq_sequence_header
            else:
                fastq_quality_header = ''

            # increment each qual score by ascii_increment (default 33) and
            # take the corresponding character, which represents that
            # position's quality.  Validating before any output is opened
            # means a bad score never leaves a half-written record.
            quality_chars = []
            for raw_score in qual:
                ascii_code = raw_score + ascii_increment
                if ascii_code < 32 or ascii_code > 126:
                    raise ValueError(
                        "Cannot convert quality score to ASCII code between "
                        "32 and 126: %s using ascii_increment = %s" %
                        (raw_score, ascii_increment))
                quality_chars.append(chr(ascii_code))

            record = '@%s\n%s\n+%s\n%s\n' % (
                fastq_sequence_header, sequence, fastq_quality_header,
                ''.join(quality_chars))

            if multiple_output_files:
                output_file_path = path.join(
                    output_directory,
                    output_base + '_' + sample_id + '.fastq')
                # Create new file if first time writing, else append
                mode = 'a' if output_file_path in started_paths else 'w'
                try:
                    fastq_file = open(output_file_path, mode)
                except IOError:
                    raise IOError("Could not open FASTQ file for writing: " +
                                  output_file_path + '\n')
                started_paths.add(output_file_path)
                # Close after every record so the number of samples cannot
                # exhaust the available file handles.
                try:
                    fastq_file.write(record)
                finally:
                    fastq_file.close()
            else:
                if single_fastq_file is None:
                    output_file_path = path.join(output_directory,
                                                 output_base + '.fastq')
                    try:
                        single_fastq_file = open(output_file_path, 'w')
                    except IOError:
                        raise IOError(
                            "Could not open FASTQ file for writing: " +
                            output_file_path + '\n')
                single_fastq_file.write(record)
    finally:
        for handle in (single_fastq_file, qual_file, fasta_file):
            if handle is not None:
                handle.close()