def test_parse_error(self):
    """Malformed input must make parse_fastq raise FastqParseError."""
    # Strict parsing of FASTQ_EXAMPLE_2 is expected to fail.
    with self.assertRaises(FastqParseError):
        for _record in parse_fastq(self.FASTQ_EXAMPLE_2, strict=True):
            pass

    # FASTQ_EXAMPLE_3 is expected to fail when read with a PHRED offset of 64.
    with self.assertRaises(FastqParseError):
        for _record in parse_fastq(self.FASTQ_EXAMPLE_3, phred_offset=64):
            pass
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Write a new index file based on surviving assembled paired-ends.

    -joined_fp : file path to paired-end assembled fastq file
    -index_fp : file path to index / barcode reads fastq file

    Iterates through the joined-reads file and the index file in lockstep.
    Only index reads whose header matches a header in the joined-reads file
    are written, to ``<joined_fp without extension>_barcodes.fastq``.

    Returns the path of the written barcode file.

    Raises StopIteration (with an explanatory message) when the index file is
    exhausted before every joined read has been matched.

    WARNING: Assumes reads are in the same order in both files,
             except for cases in which the corresponding
             read in the joined_fp file is missing (i.e. pairs
             failed to assemble).
    """
    # base new index file name on joined paired-end file name:
    filtered_bc_outfile_path = \
        os.path.splitext(joined_fp)[0] + '_barcodes.fastq'

    jh = ih = fbc_fh = None
    try:
        # open files (handles normal / gzipped data)
        jh = qiime_open(joined_fp)
        ih = qiime_open(index_fp)
        fbc_fh = open(filtered_bc_outfile_path, 'w')

        # Set up iterators
        index_fastq_iter = parse_fastq(ih, strict=False)
        joined_fastq_iter = parse_fastq(jh, strict=False)

        # Write barcodes / index reads that we observed within
        # the joined paired-ends.
        for joined_label, _seq, _qual in joined_fastq_iter:
            # Skip index reads until one matches the current joined read.
            # Every advance is guarded, so running out of index reads always
            # yields the descriptive error below.
            while True:
                try:
                    index_label, index_seq, index_qual = \
                        next(index_fastq_iter)
                except StopIteration:
                    # StopIteration is kept as the exception type because
                    # existing callers may rely on it.
                    raise StopIteration(
                        "\n\nReached end of index-reads file"
                        " before iterating through joined paired-end-reads"
                        " file!"
                        " Except for missing paired-end reads that did not"
                        " survive"
                        " assembly, your index and paired-end reads files"
                        " must be in"
                        " the same order! Also, check that the index-reads"
                        " and"
                        " paired-end reads have identical headers. The last"
                        " joined"
                        " paired-end ID processed was:\n\'%s\'\n"
                        % (joined_label))
                if index_label == joined_label:
                    break

            fbc_fh.write('@%s\n%s\n+\n%s\n'
                         % (index_label, index_seq, index_qual))
    finally:
        # Release every handle that was opened, even when an error is raised.
        for handle in (ih, jh, fbc_fh):
            if handle is not None:
                handle.close()

    return filtered_bc_outfile_path
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Write a new index file based on surviving assembled paired-ends.

    -joined_fp : file path to paired-end assembled fastq file
    -index_fp : file path to index / barcode reads fastq file

    Iterates through the joined-reads file and the index file in lockstep.
    Only index reads whose header matches a header in the joined-reads file
    are written, to ``<joined_fp without extension>_barcodes.fastq``.

    Returns the path of the written barcode file.

    Raises StopIteration (with an explanatory message) when the index file is
    exhausted before every joined read has been matched.

    WARNING: Assumes reads are in the same order in both files,
             except for cases in which the corresponding
             read in the joined_fp file is missing (i.e. pairs
             failed to assemble).
    """
    # base new index file name on joined paired-end file name:
    filtered_bc_outfile_path = \
        os.path.splitext(joined_fp)[0] + '_barcodes.fastq'

    jh = ih = fbc_fh = None
    try:
        # open files (handles normal / gzipped data)
        jh = qiime_open(joined_fp)
        ih = qiime_open(index_fp)
        fbc_fh = open(filtered_bc_outfile_path, 'w')

        # Set up iterators
        index_fastq_iter = parse_fastq(ih, strict=False)
        joined_fastq_iter = parse_fastq(jh, strict=False)

        # Write barcodes / index reads that we observed within
        # the joined paired-ends.
        for joined_label, _seq, _qual in joined_fastq_iter:
            # Skip index reads until one matches the current joined read.
            # Every advance is guarded, so running out of index reads always
            # yields the descriptive error below.
            while True:
                try:
                    index_label, index_seq, index_qual = \
                        next(index_fastq_iter)
                except StopIteration:
                    # StopIteration is kept as the exception type because
                    # existing callers may rely on it.
                    raise StopIteration(
                        "\n\nReached end of index-reads file"
                        " before iterating through joined paired-end-reads"
                        " file!"
                        " Except for missing paired-end reads that did not"
                        " survive"
                        " assembly, your index and paired-end reads files"
                        " must be in"
                        " the same order! Also, check that the index-reads"
                        " and"
                        " paired-end reads have identical headers. The last"
                        " joined"
                        " paired-end ID processed was:\n\'%s\'\n"
                        % (joined_label))
                if index_label == joined_label:
                    break

            fbc_fh.write('@%s\n%s\n+\n%s\n'
                         % (index_label, index_seq, index_qual))
    finally:
        # Release every handle that was opened, even when an error is raised.
        for handle in (ih, jh, fbc_fh):
            if handle is not None:
                handle.close()

    return filtered_bc_outfile_path
def test_parse(self):
    """Parsed labels, sequences and qualities must agree with DATA."""
    for record in parse_fastq(self.FASTQ_EXAMPLE, phred_offset=64):
        label, seq, qual = record
        self.assertTrue(label in DATA)
        expected = DATA[label]
        self.assertEqual(seq, expected["seq"])
        # Element-wise comparison; every quality score has to match.
        quals_match = (qual == expected["qual"]).all()
        self.assertTrue(quals_match)
def extract_reads_from_interleaved(
        input_fp, forward_id, reverse_id, output_dir):
    """Split an interleaved fastq file into forward and reverse read files.

    Writes ``forward_reads.fastq`` and ``reverse_reads.fastq`` into
    output_dir. A record goes to the forward file when its label contains
    forward_id only, and to the reverse file when it contains reverse_id
    only.

    input_fp: file path to the interleaved fastq input
    forward_id: substring identifying forward reads in the label
    reverse_id: substring identifying reverse reads in the label
    output_dir: file path to the output folder

    Raises ValueError when a label contains neither identifier or both.
    """
    forward_fp = join(output_dir, "forward_reads.fastq")
    reverse_fp = join(output_dir, "reverse_reads.fastq")

    # The context manager / finally pair guarantees all three handles are
    # released, including when a record is rejected or parsing fails.
    with open(forward_fp, 'w') as ffp, open(reverse_fp, 'w') as rfp:
        input_f = qiime_open(input_fp)
        try:
            for label, seq, qual in parse_fastq(input_f, strict=False):
                fastq_string = format_fastq_record(label, seq, qual)
                is_forward = forward_id in label
                is_reverse = reverse_id in label
                # Exactly one identifier must be present. A label carrying
                # both used to be written silently as a forward read.
                if is_forward == is_reverse:
                    raise ValueError(
                        "One of the input sequences doesn't have either "
                        "identifier "
                        "or it has both.\nLabel: %s\nForward: %s\n "
                        "Reverse: %s" % (label, forward_id, reverse_id))
                if is_forward:
                    ffp.write(fastq_string)
                else:
                    rfp.write(fastq_string)
        finally:
            input_f.close()
def extract_reads_from_interleaved(
        input_fp, forward_id, reverse_id, output_dir):
    """Split an interleaved fastq file into forward and reverse read files.

    Writes ``forward_reads.fastq`` and ``reverse_reads.fastq`` into
    output_dir. A record goes to the forward file when its label contains
    forward_id only, and to the reverse file when it contains reverse_id
    only. Quality scores are parsed with enforce_qual_range=False.

    input_fp: file path to the interleaved fastq input
    forward_id: substring identifying forward reads in the label
    reverse_id: substring identifying reverse reads in the label
    output_dir: file path to the output folder

    Raises ValueError when a label contains neither identifier or both.
    """
    forward_fp = join(output_dir, "forward_reads.fastq")
    reverse_fp = join(output_dir, "reverse_reads.fastq")

    # The context manager / finally pair guarantees all three handles are
    # released, including when a record is rejected or parsing fails.
    with open(forward_fp, 'w') as ffp, open(reverse_fp, 'w') as rfp:
        input_f = qiime_open(input_fp)
        try:
            for label, seq, qual in parse_fastq(input_f, strict=False,
                                                enforce_qual_range=False):
                fastq_string = format_fastq_record(label, seq, qual)
                is_forward = forward_id in label
                is_reverse = reverse_id in label
                # Exactly one identifier must be present. A label carrying
                # both used to be written silently as a forward read.
                if is_forward == is_reverse:
                    raise ValueError(
                        "One of the input sequences doesn't have either "
                        "identifier "
                        "or it has both.\nLabel: %s\nForward: %s\n "
                        "Reverse: %s" % (label, forward_id, reverse_id))
                if is_forward:
                    ffp.write(fastq_string)
                else:
                    rfp.write(fastq_string)
        finally:
            input_f.close()
def filter_fastq(input_seqs_f, output_seqs_f, seqs_to_keep, negate=False,
                 seqid_f=None):
    """Write the records of input_seqs_f selected by id to output_seqs_f.

    input_seqs_f is handed to parse_fastq (enforce_qual_range=False).
    Without seqid_f, a record is selected when the first whitespace-delimited
    token of its id is among the first tokens of seqs_to_keep. With seqid_f,
    that callable decides from the full id. negate inverts the selection.
    output_seqs_f is closed before returning.
    """
    if seqid_f is not None:
        # Caller-supplied predicate, optionally inverted.
        if negate:
            def keep_seq(seq_id):
                return not seqid_f(seq_id)
        else:
            keep_seq = seqid_f
    else:
        wanted_ids = set(entry.split()[0] for entry in seqs_to_keep)
        # Pick the membership test once, based on the value of negate.
        if negate:
            def keep_seq(seq_id):
                return seq_id.split()[0] not in wanted_ids
        else:
            def keep_seq(seq_id):
                return seq_id.split()[0] in wanted_ids

    records = parse_fastq(input_seqs_f, enforce_qual_range=False)
    for seq_id, seq, qual in records:
        if not keep_seq(seq_id):
            continue
        output_seqs_f.write(format_fastq_record(seq_id, seq, qual))
    output_seqs_f.close()
def filter_fastq(input_seqs_f, output_seqs_f, seqs_to_keep, negate=False,
                 seqid_f=None):
    """Write the records of input_seqs_f selected by id to output_seqs_f.

    input_seqs_f is handed to parse_fastq (enforce_qual_range=False).
    Without seqid_f, a record is selected when the first whitespace-delimited
    token of its id is among the first tokens of seqs_to_keep. With seqid_f,
    that callable decides from the full id. negate inverts the selection.
    output_seqs_f is closed before returning.
    """
    if seqid_f is not None:
        # Caller-supplied predicate, optionally inverted.
        if negate:
            def keep_seq(seq_id):
                return not seqid_f(seq_id)
        else:
            keep_seq = seqid_f
    else:
        wanted_ids = set(entry.split()[0] for entry in seqs_to_keep)
        # Pick the membership test once, based on the value of negate.
        if negate:
            def keep_seq(seq_id):
                return seq_id.split()[0] not in wanted_ids
        else:
            def keep_seq(seq_id):
                return seq_id.split()[0] in wanted_ids

    records = parse_fastq(input_seqs_f, enforce_qual_range=False)
    for seq_id, seq, qual in records:
        if not keep_seq(seq_id):
            continue
        output_seqs_f.write(format_fastq_record(seq_id, seq, qual))
    output_seqs_f.close()
def split_by_index(read1, read2, barcodes, bc_pos=(26, 6),
                   output_dir='../data'):
    '''
    Splits read pairs given in `read1` and `read2` according to the list of
    barcodes given in `barcodes`.

    The barcode is read from the sequence of the first read; its position and
    length are given by `bc_pos` as a (start, length) tuple. For every barcode
    seen, the pair is appended to `<barcode>_R1.fastq.gz` /
    `<barcode>_R2.fastq.gz` inside `output_dir` (default `../data`, the
    location that used to be hard-coded). Progress and a final summary are
    printed to stdout.
    '''
    # Read name line MUST start with @
    fastq_tpl = '@{id}\n{seq}\n+\n{q}\n'
    istart, ilen = bc_pos
    iend = istart + ilen

    output_files = {}
    opened = []  # every handle ever opened, so all can be closed on exit
    cnt = 0
    assigned = 0
    try:
        for rec1, rec2 in izip(parse_fastq(read1), parse_fastq(read2)):
            id1, seq1, q1 = rec1
            id2, seq2, q2 = rec2
            cnt += 1
            if cnt % 1000000 == 0:
                print('Processed\t %d records...' % cnt)
                sys.stdout.flush()

            ind = seq1[istart:iend]
            # It's an exact match for now but we really need to accomodate
            # mismatches here. Hamming distance?
            if ind not in barcodes:
                continue

            assigned += 1
            # Quality values are written back as PHRED+33 characters.
            qstr1 = ''.join([chr(val + 33) for val in q1])
            qstr2 = ''.join([chr(val + 33) for val in q2])
            if ind not in output_files:
                r1 = gzip.open('%s/%s_R1.fastq.gz' % (output_dir, ind), 'wb')
                opened.append(r1)
                r2 = gzip.open('%s/%s_R2.fastq.gz' % (output_dir, ind), 'wb')
                opened.append(r2)
                print('...created output files for: %s' % ind)
                sys.stdout.flush()
                output_files[ind] = (r1, r2)
            out1, out2 = output_files[ind]
            out1.write(fastq_tpl.format(id=id1, seq=seq1, q=qstr1))
            out2.write(fastq_tpl.format(id=id2, seq=seq2, q=qstr2))

        print(output_files.keys())
        print('Assigned:\t%d sequences' % assigned)
    finally:
        # Close outputs even if parsing or writing fails part-way through.
        for handle in opened:
            handle.close()
def test_parse(self):
    """Parsed records must match the expected data for each PHRED setup."""
    def check_records(records, expected_by_label):
        # Compare every parsed record against its expected entry.
        for label, seq, qual in records:
            self.assertTrue(label in expected_by_label)
            expected = expected_by_label[label]
            self.assertEqual(seq, expected["seq"])
            self.assertTrue((qual == expected["qual"]).all())

    check_records(parse_fastq(self.FASTQ_EXAMPLE, phred_offset=64), DATA)

    # Make sure that enforce_qual_range set to False allows qual scores
    # to fall outside the typically acceptable range of 0-62
    check_records(
        parse_fastq(self.FASTQ_EXAMPLE_2, phred_offset=33,
                    enforce_qual_range=False),
        DATA_2)

    # This should raise a FastqParseError since the qual scores are
    # intended to be interpreted with an offset of 64, and using 33 will
    # make the qual score fall outside the acceptable range of 0-62.
    with self.assertRaises(FastqParseError):
        for _record in parse_fastq(self.FASTQ_EXAMPLE, phred_offset=33):
            pass
def fetch_study(study_accession, base_dir):
    """Fetch and dump every sample of a study that is not yet on disk

    For each (sample, fastq_url) pair reported by fetch_study_details, a
    directory <base_dir>/<study_accession>/<sample> is created holding
    <sample>.fna (sequences as FASTA) and <sample>.txt (the sample's ENA XML
    record). Samples whose directory already exists are skipped.

    Parameters
    ----------
    study_accession : str
        Accession ID for the study
    base_dir : str
        Path of base directory to save the fetched results

    Returns
    -------
    int
        Number of samples fetched by this call; 0 in the test environment,
        where nothing is fetched.
    """
    if ag.is_test_env():
        return 0

    study_dir = os.path.join(base_dir, study_accession)

    if ag.staged_raw_data() is None:
        if not os.path.exists(study_dir):
            os.mkdir(study_dir)
    else:
        # Pre-staged raw data is linked in place of a fresh study directory.
        os.symlink(ag.staged_raw_data(), study_dir)

    xml_url_template = \
        "http://www.ebi.ac.uk/ena/data/view/%(accession)s&display=xml"

    fetched = 0
    for sample, fastq_url in fetch_study_details(study_accession):
        sample_dir = os.path.join(study_dir, sample)
        if os.path.exists(sample_dir):
            # already present, nothing to fetch
            continue

        os.mkdir(sample_dir)
        metadata_path = os.path.join(sample_dir, '%s.txt' % sample)
        fasta_path = os.path.join(sample_dir, '%s.fna' % sample)

        # write out fasta
        with open(fasta_path, 'w') as fasta_out:
            for id_, seq, qual in parse_fastq(fetch_seqs_fastq(fastq_url)):
                fasta_out.write(">%s\n%s\n" % (id_, seq))

        # write mapping xml
        response = fetch_url(xml_url_template % {'accession': sample})
        with open(metadata_path, 'w') as md_f:
            md_f.write(response.read())

        fetched += 1

    return fetched
def test_parse(self):
    """sequence and info objects should correctly match"""
    def check_records(records, expected_by_label):
        # Compare every parsed record against its expected entry.
        for label, seq, qual in records:
            self.assertTrue(label in expected_by_label)
            expected = expected_by_label[label]
            self.assertEqual(seq, expected["seq"])
            self.assertTrue((qual == expected["qual"]).all())

    check_records(parse_fastq(self.FASTQ_EXAMPLE, phred_offset=64), DATA)

    # Make sure that enforce_qual_range set to False allows qual scores
    # to fall outside the typically acceptable range of 0-62
    check_records(
        parse_fastq(self.FASTQ_EXAMPLE_2, phred_offset=33,
                    enforce_qual_range=False),
        DATA_2)

    # This should raise a FastqParseError since the qual scores are
    # intended to be interpreted with an offset of 64, and using 33 will
    # make the qual score fall outside the acceptable range of 0-62.
    with self.assertRaises(FastqParseError):
        for _record in parse_fastq(self.FASTQ_EXAMPLE, phred_offset=33):
            pass
def parse_fastq_qual_score(fastq_lines):
    """Map each fastq header to its quality scores as an array.

    fastq_lines must support readline() and seek(): the first header line is
    read to choose between ascii_to_phred33 and ascii_to_phred64 (via
    is_casava_v180_or_later), then the input is rewound and parsed with
    parse_fastq. If a header occurs more than once, the last record wins.
    """
    # Peek at the first header, then rewind so parsing starts at the top.
    leading_header = fastq_lines.readline()
    fastq_lines.seek(0)

    # NOTE(review): the selected object is passed to asarray as dtype;
    # presumably it is a numpy dtype despite its function-like name -- confirm.
    qual_dtype = (ascii_to_phred33
                  if is_casava_v180_or_later(leading_header)
                  else ascii_to_phred64)

    return dict((header, asarray(qual, dtype=qual_dtype))
                for header, seq, qual in parse_fastq(fastq_lines))
def remove_primers(input_fastq, output_fastq, for_primers, rev_primers,
                   ed_tol):
    """Write reads trimmed to the region between a forward/reverse primer.

    input_fastq: path of the fastq file to read
    output_fastq: path of the fastq file to write
    for_primers, rev_primers: parallel sequences; primers are tried in pairs
    ed_tol: tolerance handed to editSearchForward / editSearchReverse

    For every read and every primer pair, the slice positions come from
    editSearchForward and editSearchReverse. When both differ from -1, the
    sequence and quality sliced to [start:end] are written as one fastq
    record. A read matching several primer pairs is written once per pair.
    """
    with open(input_fastq) as read, open(output_fastq, "w") as out_seqs:
        for label, seq, qual in parse_fastq(read):
            for primerF, primerR in zip(for_primers, rev_primers):
                start_slice = editSearchForward(primerF, seq, ed_tol)
                end_slice = editSearchReverse(primerR, seq, ed_tol)
                # -1 marks "not found"; both primers are required.
                if start_slice == -1 or end_slice == -1:
                    continue
                formatted_fastq_line = format_fastq_record(
                    label,
                    seq[start_slice:end_slice],
                    qual[start_slice:end_slice])
                out_seqs.write("%s" % (formatted_fastq_line))
def split_pools(barcode, dirname='../data'):
    '''
    Splits the reads in R2 file of the sample specified by `barcode` into
    `plus` and `minus` pools.

    Files directly inside `dirname` whose name starts with `barcode` are
    grouped into R1 and R2 lists (by 'R1' / 'R2' in the name) and processed
    pairwise. An R2 record goes to `<barcode>_minus.fastq.gz` when its R1
    sequence starts with [CT][CT][CT][AG], to `<barcode>_plus.fastq.gz` when
    it starts with [AG][AG][AG][CT], and is dropped otherwise.
    '''
    _, _, filenames = next(os.walk(dirname))
    # os.walk yields names in arbitrary order; sort so the i-th R1 file is
    # reliably paired with the i-th R2 file.
    files = sorted(f for f in filenames if f.startswith(barcode))
    files_R1 = [os.path.join(dirname, f) for f in files if 'R1' in f]
    files_R2 = [os.path.join(dirname, f) for f in files if 'R2' in f]

    fastq_tpl = '@{id}\n{seq}\n+\n{qual}\n'
    minus_re = re.compile('^[CT][CT][CT][AG]')
    plus_re = re.compile('^[AG][AG][AG][CT]')
    minus_path = os.path.join(dirname, barcode + '_minus.fastq.gz')
    plus_path = os.path.join(dirname, barcode + '_plus.fastq.gz')

    for file_R1, file_R2 in zip(files_R1, files_R2):
        print("Processing files:\t{f1}\t{f2}".format(f1=file_R1, f2=file_R2))
        sys.stdout.flush()
        cnt = 0
        cnt_plus = 0
        cnt_minus = 0
        # NOTE(review): the outputs are reopened in 'wb' mode for every file
        # pair, so with several pairs only the last pair's records remain --
        # confirm whether that is intended.
        with gzip.open(file_R1, 'rb') as gzr1, \
                gzip.open(file_R2, 'rb') as gzr2, \
                gzip.open(minus_path, 'wb') as gz_minus, \
                gzip.open(plus_path, 'wb') as gz_plus:
            for rec1, rec2 in izip(parse_fastq(gzr1), parse_fastq(gzr2)):
                cnt += 1
                if cnt % 1000000 == 0:
                    print("\t\t{0}\trecords...".format(cnt))
                    sys.stdout.flush()
                id1, seq1, qual1 = rec1
                id2, seq2, qual2 = rec2
                # Quality values are written back as PHRED+33 characters.
                qual_str = ''.join([chr(33 + q) for q in qual2])
                if minus_re.match(seq1):
                    gz_minus.write(
                        fastq_tpl.format(id=id2, seq=seq2, qual=qual_str))
                    cnt_minus += 1
                elif plus_re.match(seq1):
                    gz_plus.write(
                        fastq_tpl.format(id=id2, seq=seq2, qual=qual_str))
                    cnt_plus += 1
        print("{0}\tplus records\t{1}\tminus records".format(cnt_plus,
                                                             cnt_minus))
        sys.stdout.flush()
def remove_primers(input_fastq, output_fastq, primers):
    """Write reads with everything up to the end of a primer match removed.

    input_fastq: path of the fastq file to read
    output_fastq: path of the fastq file to write
    primers: compiled regular expression; its first match in a read marks
        the primer

    Only reads in which `primers` matches, with the match ending past
    position 0, are written; sequence and quality are trimmed to start at
    the end of the match. All other reads are dropped.
    """
    with open(input_fastq) as read, open(output_fastq, "w") as out_seqs:
        for label, seq, qual in parse_fastq(read):
            # Search once per read and reuse the match object.
            match = primers.search(seq)
            if match is None:
                continue
            start_slice = int(match.end())
            if start_slice <= 0:
                # An empty match at position 0 trims nothing; such reads are
                # not written.
                continue
            formatted_fastq_line = format_fastq_record(
                label, seq[start_slice:], qual[start_slice:])
            out_seqs.write("%s" % (formatted_fastq_line))
def fetch_study(accession, base_dir):
    """Fetch and dump a full study

    Writes <base_dir>/<accession>.fna (all sequences of all samples, as
    FASTA) and <base_dir>/<accession>.txt (a tab-separated metadata table
    with one row per sample). Returns None immediately when both files
    already exist.

    Fetching the sequences of a sample is best effort: if it fails, the
    sample keeps its metadata row and processing moves on to the next one.
    """
    metadata_path = os.path.join(base_dir, '%s.txt' % accession)
    fasta_path = os.path.join(base_dir, '%s.fna' % accession)

    if os.path.exists(fasta_path) and os.path.exists(metadata_path):
        # it appears we already have the accession, so short circuit
        return

    all_md = {}
    all_cols = set(['BarcodeSequence', 'LinkerPrimerSequence'])

    # `with` closes both outputs even when an error escapes.
    with open(metadata_path, 'w') as md_f, open(fasta_path, 'w') as fasta_f:
        for sample, fastq_url in fetch_study_details(accession):
            # in the form seqs_000007123.1075697.fastq.gz
            # and unfortunately, the suffix (1075697) is missing and parts of
            # the current results processing depend on the suffix.
            fastq_filename = fastq_url.rsplit('/')[-1]
            qiimedb_samplename = \
                fastq_filename.split('_')[-1].rsplit('.', 2)[0]

            md = fetch_metadata_xml(sample)
            all_md[qiimedb_samplename] = md
            all_cols.update(md)

            # write out fasta
            try:
                for id_, seq, qual in parse_fastq(
                        fetch_seqs_fastq(fastq_url)):
                    fasta_f.write(">%s\n%s\n" % (id_, seq))
            except Exception:
                # Deliberate best effort: skip samples whose sequences cannot
                # be fetched or parsed. Unlike a bare `except:`, this lets
                # KeyboardInterrupt and SystemExit propagate. Records written
                # before the failure stay in the FASTA file.
                continue

        header = list(all_cols)
        md_f.write('#SampleID\t')
        md_f.write('\t'.join(header))
        md_f.write('\n')
        for sampleid, values in all_md.iteritems():
            to_write = [values.get(k, "no_data").encode('utf-8')
                        for k in header]
            to_write.insert(0, sampleid)
            md_f.write('\t'.join(to_write))
            md_f.write('\n')
def fetch_study(accession, metadata_path, fasta_path):
    """Fetch and dump a full study

    Writes all sequences of all samples to fasta_path (as FASTA) and a
    tab-separated metadata table, one row per sample, to metadata_path.

    Fetching the sequences of a sample is best effort: if it fails, the
    sample keeps its metadata row and processing moves on to the next one.
    """
    all_md = {}
    all_cols = set(['BarcodeSequence', 'LinkerPrimerSequence'])

    # `with` closes both outputs even when an error escapes.
    with open(metadata_path, 'w') as md_f, open(fasta_path, 'w') as fasta_f:
        for sample, fastq_url in fetch_study_details(accession):
            # in the form seqs_000007123.1075697.fastq.gz
            # and unfortunately, the suffix (1075697) is missing and parts of
            # the current results processing depend on the suffix.
            fastq_filename = fastq_url.rsplit('/')[-1]
            qiimedb_samplename = \
                fastq_filename.split('_')[-1].rsplit('.', 2)[0]

            md = fetch_metadata_xml(sample)
            all_md[qiimedb_samplename] = md
            all_cols.update(md)

            # write out fasta
            try:
                for id_, seq, qual in parse_fastq(
                        fetch_seqs_fastq(fastq_url)):
                    fasta_f.write(">%s\n%s\n" % (id_, seq))
            except Exception:
                # Deliberate best effort: skip samples whose sequences cannot
                # be fetched or parsed. Unlike a bare `except:`, this lets
                # KeyboardInterrupt and SystemExit propagate. Records written
                # before the failure stay in the FASTA file.
                continue

        header = list(all_cols)
        md_f.write('#SampleID\t')
        md_f.write('\t'.join(header))
        md_f.write('\n')
        for sampleid, values in all_md.iteritems():
            to_write = [values.get(k, "no_data") for k in header]
            to_write.insert(0, sampleid)
            md_f.write('\t'.join(to_write))
            md_f.write('\n')
def fetch_study(accession, metadata_path, fasta_path):
    """Fetch and dump a full study

    Writes all sequences of all samples to fasta_path (as FASTA) and a
    tab-separated metadata table, one row per sample, to metadata_path.
    Any error while fetching or parsing propagates to the caller.
    """
    all_md = {}
    all_cols = set([])

    # `with` closes both outputs even when fetching or parsing raises, and
    # the path parameter is no longer rebound to a file object.
    with open(metadata_path, 'w') as md_f, open(fasta_path, 'w') as fasta_f:
        for sample, fastq_url in fetch_study_details(accession):
            # in the form seqs_000007123.1075697.fastq.gz
            # and unfortunately, the suffix (1075697) is missing and parts of
            # the current results processing depend on the suffix.
            fastq_filename = fastq_url.rsplit('/')[-1]
            qiimedb_samplename = \
                fastq_filename.split('_')[-1].rsplit('.', 2)[0]

            md = fetch_metadata_xml(sample)
            all_md[qiimedb_samplename] = md
            all_cols.update(md)

            # write out fasta
            for id_, seq, qual in parse_fastq(fetch_seqs_fastq(fastq_url)):
                fasta_f.write(">%s\n%s\n" % (id_, seq))

        header = list(all_cols)
        md_f.write('#SampleID\t')
        md_f.write('\t'.join(header))
        md_f.write('\n')
        for sampleid, values in all_md.iteritems():
            to_write = [values.get(k, "no_data") for k in header]
            to_write.insert(0, sampleid)
            md_f.write('\t'.join(to_write))
            md_f.write('\n')
def read_fwd_rev_read(fwd_read_f, rev_read_f, bc_to_sid, barcode_len,
                      barcode_correction_fn, bc_to_fwd_primers,
                      bc_to_rev_primers, max_barcode_errors, fwd_length,
                      rev_length):
    """
    Reads fwd and rev read fastq files

    Parameters
    ----------
    fwd_read_f: file
        forward read fastq file
    rev_read_f: file
        reverse read fastq file
    bc_to_sid: dict
    barcode_len: int
        barcode length
    barcode_correction_fn: function
        applicable only for golay_12 barcodes
    bc_to_fwd_primers: dict
    bc_to_rev_primers: dict
    max_barcode_errors: int
        maximum allowable errors in barcodes, applicable for golay_12
    fwd_length: int
        standard length, used for truncating of the forward sequence
    rev_length: int
        standard length, used for truncating of the reverse sequence

    Returns
    ----------
    random_bc_lookup: defaultdict
        contains sample ID -> random barcode -> (fwd seq, rev seq) -> count
    random_bc_reads: defaultdict
        contains sample ID -> random barcode -> number of reads
    random_bcs: dict
        contains sample ID -> list of random barcodes, one entry per read
        whose forward primer was found
    barcode_errors_exceed_max_count: int
    barcode_not_in_map_count: int
    primer_mismatch_count: int
    seq_too_short_count: int
    input_seqs_count: int
    total_seqs_kept: int

    Raises
    ------
    PairedEndParseError
        if the headers of a forward/reverse read pair differ
    """
    random_bc_lookup = defaultdict(lambda:
                                   defaultdict(lambda:
                                               defaultdict(int)))
    random_bc_reads = defaultdict(lambda: defaultdict(int))
    random_bcs = {}

    # Counts for Quality Control:
    barcode_errors_exceed_max_count = 0
    barcode_not_in_map_count = 0
    primer_mismatch_count = 0
    seq_too_short_count = 0
    input_seqs_count = 0
    total_seqs_kept = 0

    header_idx = 0
    seq_idx = 1

    fwd_reads = parse_fastq(fwd_read_f, strict=False,
                            enforce_qual_range=False)
    rev_reads = parse_fastq(rev_read_f, strict=False,
                            enforce_qual_range=False)
    for fwd_read, rev_read in izip(fwd_reads, rev_reads):
        input_seqs_count += 1

        # confirm match between headers
        if fwd_read[header_idx] != rev_read[header_idx]:
            raise PairedEndParseError(
                "Headers of forward and reverse reads "
                "do not match. Confirm that the forward "
                "and reverse read fastq files that you "
                "provided have headers that match one "
                "another.")

        fwd_seq = fwd_read[seq_idx]
        rev_seq = rev_read[seq_idx]

        # Grab the barcode sequence. It is always at the very end of the
        # forward read. Strip the barcode from the sequence.
        barcode = fwd_seq[-barcode_len:]
        fwd_seq = fwd_seq[:-barcode_len]

        # Correct the barcode(if applicable) and map to sample ID.
        num_barcode_errors, corrected_barcode, _, sample_id =\
            correct_barcode(barcode, bc_to_sid, barcode_correction_fn)

        # Skip barcodes with too many errors.
        if num_barcode_errors > max_barcode_errors:
            barcode_errors_exceed_max_count += 1
            continue
        if sample_id is None:
            barcode_not_in_map_count += 1
            continue

        # Extract the random barcode and primer from the forward read.
        possible_primers = bc_to_fwd_primers[corrected_barcode].keys()
        try:
            random_bc, _, clean_fwd_seq = extract_primer(fwd_seq,
                                                         possible_primers,
                                                         min_idx=5,
                                                         max_idx=20)
        except PrimerMismatchError:
            primer_mismatch_count += 1
            continue
        # setdefault replaces a `except KeyError` fallback that would also
        # have swallowed a KeyError raised inside extract_primer and then
        # recorded a stale barcode from an earlier read.
        random_bcs.setdefault(sample_id, []).append(random_bc)

        # NOTE(review): the reverse primers are looked up with the raw
        # barcode while the forward primers use corrected_barcode -- confirm
        # this asymmetry is intended.
        possible_primers = bc_to_rev_primers[barcode]
        try:
            phase_seq, _, clean_rev_seq = extract_primer(rev_seq,
                                                         possible_primers)
        except PrimerMismatchError:
            primer_mismatch_count += 1
            continue

        if len(clean_fwd_seq) < fwd_length:
            seq_too_short_count += 1
            continue

        clean_fwd_seq = clean_fwd_seq[:fwd_length]
        clean_rev_seq = clean_rev_seq[:rev_length]
        total_seqs_kept += 1
        random_bc_reads[sample_id][random_bc] += 1
        random_bc_lookup[sample_id][random_bc][
            (clean_fwd_seq, clean_rev_seq)] += 1

    return (random_bc_lookup,
            random_bc_reads,
            random_bcs,
            barcode_errors_exceed_max_count,
            barcode_not_in_map_count,
            primer_mismatch_count,
            seq_too_short_count,
            input_seqs_count,
            total_seqs_kept)
def extract_barcodes(fastq1,
                     fastq2=None,
                     output_dir=".",
                     input_type="barcode_single_end",
                     bc1_len=6,
                     bc2_len=6,
                     rev_comp_bc1=False,
                     rev_comp_bc2=False,
                     char_delineator=":",
                     switch_bc_order=False,
                     map_fp=None,
                     attempt_read_orientation=False,
                     disable_header_match=False):
    """ Main program function for extracting barcodes from reads

    Output is written to "*.fastq.incomplete" files in output_dir, which are
    closed and renamed to their final "*.fastq" names once every read has
    been processed.

    fastq1: Open fastq file 1.
    fastq2: None or open fastq file 2.
    output_dir: Directory to write output parses sequences to.
    input_type: Specifies the type of parsing to be done.
    bc1_len: Length of barcode 1 to be parsed from fastq1
    bc2_len: Length of barcode 2 to be parsed from fastq2, or from end of a
     stitched read.
    rev_comp_bc1: If True, reverse complement bc1 before writing.
    rev_comp_bc2: If True, reverse complement bc2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the the parameter chosen for the fastq files.
    map_fp: open file object of mapping file, requires a LinkerPrimerSequence
        and ReversePrimer field to be present. Used for orienting reads.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1)
        or reverse (read 2) reads for the case of paired files, or the read
        will be reverse complemented in the case of stitched reads.
    disable_header_match: if True, suppresses checks between fastq headers.

    Raises FastqParseError when header checks are enabled and the headers of
    a read1/read2 pair do not match.
    """

    # Turn off extra file creation for single read.
    if input_type == "barcode_single_end" and attempt_read_orientation:
        attempt_read_orientation = False

    if attempt_read_orientation:
        header, mapping_data, run_description, errors, warnings =\
            process_id_map(map_fp)
        forward_primers, reverse_primers = get_primers(header, mapping_data)
        output_bc_not_oriented = open(
            join(output_dir, "barcodes_not_oriented.fastq.incomplete"), "w")
        fastq1_out_not_oriented = open(
            join(output_dir, "reads1_not_oriented.fastq.incomplete"), "w")
        fastq2_out_not_oriented = open(
            join(output_dir, "reads2_not_oriented.fastq.incomplete"), "w")
    else:
        forward_primers = None
        reverse_primers = None
        output_bc_not_oriented = None
        fastq1_out_not_oriented = None
        fastq2_out_not_oriented = None

    output_bc_fastq = open(join(output_dir, "barcodes.fastq.incomplete"), "w")
    if input_type in ["barcode_single_end", "barcode_paired_stitched"]:
        output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
        output_fastq2 = None
        final_fastq1_name = join(output_dir, "reads.fastq")
    elif input_type in ["barcode_paired_end"]:
        output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
        output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
        final_fastq1_name = join(output_dir, "reads1.fastq")
    else:
        output_fastq1 = None
        output_fastq2 = None

    if not fastq2:
        # Endless dummy record so the paired iteration below also works when
        # only one fastq file was supplied.
        fastq2 = cycle(["@", "AAAAAAAAAAAA", "+", "AAAAAAAAAAAA"])
        not_paired = True
    else:
        not_paired = False

    check_header_match_f = get_casava_version(fastq1)

    header_index = 0

    for read1_data, read2_data in izip(
            parse_fastq(fastq1, strict=False, enforce_qual_range=False),
            parse_fastq(fastq2, strict=False, enforce_qual_range=False)):
        if not disable_header_match:
            if not check_header_match_f(read1_data[header_index],
                                        read2_data[header_index]):
                raise FastqParseError(
                    "Headers of read1 and read2 do not match. Can't continue. "
                    "Confirm that the fastq sequences that you are "
                    "passing match one another. --disable_header_match can be "
                    "used to suppress header checks.")

        if input_type == "barcode_single_end":
            process_barcode_single_end_data(read1_data, output_bc_fastq,
                                            output_fastq1, bc1_len,
                                            rev_comp_bc1)

        elif input_type == "barcode_paired_end":
            process_barcode_paired_end_data(
                read1_data, read2_data, output_bc_fastq, output_fastq1,
                output_fastq2, bc1_len, bc2_len, rev_comp_bc1, rev_comp_bc2,
                attempt_read_orientation, forward_primers, reverse_primers,
                output_bc_not_oriented, fastq1_out_not_oriented,
                fastq2_out_not_oriented)

        elif input_type == "barcode_paired_stitched":
            process_barcode_paired_stitched(
                read1_data, output_bc_fastq, output_fastq1, bc1_len, bc2_len,
                rev_comp_bc1, rev_comp_bc2, attempt_read_orientation,
                forward_primers, reverse_primers, output_bc_not_oriented,
                fastq1_out_not_oriented, switch_bc_order)

        elif input_type == "barcode_in_label":
            if not_paired:
                curr_read2_data = False
            else:
                curr_read2_data = read2_data
            process_barcode_in_label(read1_data, curr_read2_data,
                                     output_bc_fastq, bc1_len, bc2_len,
                                     rev_comp_bc1, rev_comp_bc2,
                                     char_delineator)

    output_bc_fastq.close()
    rename(output_bc_fastq.name, join(output_dir, "barcodes.fastq"))
    if output_fastq1:
        output_fastq1.close()
        rename(output_fastq1.name, final_fastq1_name)
    if output_fastq2:
        output_fastq2.close()
        rename(output_fastq2.name, join(output_dir, "reads2.fastq"))
    # The not-oriented outputs are closed before renaming so buffered data
    # is flushed and no handle stays open on a renamed file.
    if output_bc_not_oriented:
        output_bc_not_oriented.close()
        rename(output_bc_not_oriented.name,
               join(output_dir, "barcodes_not_oriented.fastq"))
    if fastq1_out_not_oriented:
        fastq1_out_not_oriented.close()
        rename(fastq1_out_not_oriented.name,
               join(output_dir, "reads1_not_oriented.fastq"))
    if fastq2_out_not_oriented:
        fastq2_out_not_oriented.close()
        rename(fastq2_out_not_oriented.name,
               join(output_dir, "reads2_not_oriented.fastq"))
def extract_barcodes(fastq1,
                     fastq2=None,
                     output_dir=".",
                     input_type="barcode_single_end",
                     bc1_len=6,
                     bc2_len=6,
                     rev_comp_bc1=False,
                     rev_comp_bc2=False,
                     char_delineator=":",
                     switch_bc_order=False,
                     map_fp=None,
                     attempt_read_orientation=False,
                     disable_header_match=False):
    """ Main program function for extracting barcodes from reads

    Output is written to "*.fastq.incomplete" files in output_dir, which are
    closed and renamed to their final "*.fastq" names once every read has
    been processed.

    fastq1: Open fastq file 1.
    fastq2: None or open fastq file 2.
    output_dir: Directory to write output parses sequences to.
    input_type: Specifies the type of parsing to be done.
    bc1_len: Length of barcode 1 to be parsed from fastq1
    bc2_len: Length of barcode 2 to be parsed from fastq2, or from end of a
     stitched read.
    rev_comp_bc1: If True, reverse complement bc1 before writing.
    rev_comp_bc2: If True, reverse complement bc2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the the parameter chosen for the fastq files.
    map_fp: open file object of mapping file, requires a LinkerPrimerSequence
        and ReversePrimer field to be present. Used for orienting reads.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1)
        or reverse (read 2) reads for the case of paired files, or the read
        will be reverse complemented in the case of stitched reads.
    disable_header_match: if True, suppresses checks between fastq headers.

    Raises FastqParseError when header checks are enabled and the headers of
    a read1/read2 pair do not match.
    """

    # Turn off extra file creation for single read.
    if input_type == "barcode_single_end" and attempt_read_orientation:
        attempt_read_orientation = False

    if attempt_read_orientation:
        header, mapping_data, run_description, errors, warnings =\
            process_id_map(map_fp)
        forward_primers, reverse_primers = get_primers(header, mapping_data)
        output_bc_not_oriented = open(
            join(output_dir, "barcodes_not_oriented.fastq.incomplete"), "w")
        fastq1_out_not_oriented = open(
            join(output_dir, "reads1_not_oriented.fastq.incomplete"), "w")
        fastq2_out_not_oriented = open(
            join(output_dir, "reads2_not_oriented.fastq.incomplete"), "w")
    else:
        forward_primers = None
        reverse_primers = None
        output_bc_not_oriented = None
        fastq1_out_not_oriented = None
        fastq2_out_not_oriented = None

    output_bc_fastq = open(join(output_dir, "barcodes.fastq.incomplete"), "w")
    if input_type in ["barcode_single_end", "barcode_paired_stitched"]:
        output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
        output_fastq2 = None
        final_fastq1_name = join(output_dir, "reads.fastq")
    elif input_type in ["barcode_paired_end"]:
        output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
        output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
        final_fastq1_name = join(output_dir, "reads1.fastq")
    else:
        output_fastq1 = None
        output_fastq2 = None

    if not fastq2:
        # Endless dummy record so the paired iteration below also works when
        # only one fastq file was supplied.
        fastq2 = cycle(["@", "AAAAAAAAAAAA", "+", "AAAAAAAAAAAA"])
        not_paired = True
    else:
        not_paired = False

    check_header_match_f = get_casava_version(fastq1)

    header_index = 0

    for read1_data, read2_data in izip(
            parse_fastq(fastq1, strict=False, enforce_qual_range=False),
            parse_fastq(fastq2, strict=False, enforce_qual_range=False)):
        if not disable_header_match:
            if not check_header_match_f(read1_data[header_index],
                                        read2_data[header_index]):
                raise FastqParseError("Headers of read1 and read2 do not match. Can't continue. "
                                      "Confirm that the fastq sequences that you are "
                                      "passing match one another. --disable_header_match can be "
                                      "used to suppress header checks.")

        if input_type == "barcode_single_end":
            process_barcode_single_end_data(read1_data, output_bc_fastq,
                                            output_fastq1, bc1_len,
                                            rev_comp_bc1)

        elif input_type == "barcode_paired_end":
            process_barcode_paired_end_data(
                read1_data, read2_data, output_bc_fastq, output_fastq1,
                output_fastq2, bc1_len, bc2_len, rev_comp_bc1, rev_comp_bc2,
                attempt_read_orientation, forward_primers, reverse_primers,
                output_bc_not_oriented, fastq1_out_not_oriented,
                fastq2_out_not_oriented)

        elif input_type == "barcode_paired_stitched":
            process_barcode_paired_stitched(
                read1_data, output_bc_fastq, output_fastq1, bc1_len, bc2_len,
                rev_comp_bc1, rev_comp_bc2, attempt_read_orientation,
                forward_primers, reverse_primers, output_bc_not_oriented,
                fastq1_out_not_oriented, switch_bc_order)

        elif input_type == "barcode_in_label":
            if not_paired:
                curr_read2_data = False
            else:
                curr_read2_data = read2_data
            process_barcode_in_label(read1_data, curr_read2_data,
                                     output_bc_fastq, bc1_len, bc2_len,
                                     rev_comp_bc1, rev_comp_bc2,
                                     char_delineator)

    output_bc_fastq.close()
    rename(output_bc_fastq.name, join(output_dir, "barcodes.fastq"))
    if output_fastq1:
        output_fastq1.close()
        rename(output_fastq1.name, final_fastq1_name)
    if output_fastq2:
        output_fastq2.close()
        rename(output_fastq2.name, join(output_dir, "reads2.fastq"))
    # The not-oriented outputs are closed before renaming so buffered data
    # is flushed and no handle stays open on a renamed file.
    if output_bc_not_oriented:
        output_bc_not_oriented.close()
        rename(output_bc_not_oriented.name,
               join(output_dir, "barcodes_not_oriented.fastq"))
    if fastq1_out_not_oriented:
        fastq1_out_not_oriented.close()
        rename(fastq1_out_not_oriented.name,
               join(output_dir, "reads1_not_oriented.fastq"))
    if fastq2_out_not_oriented:
        fastq2_out_not_oriented.close()
        rename(fastq2_out_not_oriented.name,
               join(output_dir, "reads2_not_oriented.fastq"))
def test_parse_error(self):
    """Check that parse_fastq raises FastqParseError for the bad fixtures."""
    # FASTQ_EXAMPLE_2 must be rejected when parsed in strict mode.
    with self.assertRaises(FastqParseError):
        strict_records = parse_fastq(self.FASTQ_EXAMPLE_2, strict=True)
        list(strict_records)

    # FASTQ_EXAMPLE_3 must be rejected when decoded with a phred offset of 64.
    with self.assertRaises(FastqParseError):
        offset_records = parse_fastq(self.FASTQ_EXAMPLE_3, phred_offset=64)
        list(offset_records)
def test_invalid_phred_offset(self):
    """Check that a phred_offset of 42 makes parse_fastq raise ValueError."""
    with self.assertRaises(ValueError):
        records = parse_fastq(self.FASTQ_EXAMPLE, phred_offset=42)
        # the parser output is consumed inside the assertion context
        list(records)
def filter_fastq_fp(input_seqs_fp, output_seqs_fp, seqs_to_keep, negate=False):
    """Filter a fastq file to include only sequences listed in seqs_to_keep

    input_seqs_fp: path of the fastq file to read.
    output_seqs_fp: path of the file to write (opened with mode 'w', so any
     existing file at that path is truncated).
    seqs_to_keep: forwarded unchanged to filter_fastq.
    negate: forwarded unchanged to filter_fastq.

    Returns whatever filter_fastq returns.
    """
    # Context managers guarantee that both handles are closed when this
    # function exits, including when parsing or filtering raises.
    # NOTE(review): this assumes filter_fastq consumes input_seqs before it
    # returns (i.e. it is not itself a lazy generator) -- confirm.
    with open(input_seqs_fp, 'U') as input_f:
        with open(output_seqs_fp, 'w') as output_f:
            input_seqs = parse_fastq(input_f, strict=False)
            return filter_fastq(input_seqs, output_f, seqs_to_keep, negate)
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_offset=None):
    """parses fastq single-end read file

    Generator: walks the barcode and read fastq inputs in lockstep and
    yields one (fasta_header, sequence, quality, seq_id) tuple for every
    read that is assigned to a sample and passes quality filtering.

    fastq_read_f, fastq_barcode_f: read / barcode fastq input. fastq_read_f
     is either a file-like object (readline + seek) or an indexable
     sequence of lines.
    barcode_to_sample_id: dict mapping barcode sequence to sample id.
    store_unassigned: if True, reads whose barcode maps to no sample are
     kept under the sample id 'Unassigned' instead of being dropped.
    max_bad_run_length, phred_quality_threshold, seq_max_N,
    filter_bad_illumina_qual_digit: forwarded to quality_filter_sequence.
    min_per_read_length_fraction: multiplied by the length of the second
     line of fastq_read_f to obtain the minimum length forwarded to
     quality_filter_sequence.
    rev_comp: if True, the yielded sequence is DNA(sequence).rc() and the
     quality is reversed.
    rev_comp_barcode: if True, the barcode is passed through
     DNA(barcode).rc() before lookup.
    start_seq_id: first value of the running id placed in yielded headers.
    log_f, histogram_f: optional writable objects; summaries are written to
     them only after the inputs have been fully consumed.
    barcode_correction_fn: forwarded to correct_barcode.
    max_barcode_errors: reads with more barcode errors than this are
     skipped.
    strict_header_match: if True, mismatching barcode/read headers raise
     FastqParseError.
    phred_offset: 33, 64 or None. None selects 33 when the first read line
     is recognised by is_casava_v180_or_later, and 64 otherwise.

    Raises ValueError for any other phred_offset or for an unknown quality
    filter result, and FastqParseError on a header mismatch.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    seq_id = start_seq_id

    # grab the first lines and then seek back to the beginning of the file;
    # inputs without readline/seek are treated as indexable sequences
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    if phred_offset is None:
        if is_casava_v180_or_later(fastq_read_f_line1):
            phred_offset = 33
        else:
            phred_offset = 64

    # the header comparison function is selected from the phred offset
    if phred_offset == 33:
        check_header_match_f = check_header_match_180_or_later
    elif phred_offset == 64:
        check_header_match_f = check_header_match_pre180
    else:
        raise ValueError("Invalid PHRED offset: %d" % phred_offset)

    # compute the barcode length, if they are all the same.
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set(len(bc) for bc in barcode_to_sample_id)
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None

    # compute the minimum read length as a fraction of the length of the input
    # read
    # NOTE(review): when line 2 comes from readline() its trailing newline is
    # included in this length -- confirm whether that is intended.
    min_per_read_length = min_per_read_length_fraction * \
        len(fastq_read_f_line2)

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}

    for bc_data, read_data in izip(
            parse_fastq(fastq_barcode_f, strict=False,
                        phred_offset=phred_offset),
            parse_fastq(fastq_read_f, strict=False,
                        phred_offset=phred_offset)):
        input_sequence_count += 1
        header = read_data[header_index]

        # Confirm match between barcode and read headers
        if strict_header_match and \
                not check_header_match_f(bc_data[header_index], header):
            raise FastqParseError(
                "Headers of barcode and read do not match. Can't continue. "
                "Confirm that the barcode fastq and read fastq that you are "
                "passing match one another.")

        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for
            # technical reasons, this step looks only at the
            # first twelve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteenth base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = str(DNA(barcode).rc())

        # Grab the read sequence and its quality
        sequence = read_data[sequence_index]
        quality = read_data[quality_index]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
            correct_barcode(
                barcode, barcode_to_sample_id, barcode_correction_fn)

        # skip samples with too many errors
        if num_barcode_errors > max_barcode_errors:
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable samples unless otherwise requested
        if sample_id is None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            sample_id = 'Unassigned'

        quality_filter_result, sequence, quality = \
            quality_filter_sequence(header,
                                    sequence,
                                    quality,
                                    max_bad_run_length,
                                    phred_quality_threshold,
                                    min_per_read_length,
                                    seq_max_N,
                                    filter_bad_illumina_qual_digit)

        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError("Unknown quality filter result: %d" %
                                 quality_filter_result)
            continue

        sequence_lengths.append(len(sequence))
        seqs_per_sample_counts[sample_id] = \
            seqs_per_sample_counts.get(sample_id, 0) + 1

        if rev_comp:
            sequence = str(DNA(sequence).rc())
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
            (sample_id, seq_id, header, barcode,
             corrected_barcode, num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging.
    # setdefault is a hash lookup; scanning .keys() for every sample id
    # would be a linear list search each time.
    for curr_sample_id in barcode_to_sample_id.values():
        seqs_per_sample_counts.setdefault(curr_sample_id, 0)

    if log_f is not None:
        log_str = format_split_libraries_fastq_log(
            count_barcode_not_in_map,
            count_too_short,
            count_too_many_N,
            count_bad_illumina_qual_digit,
            count_barcode_errors_exceed_max,
            input_sequence_count,
            sequence_lengths,
            seqs_per_sample_counts)
        log_f.write(log_str)

    if sequence_lengths and histogram_f is not None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts, bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')
def _append_fastaqual_records(out_fp, records):
    """Append records to the file at out_fp and close the handle at once."""
    with open(out_fp, 'a') as out_f:
        out_f.write(records)


def _format_qual_record(label, qual):
    """Return a QUAL record for label with 60 quality scores per line."""
    lines = [">%s\n" % label]
    for start in range(0, len(qual), 60):
        lines.append(' '.join([str(q) for q in qual[start:start + 60]]))
        lines.append('\n')
    return ''.join(lines)


def convert_fastaqual(fasta_file_path,
                      output_directory='.',
                      multiple_output_files=False,
                      ascii_increment=33,
                      full_fastq=False,
                      full_fasta_headers=False,
                      per_file_buffer_size=100000):
    """Takes a FASTQ file, generates FASTA and QUAL file(s)

    fasta_file_path: filepath of input FASTQ file.
    output_directory: Directory to output converted files.
    multiple_output_files: Make one file per SampleID, where the SampleID is
     the part of the first label field that precedes the first '_'.
     These per-sample files are opened in append mode.
    ascii_increment: Conversion value for fastq ascii character to numeric
     quality score (passed to parse_fastq as phred_offset).
    full_fastq: accepted for interface compatibility; not referenced by
     this function.
    full_fasta_headers: Retain all data on fasta label, instead of breaking
     at first whitespace.
    per_file_buffer_size: with multiple_output_files, the number of buffered
     FASTA characters for one output file that triggers a write of that
     file's FASTA and QUAL buffers.

    Raises ValueError if any decoded quality score is negative.
    """
    # rename this to avoid confusion...
    fastq_fp = fasta_file_path

    fasta_out_f = None
    qual_out_f = None
    try:
        # if we are NOT using multiple output files, then open our two (and
        # only) output files here
        if not multiple_output_files:
            fasta_out_fp = get_filename_with_new_ext(fastq_fp,
                                                     '.fna',
                                                     output_directory)
            qual_out_fp = get_filename_with_new_ext(fastq_fp,
                                                    '.qual',
                                                    output_directory)
            fasta_out_f = open(fasta_out_fp, 'w')
            qual_out_f = open(qual_out_fp, 'w')
        else:
            # per-output-path text buffers
            fasta_out_lookup = defaultdict(str)
            qual_out_lookup = defaultdict(str)

        with open(fastq_fp, 'U') as fastq_f:
            for header, sequence, qual in parse_fastq(
                    fastq_f, strict=False, phred_offset=ascii_increment):
                label = header.split()[0]
                sample_id = label.split('_')[0]

                if multiple_output_files:
                    fasta_out_fp = get_filename_with_new_ext(
                        fastq_fp, '_' + sample_id + '.fna', output_directory)
                    qual_out_fp = get_filename_with_new_ext(
                        fastq_fp, '_' + sample_id + '.qual', output_directory)

                if full_fasta_headers:
                    label = header

                # NOTE(review): relies on qual supporting elementwise
                # comparison and .any() (presumably a numpy array) -- confirm.
                if (qual < 0).any():
                    raise ValueError("Output qual scores are negative values. "
                                     "Use different ascii_increment value than %s" %
                                     str(ascii_increment))

                # write QUAL file, 60 qual scores per line
                qual_record = _format_qual_record(label, qual)
                # write FASTA file
                fasta_record = '>%s\n%s\n' % (label, sequence)

                if not multiple_output_files:
                    qual_out_f.write(qual_record)
                    fasta_out_f.write(fasta_record)
                    continue

                qual_out_lookup[qual_out_fp] += qual_record
                fasta_out_lookup[fasta_out_fp] += fasta_record

                # if we're writing multiple output files, we must close after
                # each write to avoid potentially using up all the OS's
                # filehandles, so records are buffered per file. The buffer
                # *length* is what is compared against the limit; comparing
                # the string itself to an int is always true in Python 2 and
                # a TypeError in Python 3.
                if len(fasta_out_lookup[fasta_out_fp]) >= per_file_buffer_size:
                    _append_fastaqual_records(fasta_out_fp,
                                              fasta_out_lookup[fasta_out_fp])
                    fasta_out_lookup[fasta_out_fp] = ''

                    _append_fastaqual_records(qual_out_fp,
                                              qual_out_lookup[qual_out_fp])
                    qual_out_lookup[qual_out_fp] = ''

        # with multiple output files, write out whatever is still buffered
        if multiple_output_files:
            for fasta_out_fp, records in fasta_out_lookup.items():
                if records:
                    _append_fastaqual_records(fasta_out_fp, records)

            for qual_out_fp, records in qual_out_lookup.items():
                if records:
                    _append_fastaqual_records(qual_out_fp, records)
    finally:
        # if we have one output file of each kind, close them now (also on
        # error, so the handles never leak)
        for out_f in (fasta_out_f, qual_out_f):
            if out_f is not None:
                out_f.close()
def filter_fastq_fp(input_seqs_fp, output_seqs_fp, seqs_to_keep, negate=False):
    """Filter a fastq file to include only sequences listed in seqs_to_keep

    input_seqs_fp: path of the fastq file to read.
    output_seqs_fp: path of the file to write (opened with mode "w", so any
     existing file at that path is truncated).
    seqs_to_keep: forwarded unchanged to filter_fastq.
    negate: forwarded unchanged to filter_fastq.

    Returns whatever filter_fastq returns.
    """
    # Context managers guarantee that both handles are closed when this
    # function exits, including when parsing or filtering raises.
    # NOTE(review): this assumes filter_fastq consumes input_seqs before it
    # returns (i.e. it is not itself a lazy generator) -- confirm.
    with open(input_seqs_fp, "U") as input_f:
        with open(output_seqs_fp, "w") as output_f:
            input_seqs = parse_fastq(input_f, strict=False)
            return filter_fastq(input_seqs, output_f, seqs_to_keep, negate)
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_to_ascii_f=None):
    """parses fastq single-end read file

    Generator: walks the barcode and read fastq inputs in lockstep and
    yields one (fasta_header, sequence, quality, seq_id) tuple for every
    read that is assigned to a sample and passes quality filtering.

    fastq_read_f, fastq_barcode_f: read / barcode fastq input. fastq_read_f
     is either a file-like object (readline + seek) or an indexable
     sequence of lines.
    barcode_to_sample_id: dict mapping barcode sequence to sample id.
    store_unassigned: if True, reads whose barcode maps to no sample are
     kept under the sample id 'Unassigned' instead of being dropped.
    max_bad_run_length, phred_quality_threshold, seq_max_N,
    filter_bad_illumina_qual_digit: forwarded to quality_filter_sequence.
    min_per_read_length_fraction: multiplied by the length of the second
     line of fastq_read_f to obtain the minimum length forwarded to
     quality_filter_sequence.
    rev_comp: if True, the yielded sequence is DNA(sequence).rc() and the
     quality is reversed.
    rev_comp_barcode: if True, the barcode is passed through
     DNA(barcode).rc() before lookup.
    start_seq_id: first value of the running id placed in yielded headers.
    log_f, histogram_f: optional writable objects; summaries are written to
     them only after the inputs have been fully consumed.
    barcode_correction_fn: forwarded to correct_barcode.
    max_barcode_errors: reads with more barcode errors than this are
     skipped.
    strict_header_match: if True, mismatching barcode/read headers raise
     FastqParseError.
    phred_to_ascii_f: accepted for interface compatibility; not referenced
     by this function.

    Raises FastqParseError on a header mismatch and ValueError for an
    unknown quality filter result.
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    seq_id = start_seq_id

    # grab the first lines and then seek back to the beginning of the file;
    # inputs without readline/seek are treated as indexable sequences
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    # the first header line decides both the phred offset and which header
    # comparison function is used
    if is_casava_v180_or_later(fastq_read_f_line1):
        offset = 33
        check_header_match_f = check_header_match_180_or_later
    else:
        offset = 64
        check_header_match_f = check_header_match_pre180

    # compute the barcode length, if they are all the same.
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set(len(bc) for bc in barcode_to_sample_id)
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None

    # compute the minimum read length as a fraction of the length of the input
    # read
    # NOTE(review): when line 2 comes from readline() its trailing newline is
    # included in this length -- confirm whether that is intended.
    min_per_read_length = min_per_read_length_fraction * \
        len(fastq_read_f_line2)

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}

    barcode_records = parse_fastq(fastq_barcode_f, strict=False,
                                  phred_offset=offset)
    read_records = parse_fastq(fastq_read_f, strict=False,
                               phred_offset=offset)
    for bc_data, read_data in izip(barcode_records, read_records):
        input_sequence_count += 1
        header = read_data[header_index]

        # Confirm match between barcode and read headers
        if strict_header_match and \
                not check_header_match_f(bc_data[header_index], header):
            raise FastqParseError("Headers of barcode and read do not match. Can't continue. "
                                  "Confirm that the barcode fastq and read fastq that you are "
                                  "passing match one another.")

        # Grab the barcode sequence
        if barcode_length:
            # because thirteen cycles are sometimes used for
            # technical reasons, this step looks only at the
            # first twelve bases. note that the barcode is
            # rev-comp'ed after this step if requested since
            # the thirteenth base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = str(DNA(barcode).rc())

        # Grab the read sequence and its quality
        sequence = read_data[sequence_index]
        quality = read_data[quality_index]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
            correct_barcode(
                barcode, barcode_to_sample_id, barcode_correction_fn)

        # skip samples with too many errors
        if num_barcode_errors > max_barcode_errors:
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable samples unless otherwise requested
        if sample_id is None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            sample_id = 'Unassigned'

        quality_filter_result, sequence, quality = \
            quality_filter_sequence(header,
                                    sequence,
                                    quality,
                                    max_bad_run_length,
                                    phred_quality_threshold,
                                    min_per_read_length,
                                    seq_max_N,
                                    filter_bad_illumina_qual_digit)

        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError(
                    "Unknown quality filter result: %d" %
                    quality_filter_result)
            continue

        sequence_lengths.append(len(sequence))
        seqs_per_sample_counts[sample_id] = \
            seqs_per_sample_counts.get(sample_id, 0) + 1

        if rev_comp:
            sequence = str(DNA(sequence).rc())
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
            (sample_id, seq_id, header, barcode,
             corrected_barcode, num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging.
    # setdefault is a hash lookup; scanning .keys() for every sample id
    # would be a linear list search each time.
    for curr_sample_id in barcode_to_sample_id.values():
        seqs_per_sample_counts.setdefault(curr_sample_id, 0)

    if log_f is not None:
        log_str = format_split_libraries_fastq_log(
            count_barcode_not_in_map,
            count_too_short,
            count_too_many_N,
            count_bad_illumina_qual_digit,
            count_barcode_errors_exceed_max,
            input_sequence_count,
            sequence_lengths,
            seqs_per_sample_counts)
        log_f.write(log_str)

    if sequence_lengths and histogram_f is not None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts, bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')