def test_correct_barcode_golay_correction(self):
    """correct_barcode functions as expected w golay correction

    Each case pairs an observed barcode with the 4-tuple that
    correct_barcode is expected to return when decode_golay_12 is used as
    the correction function.
    """
    sample_ids_by_barcode = {"GGAGACAAGGGA": "s1",
                             "ACACCTGGTGAT": "s2"}
    cases = (
        # one error, corrected barcode is in the map
        ("GGAGACAAGGGT", (1, "GGAGACAAGGGA", True, "s1")),
        ("ACACCTGGTGAC", (1, "ACACCTGGTGAT", True, "s2")),
        # valid code, but not in the barcode -> sample ID map
        ("CCAGTGTATGCA", (0, "CCAGTGTATGCA", True, None)),
        # invalid code, corrected barcode not in the map either
        ("CCTGTGTATGCA", (1, "CCAGTGTATGCA", True, None)),
    )
    for observed_barcode, expected in cases:
        actual = correct_barcode(observed_barcode,
                                 sample_ids_by_barcode,
                                 decode_golay_12)
        self.assertEqual(actual, expected)
def test_correct_barcode_exact_match(self):
    """correct_barcode functions as expected w exact match

    A barcode that is already a key of the map is expected to yield the
    same result whether or not a correction function is supplied.
    """
    known_barcode = "GGAGACAAGGGA"
    sample_ids_by_barcode = {"GGAGACAAGGGA": "s1",
                             "ACACCTGGTGAT": "s2"}
    expected = (0, known_barcode, False, 's1')

    # First without any correction function, then with golay decoding.
    for fix_fn in (None, decode_golay_12):
        actual = correct_barcode(known_barcode,
                                 sample_ids_by_barcode,
                                 fix_fn)
        self.assertEqual(actual, expected)
def test_correct_barcode_no_error_correction(self):
    """correct_barcode functions as expected w no error correction

    With no correction function, a barcode missing from the map is
    expected to come back unchanged with no sample ID.
    """
    sample_ids_by_barcode = {"GGAGACAAGGGA": "s1",
                             "ACACCTGGTGAT": "s2"}
    unmapped_barcodes = (
        "GGAGACAAGGGT",
        # barcode contains N
        "CCAGTGTANGCA",
    )
    for observed_barcode in unmapped_barcodes:
        actual = correct_barcode(observed_barcode,
                                 sample_ids_by_barcode,
                                 None)
        self.assertEqual(actual, (0, observed_barcode, False, None))
def read_fwd_rev_read(fwd_read_f, rev_read_f, bc_to_sid, barcode_len,
                      barcode_correction_fn, bc_to_fwd_primers,
                      bc_to_rev_primers, max_barcode_errors,
                      fwd_length, rev_length):
    """Read paired fwd/rev fastq files and bin reads by random barcode.

    The two files are walked in lockstep. For every read pair the sample
    barcode is taken from the end of the forward read, optionally
    corrected, and mapped to a sample ID; primers are then stripped from
    both reads and the truncated pair is tallied under its sample ID and
    random barcode.

    Parameters
    ----------
    fwd_read_f: file
        forward read fastq file
    rev_read_f: file
        reverse read fastq file
    bc_to_sid: dict
        barcode -> sample ID, passed through to correct_barcode
    barcode_len: int
        barcode length
    barcode_correction_fn: function
        applicable only for golay_12 barcodes
    bc_to_fwd_primers: dict
        barcode -> dict whose keys are the candidate forward primers
    bc_to_rev_primers: dict
        barcode -> candidate reverse primers
    max_barcode_errors: int
        maximum allowable errors in barcodes, applicable for golay_12
    fwd_length: int
        standard length, used for truncating of the forward sequence
    rev_length: int
        standard length, used for truncating of the reverse sequence

    Returns
    ----------
    random_bc_lookup: defaultdict
        sample ID -> random barcode -> (fwd seq, rev seq) -> count
    random_bc_reads: defaultdict
        sample ID -> random barcode -> number of reads kept
    random_bcs: dict
        sample ID -> list of random barcodes; a barcode is recorded as
        soon as the forward primer is found, even if the read is
        discarded by a later check
    barcode_errors_exceed_max_count: int
    barcode_not_in_map_count: int
    primer_mismatch_count: int
    seq_too_short_count: int
    input_seqs_count: int
    total_seqs_kept: int

    Raises
    ----------
    PairedEndParseError
        if the headers of a forward/reverse read pair differ
    """
    random_bc_lookup = defaultdict(
        lambda: defaultdict(lambda: defaultdict(int)))
    random_bc_reads = defaultdict(lambda: defaultdict(int))
    random_bcs = {}

    # Counts for Quality Control:
    input_seqs_count = 0
    total_seqs_kept = 0
    barcode_errors_exceed_max_count = 0
    barcode_not_in_map_count = 0
    primer_mismatch_count = 0
    seq_too_short_count = 0

    # Positions of the fields used from each parsed fastq record.
    header_idx = 0
    seq_idx = 1

    fwd_records = parse_fastq(fwd_read_f, strict=False,
                              enforce_qual_range=False)
    rev_records = parse_fastq(rev_read_f, strict=False,
                              enforce_qual_range=False)

    for fwd_read, rev_read in izip(fwd_records, rev_records):
        input_seqs_count += 1

        # confirm match between headers
        if fwd_read[header_idx] != rev_read[header_idx]:
            raise PairedEndParseError(
                "Headers of forward and reverse reads "
                "do not match. Confirm that the forward "
                "and reverse read fastq files that you "
                "provided have headers that match one "
                "another.")

        fwd_seq = fwd_read[seq_idx]
        rev_seq = rev_read[seq_idx]

        # Grab the barcode sequence. It is always at the very end of the
        # forward read. Strip the barcode from the sequence.
        barcode = fwd_seq[-barcode_len:]
        fwd_seq = fwd_seq[:-barcode_len]

        # Correct the barcode (if applicable) and map to sample ID.
        num_barcode_errors, corrected_barcode, _, sample_id = \
            correct_barcode(barcode, bc_to_sid, barcode_correction_fn)

        # Skip barcodes with too many errors.
        if num_barcode_errors > max_barcode_errors:
            barcode_errors_exceed_max_count += 1
            continue

        if sample_id is None:
            barcode_not_in_map_count += 1
            continue

        # Extract the random barcode and primer from the forward read.
        possible_primers = bc_to_fwd_primers[corrected_barcode].keys()
        try:
            random_bc, _, clean_fwd_seq = extract_primer(
                fwd_seq, possible_primers, min_idx=5, max_idx=20)
        except PrimerMismatchError:
            primer_mismatch_count += 1
            continue

        # Record the random barcode outside the try block, so a KeyError
        # raised while extracting the primer is never mistaken for "first
        # barcode seen for this sample".
        random_bcs.setdefault(sample_id, []).append(random_bc)

        # Look up reverse primers by the *corrected* barcode, mirroring the
        # forward lookup; the raw barcode is not a valid key once an error
        # has been corrected.
        possible_primers = bc_to_rev_primers[corrected_barcode]
        try:
            _, _, clean_rev_seq = extract_primer(rev_seq, possible_primers)
        except PrimerMismatchError:
            primer_mismatch_count += 1
            continue

        # NOTE(review): only the forward read is length-checked; a reverse
        # read shorter than rev_length is kept as-is. Confirm this is
        # intended.
        if len(clean_fwd_seq) < fwd_length:
            seq_too_short_count += 1
            continue

        clean_fwd_seq = clean_fwd_seq[:fwd_length]
        clean_rev_seq = clean_rev_seq[:rev_length]

        total_seqs_kept += 1
        random_bc_reads[sample_id][random_bc] += 1
        random_bc_lookup[sample_id][random_bc][
            (clean_fwd_seq, clean_rev_seq)] += 1

    return (random_bc_lookup,
            random_bc_reads,
            random_bcs,
            barcode_errors_exceed_max_count,
            barcode_not_in_map_count,
            primer_mismatch_count,
            seq_too_short_count,
            input_seqs_count,
            total_seqs_kept)