def truncate_rev_primers(fasta_f,
                         output_fp,
                         reverse_primers,
                         truncate_option='truncate_only',
                         primer_mismatches=2):
    """Truncate each fasta read at its best-matching reverse primer.

    Every record yielded by MinimalFastaParser(fasta_f) is handled as follows:

    * The SampleID is the part of the label before the first underscore.
      If it is not a key of reverse_primers the read is written unchanged.
    * Otherwise every primer listed for that SampleID is aligned with
      local_align_primer_seq and the result with the lowest first return
      value (presumably the mismatch count -- confirm against that helper)
      is kept.  If several primers report the same value, the index that
      belongs to the last of them is the one used.
    * If that lowest value exceeds primer_mismatches, the read is dropped
      when truncate_option is "truncate_remove" and written unchanged for
      any other option value.
    * Otherwise the read is cut at the reported index and written, unless
      the index is not greater than zero, in which case nothing is written
      and no counter other than total_seqs is updated.

    fasta_f: fasta input handed directly to MinimalFastaParser
    output_fp: open, writable handle; only its write() method is called
    reverse_primers: mapping of SampleID to an iterable of reverse primer
     sequences (an empty iterable makes min() raise ValueError)
    truncate_option: either truncate_only, truncate_remove
    primer_mismatches: highest alignment value still treated as a primer hit

    Returns a dict with the counters sample_id_not_found,
    reverse_primer_not_found, total_seqs and seqs_written.
    """
    fasta_record = '>%s\n%s\n'
    log_data = dict.fromkeys(('sample_id_not_found',
                              'reverse_primer_not_found',
                              'total_seqs',
                              'seqs_written'), 0)

    for label, seq in MinimalFastaParser(fasta_f):
        sample_id = label.split('_')[0]
        log_data['total_seqs'] += 1

        # Reads whose SampleID has no primers registered pass through as-is.
        try:
            candidate_primers = reverse_primers[sample_id]
        except KeyError:
            log_data['sample_id_not_found'] += 1
            output_fp.write(fasta_record % (label, seq))
            log_data['seqs_written'] += 1
            continue

        # Keyed by alignment value, so a later primer with an equal value
        # replaces the index stored by an earlier one.
        index_by_mismatch = dict(local_align_primer_seq(primer, seq)
                                 for primer in candidate_primers)
        fewest_mismatches = min(index_by_mismatch)
        cut_position = index_by_mismatch[fewest_mismatches]

        if fewest_mismatches > primer_mismatches:
            # No primer aligned well enough: count it, then either drop the
            # read (truncate_remove) or emit it untouched.
            log_data['reverse_primer_not_found'] += 1
            if truncate_option != "truncate_remove":
                log_data['seqs_written'] += 1
                output_fp.write(fasta_record % (label, seq))
            continue

        # A cut position of zero would leave an empty read, so skip those.
        if cut_position > 0:
            log_data['seqs_written'] += 1
            output_fp.write(fasta_record % (label, seq[0:cut_position]))

    return log_data
def truncate_rev_primers(fasta_f,
                         output_fp,
                         reverse_primers,
                         truncate_option='truncate_only',
                         primer_mismatches=2):
    """Truncate each fasta read at its best-matching reverse primer.

    Every record yielded by parse_fasta(fasta_f) is handled as follows:

    * The SampleID is the part of the label before the first underscore.
      If it is not a key of reverse_primers the read is written unchanged.
    * Otherwise every primer listed for that SampleID is aligned with
      local_align_primer_seq and the result with the lowest first return
      value (presumably the mismatch count -- confirm against that helper)
      is kept.  If several primers report the same value, the index that
      belongs to the last of them is the one used.
    * If that lowest value exceeds primer_mismatches, the read is dropped
      when truncate_option is "truncate_remove" and written unchanged for
      any other option value.
    * Otherwise the read is cut at the reported index and written, unless
      the index is not greater than zero, in which case nothing is written
      and no counter other than total_seqs is updated.

    fasta_f: fasta input handed directly to parse_fasta
    output_fp: open, writable handle; only its write() method is called
    reverse_primers: mapping of SampleID to an iterable of reverse primer
     sequences (an empty iterable makes min() raise ValueError)
    truncate_option: either truncate_only, truncate_remove
    primer_mismatches: highest alignment value still treated as a primer hit

    Returns a dict with the counters sample_id_not_found,
    reverse_primer_not_found, total_seqs and seqs_written.
    """
    fasta_record = '>%s\n%s\n'
    log_data = dict.fromkeys(('sample_id_not_found',
                              'reverse_primer_not_found',
                              'total_seqs',
                              'seqs_written'), 0)

    for label, seq in parse_fasta(fasta_f):
        sample_id = label.split('_')[0]
        log_data['total_seqs'] += 1

        # Reads whose SampleID has no primers registered pass through as-is.
        try:
            candidate_primers = reverse_primers[sample_id]
        except KeyError:
            log_data['sample_id_not_found'] += 1
            output_fp.write(fasta_record % (label, seq))
            log_data['seqs_written'] += 1
            continue

        # Keyed by alignment value, so a later primer with an equal value
        # replaces the index stored by an earlier one.
        index_by_mismatch = dict(local_align_primer_seq(primer, seq)
                                 for primer in candidate_primers)
        fewest_mismatches = min(index_by_mismatch)
        cut_position = index_by_mismatch[fewest_mismatches]

        if fewest_mismatches > primer_mismatches:
            # No primer aligned well enough: count it, then either drop the
            # read (truncate_remove) or emit it untouched.
            log_data['reverse_primer_not_found'] += 1
            if truncate_option != "truncate_remove":
                log_data['seqs_written'] += 1
                output_fp.write(fasta_record % (label, seq))
            continue

        # A cut position of zero would leave an empty read, so skip those.
        if cut_position > 0:
            log_data['seqs_written'] += 1
            output_fp.write(fasta_record % (label, seq[0:cut_position]))

    return log_data
def strip_primer(seqs, primer, maxmismatch=0, keep_primer=False):
    """Strip a 3 prime primer from sequences.

    Each sequence is upper-cased and aligned against the upper-cased primer
    with local_align_primer_seq (assumed to return the mismatch value and
    the index at which the primer hit starts -- confirm against that
    helper).  Sequences containing 'U' are aligned with every 'U' replaced
    by 'T' and have every 'T' turned back into 'U' before being returned,
    whether or not the primer was found.

    seqs: iterable of (header, sequence) pairs
    primer: primer sequence to look for
    maxmismatch: highest alignment value still treated as a primer hit
    keep_primer: if True the returned sequence ends len(primer) characters
     after the reported index instead of at the index itself

    Returns a tuple (stripped, nostripped) of lists of (header, sequence)
    pairs: sequences that were cut at the primer, and sequences in which
    the primer was not found within maxmismatch.
    """
    stripped = []
    nostripped = []
    pri = primer.upper()

    for head, seq in seqs:
        seq = seq.upper()
        # Alignment is done on the DNA alphabet; remember whether the
        # result has to be converted back afterwards.
        is_rna = 'U' in seq
        if is_rna:
            seq = seq.replace('U', 'T')

        # code adapted from truncate_reverse_primers.py in qiime
        rev_primer_mm, rev_primer_index = local_align_primer_seq(pri, seq)
        primer_found = not rev_primer_mm > maxmismatch

        if not primer_found:
            seqnew = seq
        elif keep_primer:
            seqnew = seq[:rev_primer_index + len(primer)]
        else:
            seqnew = seq[:rev_primer_index]

        # Restore the RNA alphabet on both outputs; previously sequences
        # without a primer hit were returned with U silently turned into T.
        if is_rna:
            seqnew = seqnew.replace('T', 'U')

        if primer_found:
            stripped.append((head, seqnew))
        else:
            nostripped.append((head, seqnew))

    return stripped, nostripped