def _process_roi(roi, samdata, amplicon_ref, reverse_comp=False):
    """Summarise the sequences observed across one region of interest (ROI).

    Args:
        roi: Object exposing ``position_range`` (a string containing
            ``"<start>-<end>"``), ``aa_sequence`` and ``nt_sequence``.
        samdata: Alignment source exposing ``fetch(reference, start, end)``
            (presumably a pysam ``AlignmentFile`` -- confirm with the caller).
        amplicon_ref: Reference name passed straight through to ``fetch``.
        reverse_comp: When true, each read's ROI slice is reverse-complemented
            before it is translated.

    Returns:
        dict: Always contains ``'region'``.  When ``position_range`` holds no
        ``<digits>-<digits>`` range, nothing else is added.  When no usable
        read is found, ``'flag'`` is set to ``"region not found"``.  Otherwise
        the dict also carries the most common amino-acid and nucleotide
        sequences, the reference they were compared with, the number of
        differing positions (as a string), both sequence ``Counter`` objects
        and the read depth (as a string).
    """
    roi_dict = {'region': roi.position_range}
    # Require at least one digit on each side of the dash: with `\d*` inputs
    # such as "-" or "10-" matched with empty groups and int('') raised
    # ValueError instead of taking the early return below.
    range_match = re.search(r'(\d+)-(\d+)', roi.position_range)
    if not range_match:
        return roi_dict
    # The start is shifted down by one and the end is left untouched
    # (presumably 1-based inclusive -> 0-based half-open; confirm).
    start = int(range_match.group(1)) - 1
    end = int(range_match.group(2))

    aa_sequence_counter = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    for read in samdata.fetch(amplicon_ref, start, end):
        rstart = read.reference_start
        # Only reads that begin at or before the ROI start are considered.
        if rstart > start:
            continue
        # NOTE(review): this slice offsets by reference coordinates only, so
        # it presumably assumes the read has no indels before or within the
        # ROI -- confirm.
        nt_sequence = DNA(
            read.query_alignment_sequence[start - rstart:end - rstart])
        if reverse_comp:
            nt_sequence = nt_sequence.reverse_complement()
        # scikit-bio doesn't support translating degenerate bases currently,
        # so reads containing degenerates are thrown out for now.
        if nt_sequence.has_degenerates():
            continue
        # Stop codons ('*') are reported as 'x'.
        aa_string = str(nt_sequence.translate()).replace('*', 'x')
        if aa_string:
            nt_sequence_counter[str(nt_sequence)] += 1
            aa_sequence_counter[aa_string] += 1
            depth += 1

    if not aa_sequence_counter:
        roi_dict['flag'] = "region not found"
        return roi_dict

    aa_consensus = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]

    # Compare at the nucleotide level when a nucleotide reference is given,
    # otherwise at the amino-acid level.
    if roi.nt_sequence:
        reference, consensus = roi.nt_sequence, nt_consensus
    else:
        reference, consensus = roi.aa_sequence, aa_consensus
    # A reference position lying beyond the end of the consensus counts as a
    # change as well.
    num_changes = sum(
        1 for i, expected in enumerate(reference)
        if i >= len(consensus) or expected != consensus[i])

    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    roi_dict['changes'] = str(num_changes)
    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict
def _process_roi(roi, samdata, amplicon_ref, reverse_comp=False):
    """Tally the nucleotide and amino-acid sequences seen over one ROI.

    Args:
        roi: Object exposing ``position_range`` (a string containing
            ``"<start>-<end>"``), ``aa_sequence`` and ``nt_sequence``.
        samdata: Alignment source exposing ``fetch(reference, start, end)``
            (presumably a pysam ``AlignmentFile`` -- confirm with the caller).
        amplicon_ref: Reference name passed straight through to ``fetch``.
        reverse_comp: When true, each read's ROI slice is reverse-complemented
            before it is translated.

    Returns:
        dict: Always contains ``'region'``.  When ``position_range`` holds no
        ``<digits>-<digits>`` range, nothing else is added.  When no usable
        read is found, ``'flag'`` is set to ``"region not found"``.  Otherwise
        the dict also carries the most common amino-acid and nucleotide
        sequences, the reference they were compared with, the number of
        differing positions (as a string), both sequence ``Counter`` objects
        and the read depth (as a string).
    """
    roi_dict = {'region': roi.position_range}
    # `\d+` rather than `\d*`: an empty digit group used to match (e.g. "-"
    # or "10-") and then crash in int('') instead of returning early.
    range_match = re.search(r'(\d+)-(\d+)', roi.position_range)
    if not range_match:
        return roi_dict
    # The start is shifted down by one and the end is left untouched
    # (presumably 1-based inclusive -> 0-based half-open; confirm).
    start = int(range_match.group(1)) - 1
    end = int(range_match.group(2))

    aa_sequence_counter = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    for read in samdata.fetch(amplicon_ref, start, end):
        rstart = read.reference_start
        # Only reads that begin at or before the ROI start are considered.
        if rstart > start:
            continue
        # NOTE(review): this slice offsets by reference coordinates only, so
        # it presumably assumes the read has no indels before or within the
        # ROI -- confirm.
        nt_sequence = DNA(
            read.query_alignment_sequence[start - rstart:end - rstart])
        if reverse_comp:
            nt_sequence = nt_sequence.reverse_complement()
        # scikit-bio doesn't support translating degenerate bases currently,
        # so reads containing degenerates are thrown out for now.
        if nt_sequence.has_degenerates():
            continue
        # Stop codons ('*') are reported as 'x'.
        aa_string = str(nt_sequence.translate()).replace('*', 'x')
        if aa_string:
            nt_sequence_counter[str(nt_sequence)] += 1
            aa_sequence_counter[aa_string] += 1
            depth += 1

    if not aa_sequence_counter:
        roi_dict['flag'] = "region not found"
        return roi_dict

    aa_consensus = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]

    # Compare at the nucleotide level when a nucleotide reference is given,
    # otherwise at the amino-acid level.
    if roi.nt_sequence:
        reference, consensus = roi.nt_sequence, nt_consensus
    else:
        reference, consensus = roi.aa_sequence, aa_consensus
    # A reference position lying beyond the end of the consensus counts as a
    # change as well.
    num_changes = sum(
        1 for i, expected in enumerate(reference)
        if i >= len(consensus) or expected != consensus[i])

    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    roi_dict['changes'] = str(num_changes)
    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict
def _paired_roi_bases(read, start, end):
    """Return the read bases aligned to reference [start, end), or None if unusable."""
    if read.reference_start > start:
        return None
    qstart = qend = None
    for qpos, rpos in read.get_aligned_pairs():
        if rpos == start:
            qstart = qpos
        if rpos == end:
            qend = qpos
    # Compare against None explicitly: a query offset of 0 is a valid ROI
    # start and must not be mistaken for "not found".
    # NOTE(review): a read whose alignment stops exactly at `end` never yields
    # rpos == end, so it is discarded here -- confirm that is intended.
    if qstart is None or qend is None:
        return None
    # A span different from the ROI length means an insertion inside the ROI.
    if qend - qstart != end - start:
        return None
    # get_aligned_pairs() offsets are relative to the start of the full read
    # sequence, so slice query_sequence rather than the soft-clip-trimmed
    # query_alignment_sequence.
    return read.query_sequence[qstart:qend]


def _process_roi_SMOR(roi, samdata, amplicon_ref, reverse_comp=False):
    """Summarise one ROI, counting only reads confirmed by a same-named mate.

    Reads overlapping the ROI are sorted by ``query_name`` and walked in pairs
    via ``pairwise`` (defined elsewhere in this module).  A pair contributes
    one observation only when both reads carry the same name, both cover the
    whole ROI without gaps or insertions, and both show identical bases there.

    Args:
        roi: Object exposing ``position_range`` (a string containing
            ``"<start>-<end>"``), ``aa_sequence`` and ``nt_sequence``.
        samdata: Alignment source exposing ``fetch(reference, start, end)``
            (presumably a pysam ``AlignmentFile`` -- confirm with the caller).
        amplicon_ref: Reference name passed straight through to ``fetch``.
        reverse_comp: When true, the agreed ROI sequence is
            reverse-complemented before it is translated.

    Returns:
        dict: Always contains ``'region'``.  When ``position_range`` holds no
        ``<digits>-<digits>`` range, nothing else is added.  When no usable
        pair is found, ``'flag'`` is set to ``"region not found"``.  Otherwise
        the dict also carries the most common amino-acid and nucleotide
        sequences, the reference they were compared with, the number of
        differing positions (as a string), both sequence ``Counter`` objects
        and the depth (as a string).
    """
    from operator import attrgetter

    roi_dict = {'region': roi.position_range}
    # Require at least one digit on each side of the dash: with `\d*` inputs
    # such as "-" or "10-" matched with empty groups and int('') raised
    # ValueError instead of taking the early return below.
    range_match = re.search(r'(\d+)-(\d+)', roi.position_range)
    if not range_match:
        return roi_dict
    # The start is shifted down by one and the end is left untouched
    # (presumably 1-based inclusive -> 0-based half-open; confirm).
    start = int(range_match.group(1)) - 1
    end = int(range_match.group(2))
    expected_length = end - start

    aa_sequence_counter = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    # Sorting by name puts the mates of a pair next to each other.  The
    # explicit iter() is kept because pairwise() lives outside this block and
    # may rely on receiving an iterator.
    reads = iter(
        sorted(samdata.fetch(amplicon_ref, start, end),
               key=attrgetter('query_name')))
    for read, pair in pairwise(reads):
        if read.query_name != pair.query_name:
            continue
        # Throw out pairs where either read has gaps in the ROI or doesn't
        # cover the whole ROI.
        if (read.get_overlap(start, end) != expected_length
                or pair.get_overlap(start, end) != expected_length):
            continue
        # Skip the pair outright when either mate is unusable, so a sequence
        # left over from an earlier iteration can never be counted.
        read_bases = _paired_roi_bases(read, start, end)
        if read_bases is None:
            continue
        pair_bases = _paired_roi_bases(pair, start, end)
        if pair_bases is None:
            continue
        nt_sequence = DNA(read_bases)
        nt_sequence2 = DNA(pair_bases)
        # Both mates must agree on every base in the ROI.
        if nt_sequence != nt_sequence2:
            continue
        if reverse_comp:
            nt_sequence = nt_sequence.reverse_complement()
        # scikit-bio doesn't support translating degenerate bases currently,
        # so reads containing degenerates are thrown out for now.
        if nt_sequence.has_degenerates():
            continue
        # Stop codons ('*') are reported as 'x'.
        aa_string = str(nt_sequence.translate()).replace('*', 'x')
        if aa_string:
            nt_sequence_counter[str(nt_sequence)] += 1
            aa_sequence_counter[aa_string] += 1
            depth += 1

    if not aa_sequence_counter:
        roi_dict['flag'] = "region not found"
        return roi_dict

    aa_consensus = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]

    # Compare at the nucleotide level when a nucleotide reference is given,
    # otherwise at the amino-acid level.
    if roi.nt_sequence:
        reference, consensus = roi.nt_sequence, nt_consensus
    else:
        reference, consensus = roi.aa_sequence, aa_consensus
    # A reference position lying beyond the end of the consensus counts as a
    # change as well.
    num_changes = sum(
        1 for i, expected in enumerate(reference)
        if i >= len(consensus) or expected != consensus[i])

    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    roi_dict['changes'] = str(num_changes)
    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict
def _process_roi(roi, samdata, amplicon_ref, reverse_comp=False):
    """Summarise the sequences observed across one region of interest (ROI).

    Only reads that cover the whole ROI without gaps or insertions are
    counted.  Every distinct amino-acid sequence is recorded together with
    the number of positions at which it differs from ``roi.aa_sequence``.

    Side effect: when ``roi.aa_sequence`` is empty it is filled in on the
    ``roi`` object by translating ``roi.nt_sequence``.

    Args:
        roi: Object exposing ``position_range`` (a string containing
            ``"<start>-<end>"``), ``aa_sequence`` and ``nt_sequence``.
        samdata: Alignment source exposing ``fetch(reference, start, end)``
            (presumably a pysam ``AlignmentFile`` -- confirm with the caller).
        amplicon_ref: Reference name passed straight through to ``fetch``.
        reverse_comp: When true, each read's ROI slice is reverse-complemented
            before it is translated.

    Returns:
        dict: Always contains ``'region'``.  When ``position_range`` holds no
        ``<digits>-<digits>`` range, nothing else is added.  When no usable
        read is found, ``'flag'`` is set to ``"region not found"``.  Otherwise
        it holds ``'aa_sequence_distribution'`` (a ``Counter`` keyed by
        ``(aa_string, num_changes)``), ``'nt_sequence_distribution'`` (a
        ``Counter`` keyed by nucleotide string), ``'depth'`` (as a string) and
        the backward-compatibility keys ``'most_common_aa_sequence'``,
        ``'most_common_nt_sequence'``, ``'reference'`` and ``'changes'``.
    """
    roi_dict = {'region': roi.position_range}
    # Require at least one digit on each side of the dash: with `\d*` inputs
    # such as "-" or "10-" matched with empty groups and int('') raised
    # ValueError instead of taking the early return below.
    range_match = re.search(r'(\d+)-(\d+)', roi.position_range)
    if not range_match:
        return roi_dict
    # The start is shifted down by one and the end is left untouched
    # (presumably 1-based inclusive -> 0-based half-open; confirm).
    start = int(range_match.group(1)) - 1
    end = int(range_match.group(2))
    expected_length = end - start

    aa_sequence_counter = Counter()
    aa_sequence_counter_temp = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    # Derive the amino-acid reference from the nucleotide one when it is
    # missing; stop codons ('*') are written as 'x'.
    if not roi.aa_sequence:
        roi.aa_sequence = str(DNA(roi.nt_sequence).translate()).replace(
            '*', 'x')

    for read in samdata.fetch(amplicon_ref, start, end):
        # Throw out reads that either have gaps in the ROI or don't cover the
        # whole ROI.
        if read.get_overlap(start, end) != expected_length:
            continue
        if read.reference_start > start:
            continue
        qstart = qend = None
        for qpos, rpos in read.get_aligned_pairs():
            if rpos == start:
                qstart = qpos
            if rpos == end:
                qend = qpos
        # Compare against None explicitly: a query offset of 0 is a valid ROI
        # start and must not be mistaken for "not found".
        # NOTE(review): a read whose alignment stops exactly at `end` never
        # yields rpos == end, so it is discarded here -- confirm intended.
        if qstart is None or qend is None:
            continue
        # Throw out reads with insertions in the ROI.
        if qend - qstart != expected_length:
            continue
        nt_sequence = DNA(read.query_sequence[qstart:qend])
        if reverse_comp:
            nt_sequence = nt_sequence.reverse_complement()
        # scikit-bio doesn't support translating degenerate bases currently,
        # so reads containing degenerates are thrown out for now.
        if nt_sequence.has_degenerates():
            continue
        aa_string = str(nt_sequence.translate()).replace('*', 'x')
        if aa_string:
            nt_sequence_counter[str(nt_sequence)] += 1
            aa_sequence_counter_temp[aa_string] += 1
            depth += 1

    if not aa_sequence_counter_temp:
        roi_dict['flag'] = "region not found"
        return roi_dict

    # Re-key every observed amino-acid sequence by (sequence, changes).
    # Iterating in most_common() order keeps the insertion order, and with it
    # the tie-breaking of later most_common() calls, the same as before.
    for aa_string, count in aa_sequence_counter_temp.most_common():
        # A reference position lying beyond the end of the observed sequence
        # counts as a change as well.
        num_changes = sum(
            1 for i, expected in enumerate(roi.aa_sequence)
            if i >= len(aa_string) or expected != aa_string[i])
        aa_sequence_counter[(aa_string, num_changes)] = count

    # This next bit is just being saved for backward compatibility.
    # Should deprecate and remove soon.
    (aa_consensus, num_changes) = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]
    reference = roi.nt_sequence if roi.nt_sequence else roi.aa_sequence
    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    # 'changes' is always the amino-acid change count of the most common
    # amino-acid sequence, even when 'reference' is the nucleotide sequence.
    roi_dict['changes'] = str(num_changes)
    # End backward compatibility code

    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict