Exemplo n.º 1
0
def _process_roi(roi, samdata, amplicon_ref, reverse_comp=False):
    """Summarise the sequences that reads show across one region of interest.

    Args:
        roi: Object exposing ``position_range`` (a string containing
            ``"<start>-<end>"``), ``aa_sequence`` and ``nt_sequence``.
        samdata: Alignment source; ``samdata.fetch(ref, start, end)`` must
            yield reads exposing ``reference_start`` and
            ``query_alignment_sequence``.
        amplicon_ref: Reference name handed to ``samdata.fetch``.
        reverse_comp: When true, each extracted nucleotide sequence is
            reverse-complemented before it is translated.

    Returns:
        dict: Always holds ``'region'``. If the range cannot be parsed nothing
        else is added. If no read contributes a sequence, ``'flag'`` is set to
        ``"region not found"``. Otherwise the dict also holds
        ``'most_common_aa_sequence'``, ``'most_common_nt_sequence'``,
        ``'reference'``, ``'changes'`` (str), ``'aa_sequence_distribution'``,
        ``'nt_sequence_distribution'`` (both ``Counter``) and ``'depth'``
        (str).
    """
    roi_dict = {'region': roi.position_range}
    # Demand at least one digit on each side of the dash: with ``\d*`` a value
    # such as "-" or "5-" matched with an empty group and int('') raised
    # ValueError instead of taking the "unparseable range" return below.
    range_match = re.search(r'(\d+)-(\d+)', roi.position_range)
    if not range_match:
        return roi_dict
    # The first number is decremented so it can serve as a slice/fetch start.
    start = int(range_match.group(1)) - 1
    end = int(range_match.group(2))
    aa_sequence_counter = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    for read in samdata.fetch(amplicon_ref, start, end):
        rstart = read.reference_start
        # Only reads that begin at or before the region start are used.
        if rstart > start:
            continue
        # NOTE(review): plain offset arithmetic assumes reference and query
        # coordinates advance in lockstep up to the region - confirm reads
        # with indels are handled as intended.
        nt_sequence = DNA(
            read.query_alignment_sequence[start - rstart:end - rstart])
        if reverse_comp:
            nt_sequence = nt_sequence.reverse_complement()
        # scikit-bio doesn't support translating degenerate bases currently,
        # so we will just throw out reads with degenerates for now
        if nt_sequence.has_degenerates():
            continue
        # Stop codons ('*') are reported as 'x'.
        aa_string = str(nt_sequence.translate()).replace('*', 'x')
        if not aa_string:
            continue
        nt_sequence_counter[str(nt_sequence)] += 1
        aa_sequence_counter[aa_string] += 1
        depth += 1
    if not aa_sequence_counter:
        roi_dict['flag'] = "region not found"
        return roi_dict
    aa_consensus = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]
    # Compare at nucleotide level when the ROI supplies a nucleotide
    # reference, otherwise at amino-acid level.
    if roi.nt_sequence:
        reference = roi.nt_sequence
        consensus = nt_consensus
    else:
        reference = roi.aa_sequence
        consensus = aa_consensus
    # A reference position beyond the end of the consensus counts as a change.
    num_changes = sum(
        1 for i, ref_char in enumerate(reference)
        if i >= len(consensus) or ref_char != consensus[i])
    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    roi_dict['changes'] = str(num_changes)
    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict
Exemplo n.º 2
0
def _process_roi(roi, samdata, amplicon_ref, reverse_comp=False):
    """Tally nucleotide and amino-acid sequences seen over a region of interest.

    Args:
        roi: Object with ``position_range`` (text containing
            ``"<start>-<end>"``), ``aa_sequence`` and ``nt_sequence``
            attributes.
        samdata: Object whose ``fetch(ref, start, end)`` yields reads that
            expose ``reference_start`` and ``query_alignment_sequence``.
        amplicon_ref: Reference name forwarded to ``samdata.fetch``.
        reverse_comp: Reverse-complement every extracted sequence before
            translation when true.

    Returns:
        dict: ``{'region': ...}`` alone when the range is unparseable; with an
        extra ``'flag': "region not found"`` when no read yields a sequence;
        otherwise extended with the most common amino-acid and nucleotide
        sequences, the reference used, the number of changes (as str), both
        sequence distributions (``Counter``) and the depth (as str).
    """
    roi_dict = {'region': roi.position_range}
    # ``\d+`` rather than ``\d*``: an empty digit group (e.g. "-" or "5-")
    # used to match and then blow up in int(''); now it is treated like any
    # other range that cannot be parsed.
    range_match = re.search(r'(\d+)-(\d+)', roi.position_range)
    if not range_match:
        return roi_dict
    first, last = range_match.groups()
    start = int(first) - 1  # decremented for use as a slice/fetch start
    end = int(last)

    aa_sequence_counter = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    for read in samdata.fetch(amplicon_ref, start, end):
        offset = start - read.reference_start
        # A negative offset means the read begins after the region start;
        # such reads are not used.
        if offset < 0:
            continue
        # NOTE(review): slicing by reference offset presumes no indels
        # between the read start and the region end - confirm.
        window = read.query_alignment_sequence[offset:offset + (end - start)]
        nt_sequence = DNA(window)
        if reverse_comp:
            nt_sequence = nt_sequence.reverse_complement()
        # scikit-bio doesn't support translating degenerate bases currently,
        # so we will just throw out reads with degenerates for now
        if nt_sequence.has_degenerates():
            continue
        aa_sequence = nt_sequence.translate()
        aa_string = str(aa_sequence).replace('*', 'x')  # stop codon -> 'x'
        if aa_string:
            nt_sequence_counter[str(nt_sequence)] += 1
            aa_sequence_counter[aa_string] += 1
            depth += 1

    if not aa_sequence_counter:
        roi_dict['flag'] = "region not found"
        return roi_dict

    aa_consensus = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]
    # Prefer a nucleotide-level comparison whenever the ROI provides one.
    reference, consensus = roi.aa_sequence, aa_consensus
    if roi.nt_sequence:
        reference, consensus = roi.nt_sequence, nt_consensus
    num_changes = 0
    for i, expected in enumerate(reference):
        # Positions missing from a shorter consensus count as changes too.
        if i >= len(consensus) or expected != consensus[i]:
            num_changes += 1

    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    roi_dict['changes'] = str(num_changes)
    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict
Exemplo n.º 3
0
def _smor_roi_sequence(read, start, end, expected_length):
    """Return the ROI slice of *read* as ``DNA``, or ``None`` if unusable."""
    if read.reference_start > start:
        return None
    qstart = qend = None
    for (qpos, rpos) in read.get_aligned_pairs():
        if rpos == start:
            qstart = qpos
        if rpos == end:
            qend = qpos
    # Compare against None explicitly: a query offset of 0 is a valid start
    # and must not be mistaken for "not found".
    # NOTE(review): a read whose alignment stops exactly at ``end`` has no
    # pair with rpos == end and is therefore still rejected - confirm this
    # is intended.
    if qstart is None or qend is None:
        return None
    # throw out reads with insertions in the ROI
    if qend - qstart != expected_length:
        return None
    # get_aligned_pairs() offsets index the full query sequence (soft-clipped
    # bases included), so slice query_sequence, not query_alignment_sequence.
    return DNA(read.query_sequence[qstart:qend])


def _process_roi_SMOR(roi, samdata, amplicon_ref, reverse_comp=False):
    """Summarise ROI sequences, counting only pairs of agreeing reads.

    Reads overlapping the region are sorted by ``query_name`` and walked as
    consecutive pairs; a pair contributes one observation only when both reads
    share a name, both span the region without gaps or insertions, and both
    give the same nucleotide sequence over it.

    Args:
        roi: Object exposing ``position_range`` (a string containing
            ``"<start>-<end>"``), ``aa_sequence`` and ``nt_sequence``.
        samdata: Alignment source; ``samdata.fetch(ref, start, end)`` must
            yield reads exposing ``query_name``, ``reference_start``,
            ``query_sequence``, ``get_overlap()`` and ``get_aligned_pairs()``.
        amplicon_ref: Reference name handed to ``samdata.fetch``.
        reverse_comp: When true, the agreed sequence is reverse-complemented
            before it is translated.

    Returns:
        dict: Always holds ``'region'``. If the range cannot be parsed nothing
        else is added. If no pair contributes, ``'flag'`` is set to
        ``"region not found"``. Otherwise the dict also holds
        ``'most_common_aa_sequence'``, ``'most_common_nt_sequence'``,
        ``'reference'``, ``'changes'`` (str), ``'aa_sequence_distribution'``,
        ``'nt_sequence_distribution'`` (both ``Counter``) and ``'depth'``
        (str).
    """
    from operator import attrgetter
    roi_dict = {'region': roi.position_range}
    # Demand at least one digit on each side of the dash: with ``\d*`` a value
    # such as "-" matched with empty groups and int('') raised ValueError.
    range_match = re.search(r'(\d+)-(\d+)', roi.position_range)
    if not range_match:
        return roi_dict
    start = int(range_match.group(1)) - 1
    end = int(range_match.group(2))
    expected_length = end - start
    aa_sequence_counter = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    # Sorting by name places reads that share a name next to each other.
    reads = iter(
        sorted(samdata.fetch(amplicon_ref, start, end),
               key=attrgetter('query_name')))
    for read, pair in pairwise(reads):
        if read.query_name != pair.query_name:
            continue
        alignment_length1 = read.get_overlap(start, end)
        alignment_length2 = pair.get_overlap(start, end)
        # throw out reads that either have gaps in the ROI or don't cover the
        # whole ROI
        if (alignment_length1 != expected_length
                or alignment_length2 != expected_length):
            continue
        # Extract each read's sequence afresh so that a value left over from
        # an earlier pair can never be compared by accident.
        nt_sequence = _smor_roi_sequence(read, start, end, expected_length)
        if nt_sequence is None:
            continue
        nt_sequence2 = _smor_roi_sequence(pair, start, end, expected_length)
        if nt_sequence2 is None:
            continue
        # Both reads must agree over the ROI.
        if nt_sequence != nt_sequence2:
            continue
        if reverse_comp:
            nt_sequence = nt_sequence.reverse_complement()
        # scikit-bio doesn't support translating degenerate bases currently,
        # so we will just throw out reads with degenerates for now
        if nt_sequence.has_degenerates():
            continue
        # Stop codons ('*') are reported as 'x'.
        aa_string = str(nt_sequence.translate()).replace('*', 'x')
        if not aa_string:
            continue
        nt_sequence_counter[str(nt_sequence)] += 1
        aa_sequence_counter[aa_string] += 1
        depth += 1
    if not aa_sequence_counter:
        roi_dict['flag'] = "region not found"
        return roi_dict
    aa_consensus = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]
    # Compare at nucleotide level when the ROI supplies a nucleotide
    # reference, otherwise at amino-acid level.
    if roi.nt_sequence:
        reference = roi.nt_sequence
        consensus = nt_consensus
    else:
        reference = roi.aa_sequence
        consensus = aa_consensus
    # A reference position beyond the end of the consensus counts as a change.
    num_changes = sum(
        1 for i, ref_char in enumerate(reference)
        if i >= len(consensus) or ref_char != consensus[i])
    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    roi_dict['changes'] = str(num_changes)
    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict
Exemplo n.º 4
0
def _process_roi(roi, samdata, amplicon_ref, reverse_comp=False):
    """Summarise ROI sequences, keyed by amino-acid sequence and change count.

    Only reads that span the whole region with no gaps and no insertions
    contribute. The amino-acid distribution is keyed by
    ``(aa_string, num_changes)`` tuples, where ``num_changes`` is the number
    of positions at which ``aa_string`` differs from ``roi.aa_sequence``.

    Args:
        roi: Object exposing ``position_range`` (a string containing
            ``"<start>-<end>"``), ``aa_sequence`` and ``nt_sequence``. When
            ``aa_sequence`` is empty it is filled in on the object by
            translating ``nt_sequence``.
        samdata: Alignment source; ``samdata.fetch(ref, start, end)`` must
            yield reads exposing ``reference_start``, ``query_sequence``,
            ``get_overlap()`` and ``get_aligned_pairs()``.
        amplicon_ref: Reference name handed to ``samdata.fetch``.
        reverse_comp: When true, each extracted nucleotide sequence is
            reverse-complemented before it is translated.

    Returns:
        dict: Always holds ``'region'``. If the range cannot be parsed nothing
        else is added. If no read contributes, ``'flag'`` is set to
        ``"region not found"``. Otherwise the dict also holds
        ``'most_common_aa_sequence'``, ``'most_common_nt_sequence'``,
        ``'reference'``, ``'changes'`` (str), ``'aa_sequence_distribution'``,
        ``'nt_sequence_distribution'`` (both ``Counter``) and ``'depth'``
        (str).
    """
    roi_dict = {'region': roi.position_range}
    # Demand at least one digit on each side of the dash: with ``\d*`` a value
    # such as "-" matched with empty groups and int('') raised ValueError.
    range_match = re.search(r'(\d+)-(\d+)', roi.position_range)
    if not range_match:
        return roi_dict
    start = int(range_match.group(1)) - 1
    end = int(range_match.group(2))
    expected_length = end - start
    aa_sequence_counter = Counter()
    aa_sequence_counter_temp = Counter()
    nt_sequence_counter = Counter()
    depth = 0
    # Derive the amino-acid reference from the nucleotide one when absent;
    # the value is stored on ``roi`` exactly as before.
    if not roi.aa_sequence:
        roi.aa_sequence = str(DNA(roi.nt_sequence).translate()).replace(
            '*', 'x')
    for read in samdata.fetch(amplicon_ref, start, end):
        # throw out reads that either have gaps in the ROI or don't cover the
        # whole ROI
        if read.get_overlap(start, end) != expected_length:
            continue
        if read.reference_start > start:
            continue
        qstart = qend = None
        for (qpos, rpos) in read.get_aligned_pairs():
            if rpos == start:
                qstart = qpos
            if rpos == end:
                qend = qpos
        # Compare against None explicitly: a query offset of 0 is a valid
        # start and must not be mistaken for "not found".
        # NOTE(review): a read whose alignment stops exactly at ``end`` has no
        # pair with rpos == end and is therefore still rejected - confirm
        # this is intended.
        if qstart is None or qend is None:
            continue
        # throw out reads with insertions in the ROI
        if qend - qstart != expected_length:
            continue
        nt_sequence = DNA(read.query_sequence[qstart:qend])
        if reverse_comp:
            nt_sequence = nt_sequence.reverse_complement()
        # scikit-bio doesn't support translating degenerate bases currently,
        # so we will just throw out reads with degenerates for now
        if nt_sequence.has_degenerates():
            continue
        # Stop codons ('*') are reported as 'x'.
        aa_string = str(nt_sequence.translate()).replace('*', 'x')
        if not aa_string:
            continue
        nt_sequence_counter[str(nt_sequence)] += 1
        aa_sequence_counter_temp[aa_string] += 1
        depth += 1
    if not aa_sequence_counter_temp:
        roi_dict['flag'] = "region not found"
        return roi_dict

    # Re-key the distribution by (sequence, number of changes vs. reference).
    for (aa_string, count) in aa_sequence_counter_temp.most_common():
        # A reference position beyond the end of aa_string counts as a change.
        num_changes = sum(
            1 for i, ref_char in enumerate(roi.aa_sequence)
            if i >= len(aa_string) or ref_char != aa_string[i])
        aa_sequence_counter[(aa_string, num_changes)] = count

    # This next bit is just being saved for backward compatibility. Should
    # deprecate and remove soon
    (aa_consensus, num_changes) = aa_sequence_counter.most_common(1)[0][0]
    nt_consensus = nt_sequence_counter.most_common(1)[0][0]
    reference = roi.nt_sequence if roi.nt_sequence else roi.aa_sequence
    roi_dict['most_common_aa_sequence'] = aa_consensus
    roi_dict['most_common_nt_sequence'] = nt_consensus
    roi_dict['reference'] = reference
    roi_dict['changes'] = str(num_changes)
    # End backward compatibility code
    roi_dict['aa_sequence_distribution'] = aa_sequence_counter
    roi_dict['nt_sequence_distribution'] = nt_sequence_counter
    roi_dict['depth'] = str(depth)
    return roi_dict