def map_sequence(short_seq, long_seq, max_mismatch=0.9):
    """Map ``short_seq`` onto ``long_seq`` through a pairwise alignment.

    Both sequences are aligned with ``funcs.align``; the span occupied by
    ``short_seq`` in the alignment is then located and scored.

    Elements of the aligned sequences are compared against ``ord('-')``,
    so the aligned sequences are presumably bytes-like (iteration yields
    ints) -- TODO confirm against ``funcs.align``.

    :param short_seq: sequence to be located.
    :param long_seq: sequence in which ``short_seq`` is searched.
    :param max_mismatch: maximum tolerated fraction of differing columns
        within the mapped span.
    :return: a 5-tuple ``(start_pos, end_pos, mismatch, inserts, dels)``:

        * ``start_pos`` -- alignment column of the first non-gap symbol of
          the aligned short sequence,
        * ``end_pos`` -- alignment column of its last non-gap symbol,
        * ``mismatch`` -- number of differing columns in
          ``[start_pos, end_pos)``,
        * ``inserts`` -- gaps in the aligned long sequence in that span,
        * ``dels`` -- gaps in the aligned short sequence in that span.

        When the mismatch fraction exceeds ``max_mismatch`` the result is
        ``(-1, -1, mismatch, -1, -1)``.
    :raises RuntimeError: when the mapped span is empty, i.e. the aligned
        short sequence has no non-gap column or exactly one.
    """
    sseq, lseq, _ = funcs.align([short_seq, long_seq])
    gap = ord('-')

    # First column of the alignment that holds a real symbol of short_seq.
    start_pos = next(
        (col for col, symbol in enumerate(sseq) if symbol != gap), -1)

    # Last such column.  The scan has to reach column 0 as well; stopping
    # at column 1 left end_pos at -1 when only column 0 was occupied, which
    # produced a negative span instead of the degenerate-case error below.
    end_pos = next(
        (col for col in range(len(sseq) - 1, -1, -1) if sseq[col] != gap),
        -1)

    if start_pos == end_pos:
        raise RuntimeError(
            'short sequence does not span more than one alignment column')

    # NOTE: end_pos is used as an exclusive bound here, so the last aligned
    # column itself is not counted.  Kept as-is because callers depend on
    # the returned coordinates.
    span = slice(start_pos, end_pos)
    long_span = lseq[span]
    short_span = sseq[span]

    inserts = long_span.count(gap)
    dels = short_span.count(gap)
    mismatch = sum(1 for a, b in zip(long_span, short_span) if a != b)

    if mismatch / (end_pos - start_pos) > max_mismatch:
        return (-1, -1, mismatch, -1, -1)
    return (start_pos, end_pos, mismatch, inserts, dels)
def align(self, indexes, method=None, matrix=None):
    """Align the members selected by ``indexes`` and store the results.

    The entries ``self[idx]`` for every ``idx`` in ``indexes`` are handed to
    ``funcs.align``; each result is written back to the corresponding entry
    with ``set_sequence``.  Nothing is returned.

    :param indexes: keys/indices of the entries of ``self`` to align.
    :param method: forwarded unchanged to ``funcs.align``.
    :param matrix: scoring matrix name forwarded to ``funcs.align``.  When
        ``None`` it defaults to ``'DNA'`` if ``self.type()`` is ``DNA`` or
        ``RNA``, and to ``'BLOSUM62'`` otherwise.
    """
    if matrix is None:
        matrix = 'DNA' if self.type() in (DNA, RNA) else 'BLOSUM62'

    members = [self[key] for key in indexes]

    # imported here, as in the original, rather than at module level
    from seqpy.core import funcs
    aligned = funcs.align(members, method, matrix)

    for key, new_sequence in zip(indexes, aligned):
        target = self[key]
        target.set_sequence(new_sequence)
def align_ref(ref, query, max_mismatch=0.5):
    """Align ``query`` against ``ref`` and report where each one lies.

    :param ref: reference sequence.
    :param query: query sequence.
    :param max_mismatch: accepted for interface compatibility; this function
        does not read it.
    :return: a 7-tuple
        ``(ref_start, ref_end, query_start, query_end, aligned_ref,
        aligned_query, norm_score)`` where

        * ``ref_start`` / ``ref_end`` -- positions where the reference
          starts / stops in the alignment, as reported by
          ``start_end_pos`` on the aligned reference,
        * ``query_start`` / ``query_end`` -- the same for the aligned query,
        * ``aligned_ref`` / ``aligned_query`` -- the two aligned sequences
          returned by ``funcs.align``,
        * ``norm_score`` -- the alignment score divided by
          ``min(ref_end - query_start, query_end - ref_start)``.
    """
    aligned_ref, aligned_query, raw_score = funcs.align([ref, query])

    r_begin, r_stop = start_end_pos(aligned_ref)
    q_begin, q_stop = start_end_pos(aligned_query)

    # the score is normalised by the smaller of the two cross extents
    extent = min(r_stop - q_begin, q_stop - r_begin)
    norm_score = raw_score / extent

    return (r_begin, r_stop, q_begin, q_stop,
            aligned_ref, aligned_query, norm_score)
def recircularize_sequence(seq, ref, max_mismatch):
    """Re-anchor ``seq`` on ``ref`` using whole-sequence alignments.

    ``ref`` is aligned (via ``align_ref``) against ``seq`` and against its
    reverse complement; the reverse-complement alignment is used only when
    its score is strictly higher.  The slice of the aligned query covered by
    the reference span is the initial result.  Query overhang located before
    the reference start, or after the reference end, is merged back in with
    ``funcs.align`` / ``funcs.merged`` -- but only when that overhang is
    longer than 15 columns.

    Intermediate alignments are printed to stdout as diagnostics.

    NOTE(review): a second function with this same name appears later in
    this file; if both live in one module the later definition wins.

    :param seq: query sequence.
    :param ref: reference sequence.
    :param max_mismatch: forwarded to ``align_ref``.
    :return: the re-anchored sequence.
    """
    rc_seq = funcs.reverse_complemented(seq)

    r_lo, r_hi, q_lo, q_hi, ref_aln, query_aln, fwd_score = align_ref(
        ref, seq, max_mismatch)
    (rc_r_lo, rc_r_hi, rc_q_lo, rc_q_hi,
     rc_ref_aln, rc_query_aln, rc_score) = align_ref(
        ref, rc_seq, max_mismatch)

    if rc_score > fwd_score:
        cerr('-> use reverse complement seq')
        r_lo, r_hi, q_lo, q_hi = rc_r_lo, rc_r_hi, rc_q_lo, rc_q_hi
        ref_aln, query_aln = rc_ref_aln, rc_query_aln

    # part of the aligned query lying under the reference (inclusive end)
    ref_window = slice(r_lo, r_hi + 1)
    result = query_aln[ref_window]

    print(ref_aln)
    print(query_aln)

    # overhang of the query before / after the reference span
    head = query_aln[0:r_lo] if r_lo > q_lo else ''
    tail = query_aln[r_hi + 1:] if r_hi < q_hi else ''

    print('ustream ->', head)
    print('dstream ->', tail)

    ref_core = ref_aln[ref_window]

    if len(head) > 15:
        ref_vs_head, head_aln, _ = funcs.align([ref_core, head], degap=False)
        print('merged_1 >< merged_2 >< circularized_seq')
        for row in (ref_vs_head, head_aln, result):
            print(row)
        result = funcs.merged([result, head_aln])

    if len(tail) > 15:
        ref_vs_tail, tail_aln, _ = funcs.align(
            [funcs.degapped(ref_core), tail], degap=False)
        print('merged_3 >< merged_4')
        print(ref_vs_tail)
        print(tail_aln)

        ref_vs_result, result_aln, _ = funcs.align(
            [ref_vs_tail, funcs.degapped(result)], degap=False)
        print('merged_5 >< merged_6')
        print(ref_vs_result)
        print(result_aln)

        print('circ >< merged_4 >< merged_6')
        for row in (result, tail_aln, result_aln):
            print(row)
        result = funcs.merged([tail_aln, result_aln])

    return result
def recircularize_sequence(seq, ref, match_len=30, max_mismatch=0.9):
    """Recircularize a circular DNA sequence so that it starts like ``ref``.

    The first ``match_len`` symbols of ``ref`` (the "head") are located in
    ``seq`` with ``map_sequence``; if that fails, ``seq`` is replaced by its
    reverse complement and the search is repeated.  When the head cannot be
    found in either orientation a message is sent to ``cerr`` and the
    (reverse-complemented) ``seq`` is returned.

    Two situations are then handled:

    * a second head occurrence exists further along ``seq``: the stretch
      between the two heads is taken as the result, and the material after
      the second head and before the first head is merged into it;
    * otherwise ``seq`` is rotated at the head, and the overlapping ends are
      mapped onto ``ref`` and merged.  If the mapped coordinates do not
      overlap, a message is sent to ``cerr`` and ``seq`` is returned.

    :param seq: sequence to recircularize.
    :param ref: reference sequence.
    :param match_len: length of the reference head used as a probe.
    :param max_mismatch: forwarded to ``map_sequence`` for the head searches.
    :return: the recircularized sequence.
    """
    probe = ref[:match_len]

    # locate the head, trying the reverse complement as a fallback
    head_pos, _, _, _, _ = map_sequence(probe, seq, max_mismatch)
    if head_pos < 0:
        seq = funcs.reverse_complemented(seq)
        head_pos, _, _, _, _ = map_sequence(probe, seq, max_mismatch)
    if head_pos < 0:
        cerr('>>> head not found!')
        return seq

    # enough sequence left after the head: a second head may be present
    next_head = -1
    if len(seq) - head_pos >= len(ref):
        skip = head_pos + match_len
        next_head, _, _, _, _ = map_sequence(probe, seq[skip:], max_mismatch)
        if next_head >= 0:
            next_head += skip

    before_head = seq[:head_pos]

    if next_head > head_pos:
        # a full copy lies between the two heads; merge in what is left
        # after the second head, then what precedes the first head
        result = seq[head_pos:next_head]
        after_next_head = seq[next_head:]
        for leftover in (after_next_head, before_head):
            if len(leftover) > 0:
                aligned_a, aligned_b, _ = funcs.align([result, leftover])
                result = funcs.merged([aligned_a, aligned_b])
        return result

    from_head = seq[head_pos:]
    window = min(len(seq) - len(ref) + 15, len(from_head), len(before_head))
    if window == 0:
        return from_head + before_head

    # map the end of the head-first part, and the start of the part that
    # preceded the head, onto ref
    _, end_on_ref, _, _, _ = map_sequence(from_head[-window:], ref)
    start_on_ref, _, _, _, _ = map_sequence(before_head[:window], ref)

    if start_on_ref >= end_on_ref:
        cerr('>> algorithm problem for this sample!')
        return seq

    shared = end_on_ref - start_on_ref
    left_aln, right_aln, _ = funcs.align([
        from_head[start_on_ref:end_on_ref],
        before_head[:shared],
    ])
    joined = funcs.merged([left_aln, right_aln])

    return from_head[:start_on_ref] + joined + before_head[shared:]