def map_sequences():
    """Try to map every loaded contig onto ``ref``, in either orientation.

    Contigs are loaded from ``contigsfile`` and a reference from
    ``args.reffile``. Those two names, together with ``ref`` and
    ``max_mismatch``, are not defined inside this function and have to be
    supplied by an outer scope.

    NOTE(review): the loaded reference is bound to ``rseq`` but the mapping
    calls use ``ref`` -- confirm whether ``ref`` should be derived from
    ``rseq``.
    NOTE(review): ``start``, ``end`` and ``mismatch`` are computed but not
    used by the code shown here; the function may be unfinished.
    """
    contigs = bioio.load(contigsfile)
    rseq = bioio.load(args.reffile)

    for contig in contigs:
        # map contig to ref sequence
        start, end, mismatch, _, _ = map_sequence(contig, ref, max_mismatch)
        if start < 0:
            # A negative start is treated as "no hit": retry with the
            # reverse complement of the contig.
            contig = funcs.reverse_complemented(contig)
            start, end, mismatch, _, _ = map_sequence(contig, ref, max_mismatch)
            if start < 0:
                # No hit in either orientation; move on to the next contig.
                continue
def reverse_complemented(self):
    """Return a clone carrying the reverse complement of ``self.seq``.

    The new sequence is set on the object returned by ``self.clone()``, and
    whatever ``set_sequence`` returns is handed back to the caller.
    """
    duplicate = self.clone()
    flipped = funcs.reverse_complemented(self.seq)
    return duplicate.set_sequence(flipped)
def reverse_complement(self):
    """Reverse-complement ``self.seq`` in place and return ``self``.

    Every call also toggles the ``_reverse_complemented`` flag, so calling
    this twice leaves the flag at its starting truth value.
    """
    self.seq = funcs.reverse_complemented(self.seq)
    # Flip the orientation marker; the result is always a plain bool.
    self._reverse_complemented = not self._reverse_complemented
    return self
def reverse_complement(self):
    """Reverse-complement the items of the model behind this object.

    When the model's current selection is a ``LabelSelection`` the operation
    is applied through that selection; otherwise it is applied through the
    model itself.
    """
    model = self.model()

    def _flip(item):
        # Replace the item's sequence with its reverse complement.
        return item.set_sequence(funcs.reverse_complemented(item))

    if not isinstance(model.selection(), LabelSelection):
        model.apply(_flip)
        return

    model.selection().apply(_flip)
def recircularize_sequence(seq, ref, max_mismatch):
    """ recircularize seq against ref using alignments from align_ref

        Both seq and its reverse complement are aligned to ref and the
        orientation with the higher score is kept.  The aligned query is cut
        to the ref_start..ref_end window; a leading or trailing remainder
        longer than 15 characters is re-aligned and merged back in with
        funcs.align / funcs.merged.

        Intermediate alignments are printed to stdout.

        NOTE(review): whether the coordinates returned by align_ref are
        alignment columns or sequence positions is not visible here --
        confirm before relying on the slicing below.

        return the recircularized sequence
    """
    revcomp = funcs.reverse_complemented(seq)

    # align the query as given ...
    ref_start, ref_end, query_start, query_end, arseq, aqseq, score = align_ref(
        ref, seq, max_mismatch)
    #print(arseq)
    #print(aqseq)

    # ... and in the opposite orientation
    ref_start2, ref_end2, query_start2, query_end2, arseq2, aqseq2, score2 = align_ref(
        ref, revcomp, max_mismatch)
    #print(arseq2)
    #print(aqseq2)

    if score2 > score:
        # the reverse complement aligns better: switch every alignment
        # variable over to that orientation
        cerr('-> use reverse complement seq')
        ref_start, ref_end = ref_start2, ref_end2
        query_start, query_end = query_start2, query_end2
        seq = revcomp
        arseq = arseq2
        aqseq = aqseq2

    # starting from here, seq = query seq in the chosen orientation,
    # arseq = aligned ref seq, aqseq = aligned query seq
    circularized_seq = aqseq[ref_start:ref_end + 1]
    upstream = downstream = ''
    print(arseq)
    print(aqseq)

    if ref_start > query_start:
        # sequence has headings
        upstream = aqseq[0:ref_start]
    if ref_end < query_end:
        # sequence has downstream / tail
        downstream = aqseq[ref_end + 1:]
    #print('boundary ->', aqseq[ref_end-5:ref_end+5])
    print('ustream ->', upstream)
    print('dstream ->', downstream)

    # keep only the part of the aligned reference inside the window
    arseq = arseq[ref_start:ref_end + 1]

    if len(upstream) > 15:
        # align the leading remainder against the windowed reference and
        # merge the aligned remainder into the result (gaps are kept)
        merged_1, merged_2, _ = funcs.align([arseq, upstream], degap=False)
        print('merged_1 >< merged_2 >< circularized_seq')
        print(merged_1)
        print(merged_2)
        print(circularized_seq)
        circularized_seq = funcs.merged([circularized_seq, merged_2])

    if len(downstream) > 15:
        #print('ref >< circularized_seq')
        #print(arseq)
        #print(circularized_seq)

        # first align the trailing remainder against the degapped reference
        merged_3, merged_4, _ = funcs.align(
            [funcs.degapped(arseq), downstream], degap=False)
        print('merged_3 >< merged_4')
        print(merged_3)
        print(merged_4)

        # then align the degapped current result against that aligned
        # reference (merged_3), and merge merged_4 with merged_6
        merged_5, merged_6, _ = funcs.align(
            [merged_3, funcs.degapped(circularized_seq)], degap=False)
        print('merged_5 >< merged_6')
        print(merged_5)
        print(merged_6)
        print('circ >< merged_4 >< merged_6')
        print(circularized_seq)
        print(merged_4)
        print(merged_6)
        circularized_seq = funcs.merged([merged_4, merged_6])

    #print('ref >< circularized_seq')
    #print(ref)
    #print(circularized_seq)
    return circularized_seq
def recircularize_sequence(seq, ref, match_len=30, max_mismatch=0.9):
    """ recircularize a circular DNA, based on ref

        The first match_len characters of ref (the "head") are located in
        seq, retrying on the reverse complement when the first attempt
        fails.  seq is then rotated so that it starts at the head, and the
        two ends that now meet are merged where they overlap.

        seq          -- sequence to recircularize
        ref          -- reference sequence defining the start position
        match_len    -- number of leading ref characters used as the head
        max_mismatch -- mismatch limit forwarded to map_sequence when
                        locating the head

        return the recircularized sequence; when the head cannot be found
        or the overlap cannot be resolved, a message is written with cerr
        and seq is returned unrotated

        NOTE(review): on those failure paths seq may already be the reverse
        complement of the input -- confirm that callers expect this.
    """
    head = ref[:match_len]

    # first, map the head of ref onto seq to find head position(s);
    # a negative start means "not found"
    head_start, _, _, _, _ = map_sequence(head, seq, max_mismatch)
    if head_start < 0:
        seq = funcs.reverse_complemented(seq)
        head_start, _, _, _, _ = map_sequence(head, seq, max_mismatch)
        if head_start < 0:
            cerr('>>> head not found!')
            return seq

    head_start2 = -1
    if len(seq) - head_start >= len(ref):
        # remaining seq is at least as long as ref, so a 2nd head position
        # may exist; search only past the first head
        offset = head_start + match_len
        head_start2, _, _, _, _ = map_sequence(head, seq[offset:],
                                               max_mismatch)
        if head_start2 >= 0:
            # translate back to coordinates of the full seq
            head_start2 += offset

    if head_start2 > head_start:
        # the full circle appears between the two heads; merge what lies
        # outside of it back in
        circularized_seq = seq[head_start:head_start2]
        upstream_part = seq[head_start2:]
        downstream_part = seq[:head_start]
        if len(upstream_part) > 0:
            merged_1, merged_2, _ = funcs.align(
                [circularized_seq, upstream_part])
            circularized_seq = funcs.merged([merged_1, merged_2])
        if len(downstream_part) > 0:
            merged_3, merged_4, _ = funcs.align(
                [circularized_seq, downstream_part])
            circularized_seq = funcs.merged([merged_3, merged_4])
        return circularized_seq

    # only one head: rotate seq at the head and merge the overlapping ends
    upstream_part = seq[head_start:]
    downstream_part = seq[:head_start]

    overlap_len = min(len(seq) - len(ref) + 15,
                      len(upstream_part), len(downstream_part))
    if overlap_len <= 0:
        # nothing can overlap (this includes seq being more than 15
        # characters shorter than ref, which makes the estimate negative
        # and would invert the slices below): a plain rotation is enough
        return upstream_part + downstream_part

    # map the tail of upstream_part and the start of downstream_part to ref
    overlap_start1, overlap_end1, _, _, _ = map_sequence(
        upstream_part[-overlap_len:], ref)
    overlap_start2, _, _, _, _ = map_sequence(
        downstream_part[:overlap_len], ref)

    if (overlap_start1 < 0 or overlap_start2 < 0
            or overlap_start2 >= overlap_end1):
        # either end failed to map, or the two ends do not overlap on ref
        cerr('>> algorithm problem for this sample!')
        return seq

    overlap_width = overlap_end1 - overlap_start2
    upstream_tobe_merged, downstream_tobe_merged, _ = funcs.align([
        upstream_part[overlap_start2:overlap_end1],
        downstream_part[:overlap_width],
    ])
    merged_segment = funcs.merged(
        [upstream_tobe_merged, downstream_tobe_merged])

    return (upstream_part[:overlap_start2] + merged_segment
            + downstream_part[overlap_width:])