def get_map_overlap(sample, fr1, fr2):
    '''Build a coordinate map of the overlap between two fragments.

    The references of both fragments are fetched from the sample via
    sample.get_reference and ladder-aligned; the overlap is the region of
    the alignment between the first non-gap column of the second sequence
    and the last non-gap column of the first one.

    Parameters:
       sample: object providing get_reference(fragment)
       fr1: first fragment, passed to sample.get_reference
       fr2: second fragment, passed to sample.get_reference

    Returns:
       numpy integer array built from the list of (pos1, pos2) pairs, one
       pair per overlap column in which neither sequence has a gap
    '''
    import numpy as np
    from seqanpy import align_ladder

    ref1 = sample.get_reference(fr1)
    ref2 = sample.get_reference(fr2)
    (_, ali1, ali2) = align_ladder(ref1, ref2, score_gapopen=-20)

    # Overlap boundaries, in alignment coordinates
    start2 = len(ali2) - len(ali2.lstrip('-'))
    end1 = len(ali1.rstrip('-'))

    # Walk the overlap column by column, keeping one running counter per
    # sequence; the first counter starts at the overlap start
    pairs = []
    pos1, pos2 = start2, 0
    for (char1, char2) in zip(ali1[start2: end1], ali2[start2: end1]):
        has1 = char1 != '-'
        has2 = char2 != '-'
        if has1 and has2:
            pairs.append((pos1, pos2))
        if has1:
            pos1 += 1
        if has2:
            pos2 += 1

    return np.array(pairs, int)
def join_block_to_consensus(consensus, cons_block, VERBOSE=0, deltamax=60):
    '''Glue a new consensus block onto the end of an existing consensus.

    Parameters:
       consensus: the consensus built so far
       cons_block: the new block to attach
       VERBOSE (int): at 2 or more warn when the block is skipped, at 3 or
         more pretty-print the alignment
       deltamax (int): largest accepted difference between 3 times the
         overlap length and the alignment score

    Returns:
       the extended consensus without gap characters, or the input
       consensus unchanged if the aligned block ends with a gap

    Raises:
       ValueError: if the difference described above exceeds deltamax
    '''
    # NOTE: numpy is not used below; the import is kept as in the original
    import numpy as np
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(consensus, cons_block, score_gapopen=-10)

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali1, ali2],
                                  name1='consensus',
                                  name2='new block')

    # In very rare occasions (coverage holes), the second sequence is actually
    # shorter than the first, then we do not need to glue it in
    block_ends_first = ali2[-1] == '-'
    if block_ends_first:
        if VERBOSE >= 2:
            print('WARNING: the old block is longer than the new one (maybe low coverage)')
        return consensus

    # Overlap boundaries, in alignment coordinates
    start2 = len(ali2) - len(ali2.lstrip('-'))
    end1 = len(ali1.rstrip('-'))

    # Compare the score with 3 points per overlap column
    delta = 3 * (end1 - start2) - score
    if delta > deltamax:
        message = ''.join(['Too many mismatches in neighbouring local consensi! (',
                           str(delta), ', max ', str(deltamax), ')'])
        raise ValueError(message)

    # Old consensus up to the overlap, new block from there on
    joined = ali1[:start2] + ali2[start2:]
    return joined.replace('-', '')
def align_fragments(c1, c2, VERBOSE=0):
    '''Align two subsequent fragments and map the positions of their overlap.

    The two sequences are ladder-aligned and the alignment is restricted to
    the overlap, i.e. the columns between the first non-gap column of the
    second sequence and the last non-gap column of the first one.

    Parameters:
       c1: first sequence (presumably a string, to be confirmed by callers)
       c2: second sequence
       VERBOSE (int): at 3 or more, pretty-print the aligned overlap

    Returns:
       (pos1, pos2): two integer numpy arrays of equal length, one entry per
       overlap column in which neither sequence has a gap. pos2 counts
       non-gap characters of the second sequence from the overlap start,
       pos1 does the same for the first sequence, offset by the overlap
       start in the alignment.
    '''
    import numpy as np
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    (score, a1, a2) = align_ladder(c1, c2, score_gapopen=-20)

    # Restrict both aligned sequences to the overlap
    start2 = len(a2) - len(a2.lstrip('-'))
    end1 = len(a1.rstrip('-'))
    a1 = a1[start2: end1]
    a2 = a2[start2: end1]

    if VERBOSE >= 3:
        # The fragment names are not known here, so use generic labels
        pretty_print_pairwise_ali((a1, a2), width=100,
                                  name1='seq1', name2='seq2')

    # Boolean masks of the non-gap columns of each sequence
    nogap1 = np.array([char != '-' for char in a1], bool)
    nogap2 = np.array([char != '-' for char in a2], bool)

    # Running index of the non-gap characters along the overlap
    co1 = nogap1.cumsum() - 1
    co2 = nogap2.cumsum() - 1

    # Keep only the columns where both sequences have a character
    ind = nogap1 & nogap2
    pos1 = co1[ind] + start2
    pos2 = co2[ind]

    return (pos1, pos2)
def join_block_to_consensus(consensus, cons_block, VERBOSE=0, deltamax=60):
    '''Glue a new consensus block onto the end of an existing consensus.

    Parameters:
       consensus: the consensus built so far
       cons_block: the new block to attach
       VERBOSE (int): at 2 or more warn when the block is skipped, at 3 or
         more pretty-print the alignment
       deltamax (int): largest accepted difference between 3 times the
         overlap length and the alignment score

    Returns:
       the extended consensus without gap characters, or the input
       consensus unchanged if the aligned block ends with a gap

    Raises:
       ValueError: if the difference described above exceeds deltamax
    '''
    # NOTE: numpy is not used below; the import is kept as in the original
    import numpy as np
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(consensus, cons_block, score_gapopen=-10)

    if VERBOSE >= 3:
        from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
        pretty_print_pairwise_ali([ali1, ali2],
                                  name1='consensus',
                                  name2='new block')

    # In very rare occasions (coverage holes), the second sequence is actually
    # shorter than the first, then we do not need to glue it in
    block_ends_first = ali2[-1] == '-'
    if block_ends_first:
        if VERBOSE >= 2:
            print('WARNING: the old block is longer than the new one (maybe low coverage)')
        return consensus

    # Overlap boundaries, in alignment coordinates
    start2 = len(ali2) - len(ali2.lstrip('-'))
    end1 = len(ali1.rstrip('-'))

    # Compare the score with 3 points per overlap column
    delta = 3 * (end1 - start2) - score
    if delta > deltamax:
        message = ''.join(['Too many mismatches in neighbouring local consensi! (',
                           str(delta), ', max ', str(deltamax), ')'])
        raise ValueError(message)

    # Old consensus up to the overlap, new block from there on
    joined = ali1[:start2] + ali2[start2:]
    return joined.replace('-', '')
def merge_sequences(seqs, skip_initial=30, accept_gaps=False, VERBOSE=0):
    '''Merge sequences with overlaps

    Consecutive sequences are ladder-aligned pairwise. Within each overlap,
    the first sequence is trusted up to the middle and the second one from
    there on.

    Parameters:
       seqs (list): sequences to merge; each one is joined into a string
       skip_initial (int): number of alignment columns skipped at both ends
         of each overlap, because we do not really trust those bases
       accept_gaps (bool): accept gaps in the overlaps
       VERBOSE (int): at 1 or more print progress, at 2 or more also
         pretty-print each checked overlap

    Returns:
       the merged sequence as a string. With a single input sequence that
       sequence is returned, with no sequences the empty string.

    Raises:
       ValueError: if accept_gaps is false and a checked overlap has gaps
    '''
    seqs = [''.join(seq) for seq in seqs]

    # With fewer than two sequences there is no overlap to align
    if len(seqs) < 2:
        return ''.join(seqs)

    from seqanpy import align_ladder

    left_trim = 0
    seqs_all = []
    for iov, (seq1, seq2) in enumerate(zip(seqs[:-1], seqs[1:])):
        if VERBOSE >= 1:
            print('Overlap n ' + str(iov + 1))

        # The head of seq1 was already added while handling the previous
        # overlap, so only its remainder is aligned
        (score, ali1, ali2) = align_ladder(seq1[left_trim:], seq2,
                                           score_gapopen=-20)
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))

        # Append first sequence until overlap
        seqs_all.append(ali1[:start2 + skip_initial])

        # Check overlap
        ov1 = ali1[start2 + skip_initial: end1 - skip_initial]
        ov2 = ali2[start2 + skip_initial: end1 - skip_initial]

        if VERBOSE >= 2:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            pretty_print_pairwise_ali((ov1, ov2), width=100,
                                      name1='seq1', name2='seq2')

        if (not accept_gaps) and (('-' in ov1) or ('-' in ov2)):
            raise ValueError('Gaps in the overlap n. ' + str(iov + 1))

        # Trust the first sequence until half, then the other one
        i_mid = len(ov1) // 2
        seqs_all.append(ov1[:i_mid])
        seqs_all.append(ov2[i_mid:])

        # Set the left trim for the trailing sequence: the number of its
        # characters already covered by what was appended above
        left_trim = len(ali2[: end1 - skip_initial].replace('-', ''))

    if VERBOSE >= 1:
        print('Add last sequence')
    seqs_all.append(seq2[left_trim:])

    return ''.join(seqs_all)
def merge_read_pair(seq1, seq2):
    '''Merge two reads of a pair, assuming the second starts later

    The reads are ladder-aligned. The merged sequence is made of the first
    aligned read up to the overlap, the overlap itself, and the second
    aligned read after the overlap. Overlap columns in which the two
    aligned reads differ are replaced by 'N'.

    Parameters:
       seq1: first read
       seq2: second read

    Returns:
       the merged sequence as a string
    '''
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(seq1, seq2, score_gapopen=-20)

    # Overlap boundaries, in alignment coordinates
    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))

    # Plain string comparison: no numpy needed for a column-wise check
    overlap = ''.join(char1 if char1 == char2 else 'N'
                      for (char1, char2) in zip(ali1[start2: end1],
                                                ali2[start2: end1]))

    return ali1[:start2] + overlap + ali2[end1:]
def merge_read_pair(seq1, seq2):
    '''Merge two reads of a pair, assuming the second starts later

    The reads are ladder-aligned. The merged sequence is made of the first
    aligned read up to the overlap, the overlap itself, and the second
    aligned read after the overlap. Overlap columns in which the two
    aligned reads differ are replaced by 'N'.

    Parameters:
       seq1: first read
       seq2: second read

    Returns:
       the merged sequence as a string
    '''
    from seqanpy import align_ladder

    (score, ali1, ali2) = align_ladder(seq1, seq2, score_gapopen=-20)

    # Overlap boundaries, in alignment coordinates
    end1 = len(ali1.rstrip('-'))
    start2 = len(ali2) - len(ali2.lstrip('-'))

    # Plain string comparison: no numpy needed for a column-wise check
    overlap = ''.join(char1 if char1 == char2 else 'N'
                      for (char1, char2) in zip(ali1[start2: end1],
                                                ali2[start2: end1]))

    return ali1[:start2] + overlap + ali2[end1:]
def check_reference_overlap(p, VERBOSE=0):
    '''Check whether the reference from the various fragments overlap correctly

    The references of fragments F1 to F6 are fetched via p.get_reference
    and each consecutive pair is ladder-aligned. An overlap is reported as
    'OK' if both aligned sequences contain the same number of gap
    characters within the overlap, and as 'GAPS' otherwise. A summary line
    is printed at the end.

    Parameters:
       p: object providing get_reference(fragment)
       VERBOSE (int): at 4 or more, pretty-print each aligned overlap

    Raises:
       ValueError: if any overlap has status 'GAPS'
    '''
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    fragments = ['F' + str(i + 1) for i in range(6)]

    # NOTE(review): title_len and cell_len are not defined in this function;
    # presumably they are module-level constants - confirm
    title = 'Overlaps'
    line = ('{:<' + str(title_len) + '}').format(title + ':')
    stati = []
    for i in range(len(fragments) - 1):
        ref1 = p.get_reference(fragments[i])
        ref2 = p.get_reference(fragments[i + 1])
        (score, ali1, ali2) = align_ladder(ref1, ref2,
                                           score_gapopen=-10,
                                           score_gapext=-1)

        # Overlap boundaries, in alignment coordinates
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))
        overlap1 = ali1[start2: end1]
        overlap2 = ali2[start2: end1]

        if VERBOSE >= 4:
            pretty_print_pairwise_ali((overlap1, overlap2),
                                      name1=fragments[i],
                                      name2=fragments[i + 1],
                                      width=100)

        # No interactive debugger here: a 'GAPS' status is reported through
        # the summary line and the ValueError raised at the end
        if overlap1.count('-') == overlap2.count('-'):
            status = 'OK'
        else:
            status = 'GAPS'

        line = line + fragments[i] + ': ' + \
            ('{:>' + str(cell_len - len(fragments[i]) - 1) + '}').format(status) + '  '
        stati.append(status)

    print(line)

    if 'GAPS' in stati:
        raise ValueError('GAPS status found')
def check_reference_overlap(p, VERBOSE=0):
    '''Check whether the reference from the various fragments overlap correctly

    The references of fragments F1 to F6 are fetched via p.get_reference
    and each consecutive pair is ladder-aligned. An overlap is reported as
    'OK' if both aligned sequences contain the same number of gap
    characters within the overlap, and as 'GAPS' otherwise. A summary line
    is printed at the end.

    Parameters:
       p: object providing get_reference(fragment)
       VERBOSE (int): at 4 or more, pretty-print each aligned overlap

    Raises:
       ValueError: if any overlap has status 'GAPS'
    '''
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    fragments = ['F' + str(i + 1) for i in range(6)]

    # NOTE(review): title_len and cell_len are not defined in this function;
    # presumably they are module-level constants - confirm
    title = 'Overlaps'
    line = ('{:<' + str(title_len) + '}').format(title + ':')
    stati = []
    for i in range(len(fragments) - 1):
        ref1 = p.get_reference(fragments[i])
        ref2 = p.get_reference(fragments[i + 1])
        (score, ali1, ali2) = align_ladder(ref1, ref2,
                                           score_gapopen=-10,
                                           score_gapext=-1)

        # Overlap boundaries, in alignment coordinates
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))
        overlap1 = ali1[start2: end1]
        overlap2 = ali2[start2: end1]

        if VERBOSE >= 4:
            pretty_print_pairwise_ali((overlap1, overlap2),
                                      name1=fragments[i],
                                      name2=fragments[i + 1],
                                      width=100)

        # No interactive debugger here: a 'GAPS' status is reported through
        # the summary line and the ValueError raised at the end
        if overlap1.count('-') == overlap2.count('-'):
            status = 'OK'
        else:
            status = 'GAPS'

        line = line + fragments[i] + ': ' + \
            ('{:>' + str(cell_len - len(fragments[i]) - 1) + '}').format(status) + '  '
        stati.append(status)

    print(line)

    if 'GAPS' in stati:
        raise ValueError('GAPS status found')
def merge_sequences_fragments(seqs, VERBOSE=0):
    '''Merge sequences from consecutive fragments

    Each new sequence is ladder-aligned against the tail left over from the
    previous step. Within the overlap, the first sequence is trusted in the
    first half and the second sequence in the second half.

    Parameters:
       seqs (list): sequences to merge; each one is joined into a string
       VERBOSE (int): not used by this function

    Returns:
       the merged sequence as a string. With a single input sequence that
       sequence is returned, with no sequences the empty string.
    '''
    seqs = [''.join(s) for s in seqs]

    # With fewer than two sequences there is nothing to align
    if len(seqs) < 2:
        return ''.join(seqs)

    from seqanpy import align_ladder

    seq = [seqs[0]]
    for seq2 in seqs[1:]:
        # The last piece is the still unmerged tail of the previous sequence
        seq1 = seq[-1]
        (score, ali1, ali2) = align_ladder(seq1, seq2, score_gapopen=-20)
        start2 = len(ali2) - len(ali2.lstrip('-'))
        end1 = len(ali1.rstrip('-'))
        len_overlap = end1 - start2

        # Trust the first sequence in the first half, the second in the second
        # (floor division, so the slice index is always an integer)
        i_mid = start2 + len_overlap // 2
        overlap = ali1[start2: i_mid] + ali2[i_mid: end1]

        # Replace the tail by its part before the overlap, then add the
        # overlap and the rest of the new sequence
        seq[-1] = ali1[:start2]
        seq.append(overlap)
        seq.append(ali2[end1:])

    return ''.join(seq)
def merge_fragments(sequences, name='', VERBOSE=0):
    '''Merge references at overlapping pairs

    The sequences of fragments F1 to F6 are joined one after the other.
    Each new fragment is ladder-aligned against what is left of the
    previous one. Mismatches in the overlap are resolved by position: the
    first sequence is kept in the first third, 'N' is used in the middle
    third, and the second sequence is taken in the last third. Overlaps
    shorter than 50 alignment columns are not merged: the two sequences are
    joined with 10 'N' in between.

    Parameters:
       sequences: mapping with keys 'F1' to 'F6', each value a sequence
       name (str): used as id, name and description of the record
       VERBOSE (int): at 3 or more, pretty-print each pairwise alignment

    Returns:
       SeqRecord with the merged sequence
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    consensus = []
    seq_old = ''.join(sequences['F1'])
    for i in range(5):
        seq_new = ''.join(sequences['F' + str(i + 2)])
        (score, ali1, ali2) = align_ladder(seq_old, seq_new, score_gapopen=-10)

        if VERBOSE >= 3:
            pretty_print_pairwise_ali([ali1, ali2],
                                      name1='F' + str(i + 1),
                                      name2='F' + str(i + 2))

        # Overlap: the first sequence is better at the start, the second at the end
        end1 = len(ali1.rstrip('-'))
        start2 = len(ali2) - len(ali2.lstrip('-'))
        len_overlap = end1 - start2

        # The overlap might be too short, just join the sequences with N
        if len_overlap < 50:
            consensus.append(seq_old)
            consensus.append('N' * 10)
            if i == 4:
                consensus.append(seq_new)
            else:
                seq_old = seq_new
            continue

        # Resolve the overlap column by column; only mismatches are touched
        first_third = len_overlap // 3
        second_third = 2 * len_overlap // 3
        overlap = []
        for j, (char1, char2) in enumerate(zip(ali1[start2: end1],
                                               ali2[start2: end1])):
            if (char1 == char2) or (j < first_third):
                overlap.append(char1)
            elif j < second_third:
                overlap.append('N')
            else:
                overlap.append(char2)

        consensus.append(ali1[:start2])
        consensus.append(''.join(overlap))
        if i == 4:
            consensus.append(ali2[end1:])
        else:
            # Only the part after the overlap is aligned to the next fragment
            seq_old = ali2[end1:].replace('-', '')

    consensus = ''.join(consensus)
    # Use the name that is actually imported above
    cons_rec = SeqRecord(Seq(consensus, ambiguous_dna),
                         id=name, name=name,
                         description=name + ', genomewide')

    return cons_rec
# Usage examples for the pairwise alignment functions.
# NOTE(review): `sap` is not defined in this snippet; presumably it is the
# seqanpy module imported under that alias elsewhere - confirm.

# Global pairwise alignment
seq1 = 'AAAGGTCTA'
seq2 = 'AAATCGA'
output = sap.align_global(seq1, seq2, band=5)
print(output)

# Overlap pairwise alignment
seq1 = 'AAAGGTCTA'
seq2 = 'ATCT'
output = sap.align_overlap(seq1, seq2)
print(output)

# Overlap pairwise alignment cutting flanks
seq1 = 'AAAGGTCTA'
seq2 = 'ATCT'
output = sap.align_overlap(seq1, seq2, cut_flanks=True)
print(output)

# Ladder pairwise alignment
seq1 = 'AAAGGTCTA'
seq2 = 'TCTAGGGAAACCC'
output = sap.align_ladder(seq1, seq2)
print(output)

# Local pairwise alignment
seq1 = 'AAAGGTCTACCGTAGCCT'
seq2 = 'AAGTCTAC'
output = sap.align_local(seq1, seq2)
print(output)
def merge_fragments(sequences, name='', VERBOSE=0):
    '''Merge references at overlapping pairs

    The sequences of fragments F1 to F6 are joined one after the other.
    Each new fragment is ladder-aligned against what is left of the
    previous one. Mismatches in the overlap are resolved by position: the
    first sequence is kept in the first third, 'N' is used in the middle
    third, and the second sequence is taken in the last third. Overlaps
    shorter than 50 alignment columns are not merged: the two sequences are
    joined with 10 'N' in between.

    Parameters:
       sequences: mapping with keys 'F1' to 'F6', each value a sequence
       name (str): used as id, name and description of the record
       VERBOSE (int): at 3 or more, pretty-print each pairwise alignment

    Returns:
       SeqRecord with the merged sequence
    '''
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio.Alphabet.IUPAC import ambiguous_dna
    from seqanpy import align_ladder
    from hivwholeseq.utils.sequence import pretty_print_pairwise_ali

    consensus = []
    seq_old = ''.join(sequences['F1'])
    for i in range(5):
        seq_new = ''.join(sequences['F' + str(i + 2)])
        (score, ali1, ali2) = align_ladder(seq_old, seq_new, score_gapopen=-10)

        if VERBOSE >= 3:
            pretty_print_pairwise_ali([ali1, ali2],
                                      name1='F' + str(i + 1),
                                      name2='F' + str(i + 2))

        # Overlap: the first sequence is better at the start, the second at the end
        end1 = len(ali1.rstrip('-'))
        start2 = len(ali2) - len(ali2.lstrip('-'))
        len_overlap = end1 - start2

        # The overlap might be too short, just join the sequences with N
        if len_overlap < 50:
            consensus.append(seq_old)
            consensus.append('N' * 10)
            if i == 4:
                consensus.append(seq_new)
            else:
                seq_old = seq_new
            continue

        # Resolve the overlap column by column; only mismatches are touched
        first_third = len_overlap // 3
        second_third = 2 * len_overlap // 3
        overlap = []
        for j, (char1, char2) in enumerate(zip(ali1[start2: end1],
                                               ali2[start2: end1])):
            if (char1 == char2) or (j < first_third):
                overlap.append(char1)
            elif j < second_third:
                overlap.append('N')
            else:
                overlap.append(char2)

        consensus.append(ali1[:start2])
        consensus.append(''.join(overlap))
        if i == 4:
            consensus.append(ali2[end1:])
        else:
            # Only the part after the overlap is aligned to the next fragment
            seq_old = ali2[end1:].replace('-', '')

    consensus = ''.join(consensus)
    # Use the name that is actually imported above
    cons_rec = SeqRecord(Seq(consensus, ambiguous_dna),
                         id=name, name=name,
                         description=name + ', genomewide')

    return cons_rec
def overlap_ladder():
    '''Check align_ladder on a small pair of overlapping sequences.

    The first aligned sequence is expected to be padded with gaps at the
    end, the second one at the beginning.
    '''
    print('Test align_ladder')
    import seqanpy

    result = seqanpy.align_ladder('ACCGT', 'CGTAA')
    (_, aligned_first, aligned_second) = result

    assert aligned_first == 'ACCGT--'
    assert aligned_second == '--CGTAA'