def AlignExhaustive(seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                    diag_width=2):
    """Align seq_wobble against seq_cds and store the result in map_p2c.

    Two strategies are used, selected by the length of seq_wobble:

    * fewer than 10000 residues: a full dynamic-programming alignment
      (global mode) is run.
    * otherwise: both sequences are assumed to be roughly the same, so
      only residue pairs close to the main diagonal are collected as
      "dots" and a dot alignment is run on them, which keeps memory low.

    Side effect: the back-translation substitution matrix built here is
    installed as the alignlib_lite default substitution matrix.

    Arguments:
        seq_wobble: sequence object supplying the rows of the alignment
            (getLength() and asResidue() are called on it).
        seq_cds: sequence object supplying the columns of the alignment.
        seq_peptide: not used by this function.
        map_p2c: alignment object that receives the result; it is
            passed to alignator.align().
        options: object with a numeric ``loglevel`` and a writable
            ``stdlog``; progress is logged when loglevel >= 6.
        diag_width: the value passed in is never read. On the dot path
            it is overwritten with abs(length difference) + 1; the full
            DP path does not use it at all.
    """

    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())
    alignlib_lite.py_setDefaultSubstitutionMatrix(matrix)

    verbose = options.loglevel >= 6
    lwobble = seq_wobble.getLength()

    if lwobble < 10000:
        if verbose:
            options.stdlog.write("# using full dynamic programing matrix.\n")
            options.stdlog.flush()
        # do not penalize gaps at the end, because sometimes the last codon
        # might be missing
        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_GLOBAL, gop, gep, 1, 1)
    else:
        lcds = seq_cds.getLength()

        # the band has to be wide enough to absorb the length difference
        diag_width = abs(lwobble - lcds) + 1
        if verbose:
            options.stdlog.write(
                "# using dot alignment with diagonal %i\n" % diag_width)
            options.stdlog.flush()

        dots = alignlib_lite.py_makeAlignmentMatrixRow()

        # collect all non-negatively scoring residue pairs within the band
        for x in range(lwobble):
            xr = seq_wobble.asResidue(x)
            for y in range(max(0, x - diag_width),
                           min(lcds, x + diag_width)):
                s = matrix.getValue(xr, seq_cds.asResidue(y))
                if s >= 0:
                    dots.addPair(x, y, float(s))

        if verbose:
            # terminate the record with a newline like all other log
            # messages, so that later output starts on its own line
            options.stdlog.write(
                "# finished adding %i dots\n" % dots.getLength())
            options.stdlog.flush()

        alignator_dummy = alignlib_lite.py_makeAlignatorPrebuilt(dots)
        alignator = alignlib_lite.py_makeAlignatorDots(
            alignator_dummy, gop, gep)

    alignator.align(map_p2c, seq_wobble, seq_cds)
def AlignExhaustive(seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                    diag_width=2):
    """Align seq_wobble against seq_cds and store the result in map_p2c.

    Two strategies are used, selected by the length of seq_wobble:

    * fewer than 10000 residues: a full dynamic-programming alignment
      (global mode) is run.
    * otherwise: both sequences are assumed to be roughly the same, so
      only residue pairs close to the main diagonal are collected as
      "dots" and a dot alignment is run on them, which keeps memory low.

    Side effect: the back-translation substitution matrix built here is
    installed as the alignlib_lite default substitution matrix.

    Arguments:
        seq_wobble: sequence object supplying the rows of the alignment
            (getLength() and asResidue() are called on it).
        seq_cds: sequence object supplying the columns of the alignment.
        seq_peptide: not used by this function.
        map_p2c: alignment object that receives the result; it is
            passed to alignator.align().
        options: object with a numeric ``loglevel`` and a writable
            ``stdlog``; progress is logged when loglevel >= 6.
        diag_width: the value passed in is never read. On the dot path
            it is overwritten with abs(length difference) + 1; the full
            DP path does not use it at all.
    """

    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())
    alignlib_lite.py_setDefaultSubstitutionMatrix(matrix)

    verbose = options.loglevel >= 6
    lwobble = seq_wobble.getLength()

    if lwobble < 10000:
        if verbose:
            options.stdlog.write("# using full dynamic programing matrix.\n")
            options.stdlog.flush()
        # do not penalize gaps at the end, because sometimes the last codon
        # might be missing
        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_GLOBAL, gop, gep, 1, 1)
    else:
        lcds = seq_cds.getLength()

        # the band has to be wide enough to absorb the length difference
        diag_width = abs(lwobble - lcds) + 1
        if verbose:
            options.stdlog.write(
                "# using dot alignment with diagonal %i\n" % diag_width)
            options.stdlog.flush()

        dots = alignlib_lite.py_makeAlignmentMatrixRow()

        # collect all non-negatively scoring residue pairs within the band
        for x in range(lwobble):
            xr = seq_wobble.asResidue(x)
            for y in range(max(0, x - diag_width),
                           min(lcds, x + diag_width)):
                s = matrix.getValue(xr, seq_cds.asResidue(y))
                if s >= 0:
                    dots.addPair(x, y, float(s))

        if verbose:
            # terminate the record with a newline like all other log
            # messages, so that later output starts on its own line
            options.stdlog.write(
                "# finished adding %i dots\n" % dots.getLength())
            options.stdlog.flush()

        alignator_dummy = alignlib_lite.py_makeAlignatorPrebuilt(dots)
        alignator = alignlib_lite.py_makeAlignatorDots(
            alignator_dummy, gop, gep)

    alignator.align(map_p2c, seq_wobble, seq_cds)
def AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                    diag_width=2, max_advance=2):
    """Advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Both sequences are walked in step, residue by residue, and each
    matching pair is added to map_p2c (which is cleared first). When a
    residue pair does not score positively, a window of up to 30
    residues starting about 15 residues upstream is re-aligned with
    AlignExhaustive() and the affected region of map_p2c is rebuilt from
    that local alignment. This takes care of frameshifts.

    NOTE(review): the previous docstring stated that, due to alignlib,
    everything is in one-based coordinates, but the positions used here
    start at 0 - confirm which is correct.

    Arguments:
        seq_wobble: back-translated ("wobble") sequence object; the
            characters "X", "N" and "-" in it are skipped.
        seq_cds: nucleotide sequence object to match against.
        seq_peptide: peptide sequence object; only used for debug
            logging (loglevel >= 6).
        map_p2c: alignment object that is cleared and then filled.
        options: object with a numeric ``loglevel`` and a writable
            ``stdlog``.
        diag_width: not used by this function.
        max_advance: not used by this function.

    Raises:
        ValueError: if the same window would be re-aligned twice in a
            row ("infinite loop detected"), if a re-aligned or stepped
            residue pair scores negatively, or if the local alignment
            is too short and more than the last codon is still missing.
        AssertionError: if the final alignment extends beyond either
            sequence.
    """

    map_p2c.clear()

    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    y = 0
    x = 0

    # start of the most recent re-alignment window, used to detect
    # when the same window would be processed over and over again
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue(x)

        # skip over characters in wobble that carry no sequence:
        # "X": masked chars - these are gaps
        # "N": masked chars that stem from masked chars in the peptide
        #      sequence
        #      Note to self: do not see all implications of this change
        #      check later.
        # "-": gaps
        if seq_wobble.asChar(x) in ("X", "N", "-"):
            x += 1
            continue

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if options.loglevel >= 6:
            # Report the codon only if it lies completely within the cds
            # and the peptide; the last codon of the cds can be
            # incomplete and must not be read beyond the sequence end.
            if x % 3 == 0 and y + 2 < lcds and x // 3 < len(pep_seq):
                c = seq_cds.asChar(y) + seq_cds.asChar(y + 1) + \
                    seq_cds.asChar(y + 2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" %
                    (c, x, y, Genomics.MapCodon2AA(c), pep_seq[x // 3]))

            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" %
                (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                 xr, seq_cds.asResidue(y), str(s)))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            # backtrack to previous three codons and align
            # three codons for double frameshifts that span two codons and
            # produce two X's and six WWWWWW.

            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx
            dx = (x % 3) + d

            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(
                0, map_p2c.mapRowToCol(x_start, alignlib_lite.py_RIGHT))

            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")

            last_start = (x_start, y_start)

            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))

            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])

            AlignExhaustive(wobble_fragment, cds_fragment, "",
                            tmp_map_p2c, options)

            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" %
                    (x_start, x_end, y_start, y_end,
                     str(alignlib_lite.py_AlignmentFormatExplicit(
                         tmp_map_p2c, wobble_fragment, cds_fragment))))
                options.stdlog.flush()

            # clear alignment
            map_p2c.removeRowRegion(x_start, x_end)

            # copy the local alignment back; fragment positions are
            # relative to the window start
            ngap = 0
            last_x, last_y = None, None
            for xxx in range(tmp_map_p2c.getRowFrom(),
                             tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(xr, seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), cds: %i (%s)" %
                            (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                            (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                             xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause
                # a peptide/wobble residue to be eliminated and thus the
                # translated sequences will differ.
                # simply delete the last residue between x and y and move to
                # next codon.
                if ngap == 3:
                    map_p2c.removeRowRegion(last_x, last_x + 1)

                    last_x += 1
                    map_p2c.addPair(last_x, last_y)
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                            (last_x, seq_wobble.asChar(last_x), last_y,
                             seq_cds.asChar(last_y), xr,
                             seq_cds.asResidue(last_y), s))
                        options.stdlog.flush()
                    ngap = 0

            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

        # (re-)score the current pair; after a re-alignment x, y and xr
        # refer to the last pair taken from the local alignment
        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair(x, y, float(s))

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert (map_p2c.getRowTo() <= seq_wobble.getLength())
    assert (map_p2c.getColTo() <= seq_cds.getLength())
def AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                    diag_width=2, max_advance=2):
    """Advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Both sequences are walked in step, residue by residue, and each
    matching pair is added to map_p2c (which is cleared first). When a
    residue pair does not score positively, a window of up to 30
    residues starting about 15 residues upstream is re-aligned with
    AlignExhaustive() and the affected region of map_p2c is rebuilt from
    that local alignment. This takes care of frameshifts.

    NOTE(review): the previous docstring stated that, due to alignlib,
    everything is in one-based coordinates, but the positions used here
    start at 0 - confirm which is correct.

    Arguments:
        seq_wobble: back-translated ("wobble") sequence object; the
            characters "X", "N" and "-" in it are skipped.
        seq_cds: nucleotide sequence object to match against.
        seq_peptide: peptide sequence object; only used for debug
            logging (loglevel >= 6).
        map_p2c: alignment object that is cleared and then filled.
        options: object with a numeric ``loglevel`` and a writable
            ``stdlog``.
        diag_width: not used by this function.
        max_advance: not used by this function.

    Raises:
        ValueError: if the same window would be re-aligned twice in a
            row ("infinite loop detected"), if a re-aligned or stepped
            residue pair scores negatively, or if the local alignment
            is too short and more than the last codon is still missing.
        AssertionError: if the final alignment extends beyond either
            sequence.
    """

    map_p2c.clear()

    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    y = 0
    x = 0

    # start of the most recent re-alignment window, used to detect
    # when the same window would be processed over and over again
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue(x)

        # skip over characters in wobble that carry no sequence:
        # "X": masked chars - these are gaps
        # "N": masked chars that stem from masked chars in the peptide
        #      sequence
        #      Note to self: do not see all implications of this change
        #      check later.
        # "-": gaps
        if seq_wobble.asChar(x) in ("X", "N", "-"):
            x += 1
            continue

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if options.loglevel >= 6:
            # Report the codon only if it lies completely within the cds
            # and the peptide; the last codon of the cds can be
            # incomplete and must not be read beyond the sequence end.
            if x % 3 == 0 and y + 2 < lcds and x // 3 < len(pep_seq):
                c = seq_cds.asChar(y) + seq_cds.asChar(y + 1) + \
                    seq_cds.asChar(y + 2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" %
                    (c, x, y, Genomics.MapCodon2AA(c), pep_seq[x // 3]))

            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" %
                (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                 xr, seq_cds.asResidue(y), str(s)))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            # backtrack to previous three codons and align
            # three codons for double frameshifts that span two codons and
            # produce two X's and six WWWWWW.

            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx
            dx = (x % 3) + d

            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(
                0, map_p2c.mapRowToCol(x_start, alignlib_lite.py_RIGHT))

            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")

            last_start = (x_start, y_start)

            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))

            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])

            AlignExhaustive(wobble_fragment, cds_fragment, "",
                            tmp_map_p2c, options)

            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" %
                    (x_start, x_end, y_start, y_end,
                     str(alignlib_lite.py_AlignmentFormatExplicit(
                         tmp_map_p2c, wobble_fragment, cds_fragment))))
                options.stdlog.flush()

            # clear alignment
            map_p2c.removeRowRegion(x_start, x_end)

            # copy the local alignment back; fragment positions are
            # relative to the window start
            ngap = 0
            last_x, last_y = None, None
            for xxx in range(tmp_map_p2c.getRowFrom(),
                             tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(xr, seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), cds: %i (%s)" %
                            (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                            (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                             xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause
                # a peptide/wobble residue to be eliminated and thus the
                # translated sequences will differ.
                # simply delete the last residue between x and y and move to
                # next codon.
                if ngap == 3:
                    map_p2c.removeRowRegion(last_x, last_x + 1)

                    last_x += 1
                    map_p2c.addPair(last_x, last_y)
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                            (last_x, seq_wobble.asChar(last_x), last_y,
                             seq_cds.asChar(last_y), xr,
                             seq_cds.asResidue(last_y), s))
                        options.stdlog.flush()
                    ngap = 0

            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

        # (re-)score the current pair; after a re-alignment x, y and xr
        # refer to the last pair taken from the local alignment
        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair(x, y, float(s))

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert (map_p2c.getRowTo() <= seq_wobble.getLength())
    assert (map_p2c.getColTo() <= seq_cds.getLength())