def main(argv=None):
    """Script entry point: turn FASTA records on stdin into alignment rows.

    For every record read from ``sys.stdin`` the sequence (with blanks
    removed) is aligned position by position against a string of ``X`` of the
    same length, and the resulting alignment is written in blocks format to
    ``options.stdout`` as ``title<TAB>ref<TAB>blocks``.

    Command line options are parsed from ``sys.argv`` unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])
    parser.set_defaults()

    (options, args) = E.Start(parser)

    records = FastaIterator.FastaIterator(sys.stdin)

    # nskipped is reported in the summary but nothing here ever skips.
    ninput, noutput, nskipped = 0, 0, 0

    columns = ("query", "sbjct",
               "query_from", "query_to",
               "sbjct_from", "sbjct_to",
               "query_starts", "sbjct_starts", "block_sizes")
    options.stdout.write("\t".join(columns) + "\n")

    while True:
        try:
            record = records.next()
        except StopIteration:
            break

        ninput += 1

        residues = re.sub(" ", "", record.sequence)
        placeholder = "X" * len(residues)

        alignment = alignlib_lite.py_makeAlignmentVector()
        explicit = alignlib_lite.py_AlignmentFormatExplicit(
            0, residues, 0, placeholder)
        explicit.copy(alignment)

        blocks = str(alignlib_lite.py_AlignmentFormatBlocks(alignment))
        fields = (record.title, "ref", blocks)
        options.stdout.write("\t".join(fields) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        summary = (ninput, noutput, nskipped)
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i.\n" % summary)

    E.Stop()
def main(argv=None):
    """Script entry point: turn FASTA records on stdin into alignment rows.

    Each record read from ``sys.stdin`` has the blanks stripped from its
    sequence, is aligned against an equally long run of ``X`` and is written
    to ``options.stdout`` as ``title<TAB>ref<TAB>blocks``.

    Command line options are parsed from ``sys.argv`` unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: sequence2alignment.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])
    parser.set_defaults()

    (options, args) = E.Start(parser)

    fasta = FastaIterator.FastaIterator(sys.stdin)

    # counters for the summary line; nskipped is never incremented here
    ninput = 0
    noutput = 0
    nskipped = 0

    header_fields = ["query", "sbjct", "query_from", "query_to",
                     "sbjct_from", "sbjct_to", "query_starts",
                     "sbjct_starts", "block_sizes"]
    options.stdout.write("\t".join(header_fields) + "\n")

    while True:
        try:
            entry = fasta.next()
        except StopIteration:
            break
        ninput += 1

        cleaned = re.sub(" ", "", entry.sequence)

        map_sequence2mali = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_AlignmentFormatExplicit(
            0, cleaned, 0, "X" * len(cleaned)).copy(map_sequence2mali)

        row = "\t".join((
            entry.title,
            "ref",
            str(alignlib_lite.py_AlignmentFormatBlocks(map_sequence2mali))))
        options.stdout.write(row + "\n")
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i.\n" %
            (ninput, noutput, nskipped))

    E.Stop()
def PrintPrettyAlignment( seq_wobble, seq_cds, seq_pep, map_p2c, options ):
    """Write a three-row view of the wobble/cds alignment to ``options.stdlog``.

    The rows are the aligned wobble sequence, the peptide residues and the
    aligned cds. Every output line is prefixed with ``#``; a chunk is flushed
    once more than 120 fragments have accumulated and the fragment count is a
    multiple of three, and whatever is left is flushed at the end.
    """
    explicit = alignlib_lite.py_AlignmentFormatExplicit( map_p2c, seq_wobble, seq_cds )
    wobble_row = explicit.mRowAlignment
    cds_row = explicit.mColAlignment

    def emit(wobble_parts, peptide_parts, cds_parts):
        # one chunk: wobble, peptide, cds, then an empty comment line
        for parts in (wobble_parts, peptide_parts, cds_parts):
            options.stdlog.write( "#" + "".join(parts) + "\n" )
        options.stdlog.write( "#\n" )

    consumed = 0        # non-gap wobble characters seen so far
    next_residue = 0    # index of the next peptide residue to print
    out_wobble, out_cds, out_pep = [], [], []

    for column, wobble_char in enumerate(wobble_row):
        # at a codon boundary: print the residue and pad the other two rows
        if consumed % 3 == 0 and next_residue < len(seq_pep):
            out_pep.append( " %s " % seq_pep[next_residue] )
            out_wobble.append( " " )
            out_cds.append( " " )
            next_residue += 1

        out_wobble.append( wobble_char )
        out_cds.append( cds_row[column] )

        if wobble_char != "-":
            consumed += 1

        if len(out_wobble) > 120 and len(out_wobble) % 3 == 0:
            emit(out_wobble, out_pep, out_cds)
            out_wobble, out_cds, out_pep = [], [], []

    emit(out_wobble, out_pep, out_cds)
def printPrettyAlignment(seq_wobble, seq_cds, seq_pep, map_p2c, options):
    """Write a three-row view of the wobble/cds alignment to ``options.stdlog``.

    The rows are the aligned wobble sequence, the peptide residues and the
    aligned cds. Every output line is prefixed with ``#``; a chunk is flushed
    once more than 120 fragments have accumulated and the fragment count is a
    multiple of three, and whatever is left is flushed at the end.
    """
    formatted = alignlib_lite.py_AlignmentFormatExplicit(
        map_p2c, seq_wobble, seq_cds)
    wobble_row, cds_row = formatted.mRowAlignment, formatted.mColAlignment

    log = options.stdlog

    def flush_chunk(wobble_parts, peptide_parts, cds_parts):
        # one chunk: wobble, peptide, cds, then an empty comment line
        log.write("#" + "".join(wobble_parts) + "\n")
        log.write("#" + "".join(peptide_parts) + "\n")
        log.write("#" + "".join(cds_parts) + "\n")
        log.write("#\n")

    n_nongap = 0    # non-gap wobble characters seen so far
    n_residues = 0  # peptide residues already printed
    row_w, row_c, row_p = [], [], []

    for idx, base in enumerate(wobble_row):
        # at a codon boundary: print the residue and pad the other two rows
        if n_nongap % 3 == 0 and n_residues < len(seq_pep):
            row_p.append(" %s " % seq_pep[n_residues])
            row_w.append(" ")
            row_c.append(" ")
            n_residues += 1

        row_w.append(base)
        row_c.append(cds_row[idx])

        if base != "-":
            n_nongap += 1

        if len(row_w) > 120 and len(row_w) % 3 == 0:
            flush_chunk(row_w, row_p, row_c)
            row_w, row_c, row_p = [], [], []

    flush_chunk(row_w, row_p, row_c)
def AlignPair(pair, anchor=0):
    """Align the two intron sequences held by *pair* and store the result.

    The aligner is selected by the module-level ``param_method``
    (``dialigned``, ``dialignedlgs``, ``dbaligned``; ``clusaligned`` is
    rejected). Any other value runs no aligner, so the alignment stays empty
    and the function returns False.

    Arguments:
        pair: object carrying ``mToken1/2``, ``mIntronId1/2`` and
            ``mAlignedSequence1/2``. On success its ``mFrom*``, ``mTo*``,
            ``mAlignedSequence*``, ``mMethod``, ``mNumGaps``, ``mLength`` and
            ``mAligned`` attributes are overwritten.
        anchor: number of ``A`` residues added to both ends of both sequences
            before aligning; the anchored regions are removed from the
            alignment afterwards.

    Returns:
        True if a non-empty alignment was stored in *pair*, False otherwise.

    Raises:
        NotImplementedError: if ``param_method`` is ``clusaligned``.
    """
    map_intron_a2b = alignlib_lite.py_makeAlignmentVector()

    if param_loglevel >= 1:
        # single-argument print(...) gives the same output on Python 2 and 3
        print("# aligning %s-%i with %s-%i: lengths %i and %i" % (
            pair.mToken1, pair.mIntronId1,
            pair.mToken2, pair.mIntronId2,
            len(pair.mAlignedSequence1), len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    s1 = "A" * anchor + pair.mAlignedSequence1 + "A" * anchor
    s2 = "A" * anchor + pair.mAlignedSequence2 + "A" * anchor

    if param_method == "dialigned":
        dialign.Align(s1, s2, map_intron_a2b)
    elif param_method == "dialignedlgs":
        dialignlgs.Align(s1, s2, map_intron_a2b)
    elif param_method == "dbaligned":
        dba.Align(s1, s2, map_intron_a2b)
    elif param_method == "clusaligned":
        # The clustal call that used to follow this raise could never run.
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # strip the anchors and shift the coordinates back to the
        # un-anchored sequences
        map_intron_a2b.removeRowRegion(
            anchor + len(pair.mAlignedSequence1) + 1,
            map_intron_a2b.getRowTo())
        map_intron_a2b.removeRowRegion(1, anchor)
        map_intron_a2b.removeColRegion(
            anchor + len(pair.mAlignedSequence2) + 1,
            map_intron_a2b.getColTo())
        map_intron_a2b.removeColRegion(1, anchor)
        map_intron_a2b.moveAlignment(-anchor, -anchor)

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib_lite.py_makeSequence(pair.mAlignedSequence1)
    seq2 = alignlib_lite.py_makeSequence(pair.mAlignedSequence2)

    data = alignlib_lite.py_AlignmentFormatExplicit(map_intron_a2b, seq1, seq2)

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = \
        data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = \
        data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps = map_intron_a2b.getNumGaps()
    pair.mLength = map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print("# alignment success %s %s" % (
            pair.mAlignedSequence1, pair.mAlignedSequence2))

    return True
def AlignPair(pair, anchor=0):
    """Align the two intron sequences held by *pair* and store the result.

    The aligner is selected by the module-level ``param_method``
    (``dialigned``, ``dialignedlgs``, ``dbaligned``; ``clusaligned`` is
    rejected). Any other value runs no aligner, so the alignment stays empty
    and the function returns False.

    Arguments:
        pair: object carrying ``mToken1/2``, ``mIntronId1/2`` and
            ``mAlignedSequence1/2``. On success its ``mFrom*``, ``mTo*``,
            ``mAlignedSequence*``, ``mMethod``, ``mNumGaps``, ``mLength`` and
            ``mAligned`` attributes are overwritten.
        anchor: number of ``A`` residues added to both ends of both sequences
            before aligning; the anchored regions are removed from the
            alignment afterwards.

    Returns:
        True if a non-empty alignment was stored in *pair*, False otherwise.

    Raises:
        NotImplementedError: if ``param_method`` is ``clusaligned``.
    """
    map_intron_a2b = alignlib_lite.py_makeAlignmentVector()

    if param_loglevel >= 1:
        # single-argument print(...) gives the same output on Python 2 and 3
        print("# aligning %s-%i with %s-%i: lengths %i and %i" %
              (pair.mToken1, pair.mIntronId1, pair.mToken2, pair.mIntronId2,
               len(pair.mAlignedSequence1), len(pair.mAlignedSequence2)))
        sys.stdout.flush()

    s1 = "A" * anchor + pair.mAlignedSequence1 + "A" * anchor
    s2 = "A" * anchor + pair.mAlignedSequence2 + "A" * anchor

    if param_method == "dialigned":
        dialign.Align(s1, s2, map_intron_a2b)
    elif param_method == "dialignedlgs":
        dialignlgs.Align(s1, s2, map_intron_a2b)
    elif param_method == "dbaligned":
        dba.Align(s1, s2, map_intron_a2b)
    elif param_method == "clusaligned":
        # The clustal call that used to follow this raise could never run.
        raise NotImplementedError("clustalw wrapper not up-to-date")

    if anchor:
        # strip the anchors and shift the coordinates back to the
        # un-anchored sequences
        map_intron_a2b.removeRowRegion(
            anchor + len(pair.mAlignedSequence1) + 1,
            map_intron_a2b.getRowTo())
        map_intron_a2b.removeRowRegion(1, anchor)
        map_intron_a2b.removeColRegion(
            anchor + len(pair.mAlignedSequence2) + 1,
            map_intron_a2b.getColTo())
        map_intron_a2b.removeColRegion(1, anchor)
        map_intron_a2b.moveAlignment(-anchor, -anchor)

    if map_intron_a2b.getLength() == 0:
        if param_loglevel >= 1:
            print("# Error: empty intron alignment")
        return False

    seq1 = alignlib_lite.py_makeSequence(pair.mAlignedSequence1)
    seq2 = alignlib_lite.py_makeSequence(pair.mAlignedSequence2)

    data = alignlib_lite.py_AlignmentFormatExplicit(map_intron_a2b, seq1, seq2)

    pair.mFrom1, pair.mAlignedSequence1, pair.mTo1 = \
        data.mRowFrom, data.mRowAlignment, data.mRowTo
    pair.mFrom2, pair.mAlignedSequence2, pair.mTo2 = \
        data.mColFrom, data.mColAlignment, data.mColTo
    pair.mMethod = param_method

    pair.mNumGaps, pair.mLength = \
        map_intron_a2b.getNumGaps(), map_intron_a2b.getLength()
    pair.mAligned = pair.mLength - pair.mNumGaps

    if param_loglevel >= 2:
        print("# alignment success %s %s" %
              (pair.mAlignedSequence1, pair.mAlignedSequence2))

    return True
def AlignCodonBased( seq_wobble, seq_cds, seq_peptide, map_p2c, options, diag_width = 2, max_advance = 2 ):
    """Walk seq_wobble and seq_cds in parallel and fill map_p2c with matches.

    map_p2c is cleared first. Positions in seq_wobble whose character is
    "X", "N" or "-" are skipped. Every other position is scored against the
    current cds position with a back-translation substitution matrix; a
    score <= 0 triggers a local re-alignment of a window around the
    mismatch with AlignExhaustive (this is how frameshifts are handled),
    after which the walk resumes from the last re-aligned pair.

    The original note says that, due to alignlib, everything is in
    one-based coordinates; the indices used here start at 0
    -- NOTE(review): confirm which convention alignlib_lite expects.

    diag_width and max_advance are accepted but not read in this function.

    Raises ValueError if the same re-alignment window is chosen twice in a
    row, if a re-aligned or walked pair scores below zero, or if the
    re-alignment does not reach the problematic residue anywhere but at
    the very end of seq_wobble.
    """
    map_p2c.clear()
    # NOTE: gop and gep are assigned but never read below.
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation( 1, -10, 1, alignlib_lite.py_getDefaultEncoder() )
    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()
    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    # x indexes seq_wobble, y indexes seq_cds
    y = 0
    x = 0
    # start of the previous re-alignment window, used to detect a stuck loop
    last_start = None
    while x < lwobble and y < lcds:
        xr = seq_wobble.asResidue( x )
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue
        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue
        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue
        s = matrix.getValue( xr, seq_cds.asResidue(y) )
        if options.loglevel >= 6:
            # at the first position of a wobble codon also log the cds codon
            # NOTE(review): asChar(y+1)/asChar(y+2) are not bounds-checked here.
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y+1) + seq_cds.asChar(y+2)
                options.stdlog.write( "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y, Genomics.MapCodon2AA( c ), pep_seq[int(x/3)]) )
            options.stdlog.write( "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" % \
                                  (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s) ))
        # deal with mismatches
        if s <= 0:
            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()
            # backtrack to previous three codons and align
            # three codons for double frameshifts that span two codons and
            # produce two X's and six WWWWWW.
            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15
            # extend by amount dx (back to a codon start, then d further)
            dx = (x % 3) + d
            x_start = max(0, x - dx )
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol( x_start, alignlib_lite.py_RIGHT ))
            # choosing the same window twice in a row means no progress
            if (x_start, y_start) == last_start:
                raise ValueError( "infinite loop detected" )
            last_start = (x_start, y_start)
            # the window is at most 2 * d long in both sequences
            x_end = min(x_start + 2 * d, len(wobble_seq) )
            y_end = min(y_start + 2 * d, len(cds_seq) )
            wobble_fragment = alignlib_lite.py_makeSequence(wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(cds_seq[y_start:y_end])
            AlignExhaustive( wobble_fragment, cds_fragment, "", tmp_map_p2c, options )
            if options.loglevel >= 10:
                options.stdlog.write("# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end, y_start, y_end, str(alignlib_lite.py_AlignmentFormatExplicit( tmp_map_p2c, wobble_fragment, cds_fragment ))))
                options.stdlog.flush()
            # clear alignment in the window; it is rebuilt from tmp_map_p2c
            map_p2c.removeRowRegion( x_start, x_end )
            # ngap counts consecutive window rows without an aligned column
            ngap = 0
            last_x, last_y = None, None
            for xxx in range( tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo() ):
                yyy = tmp_map_p2c.mapRowToCol(xxx)
                if yyy >= 0:
                    # translate window coordinates back to full sequences;
                    # note that this overwrites the outer x, y and xr
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue( seq_wobble.asResidue(x), seq_cds.asResidue(y) )
                    if s < 0:
                        raise ValueError("mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))
                    map_p2c.addPair( x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write( "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s ))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1
                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to next codon.
                # NOTE(review): last_x is still None if three unaligned rows
                # come before any aligned pair in the window.
                if ngap == 3:
                    map_p2c.removeRowRegion( last_x, last_x + 1 )
                    last_x += 1
                    map_p2c.addPair( last_x, last_y )
                    if options.loglevel >= 6:
                        options.stdlog.write( "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % \
                                              (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s ))
                        options.stdlog.flush()
                    ngap = 0
            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")
            # this value is overwritten by the re-scoring just below
            s = 0
        # score the current pair again; after a re-alignment x, y and xr are
        # those of the last re-aligned pair
        s = matrix.getValue( xr, seq_cds.asResidue(y) )
        if s < 0:
            raise ValueError("mis-matching residues.")
        map_p2c.addPair( x, y, float(s) )
        # advance to next residues
        x += 1
        y += 1
    # sanity checks
    assert( map_p2c.getRowTo() <= seq_wobble.getLength() )
    assert( map_p2c.getColTo() <= seq_cds.getLength() )
parser.add_option("-o", "--options", dest="options", type="string", help="BlastZ options.") parser.set_defaults(input_filename_seq1=None, input_filename_seq2=None, options="B=0 C=2") (options, args) = E.Start(parser) wrapper = BlastZ(options.options) import alignlib_lite seqs1 = Genomics.ReadPeptideSequences( open(options.input_filename_seq1, "r")) seqs2 = Genomics.ReadPeptideSequences( open(options.input_filename_seq2, "r")) seq1 = seqs1[seqs1.keys()[0]] seq2 = seqs2[seqs2.keys()[0]] result = alignlib_lite.py_makeAlignmentVector() wrapper.Align(seq1, seq2, result) print str( alignlib_lite.py_AlignmentFormatExplicit( result, alignlib_lite.py_makeSequence(seq1), alignlib_lite.py_makeSequence(seq2))) E.Stop()
def AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c, options, diag_width=2, max_advance=2):
    """Walk seq_wobble and seq_cds in parallel and fill map_p2c with matches.

    map_p2c is cleared first. Positions in seq_wobble whose character is
    "X", "N" or "-" are skipped. Every other position is scored against the
    current cds position with a back-translation substitution matrix; a
    score <= 0 triggers a local re-alignment of a window around the
    mismatch with AlignExhaustive (this is how frameshifts are handled),
    after which the walk resumes from the last re-aligned pair.

    The original note says that, due to alignlib, everything is in
    one-based coordinates; the indices used here start at 0
    -- NOTE(review): confirm which convention alignlib_lite expects.

    diag_width and max_advance are accepted but not read in this function.

    Raises ValueError if the same re-alignment window is chosen twice in a
    row, if a re-aligned or walked pair scores below zero, or if the
    re-alignment does not reach the problematic residue anywhere but at
    the very end of seq_wobble.
    """
    map_p2c.clear()
    # NOTE: gop and gep are assigned but never read below.
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())
    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()
    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    # x indexes seq_wobble, y indexes seq_cds
    y = 0
    x = 0
    # start of the previous re-alignment window, used to detect a stuck loop
    last_start = None
    while x < lwobble and y < lcds:
        xr = seq_wobble.asResidue(x)
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue
        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue
        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue
        s = matrix.getValue(xr, seq_cds.asResidue(y))
        if options.loglevel >= 6:
            # at the first position of a wobble codon also log the cds codon
            # NOTE(review): asChar(y + 1)/asChar(y + 2) are not bounds-checked.
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y + 1) + seq_cds.asChar(y + 2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" % (c, x, y, Genomics.MapCodon2AA(c), pep_seq[int(x / 3)]))
            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), str(s)))
        # deal with mismatches
        if s <= 0:
            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()
            # backtrack to previous three codons and align
            # three codons for double frameshifts that span two codons and
            # produce two X's and six WWWWWW.
            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15
            # extend by amount dx (back to a codon start, then d further)
            dx = (x % 3) + d
            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0, map_p2c.mapRowToCol(x_start, alignlib_lite.py_RIGHT))
            # choosing the same window twice in a row means no progress
            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")
            last_start = (x_start, y_start)
            # the window is at most 2 * d long in both sequences
            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))
            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])
            AlignExhaustive(wobble_fragment, cds_fragment, "", tmp_map_p2c, options)
            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" % (x_start, x_end, y_start, y_end, str(
                        alignlib_lite.py_AlignmentFormatExplicit(
                            tmp_map_p2c, wobble_fragment, cds_fragment))))
                options.stdlog.flush()
            # clear alignment in the window; it is rebuilt from tmp_map_p2c
            map_p2c.removeRowRegion(x_start, x_end)
            # ngap counts consecutive window rows without an aligned column
            ngap = 0
            last_x, last_y = None, None
            for xxx in range(tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)
                if yyy >= 0:
                    # translate window coordinates back to full sequences;
                    # note that this overwrites the outer x, y and xr
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(seq_wobble.asResidue(x), seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), cds: %i (%s)" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))
                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1
                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to
                # next codon.
                # NOTE(review): last_x is still None if three unaligned rows
                # come before any aligned pair in the window.
                if ngap == 3:
                    map_p2c.removeRowRegion(last_x, last_x + 1)
                    last_x += 1
                    map_p2c.addPair(last_x, last_y)
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" % (last_x, seq_wobble.asChar(last_x), last_y, seq_cds.asChar(last_y), xr, seq_cds.asResidue(last_y), s))
                        options.stdlog.flush()
                    ngap = 0
            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")
            # this value is overwritten by the re-scoring just below
            s = 0
        # score the current pair again; after a re-alignment x, y and xr are
        # those of the last re-aligned pair
        s = matrix.getValue(xr, seq_cds.asResidue(y))
        if s < 0:
            raise ValueError("mis-matching residues.")
        map_p2c.addPair(x, y, float(s))
        # advance to next residues
        x += 1
        y += 1
    # sanity checks
    assert (map_p2c.getRowTo() <= seq_wobble.getLength())
    assert (map_p2c.getColTo() <= seq_cds.getLength())
def Align( self, method, anchor = 0, loglevel = 1 ):
    """Align self.mSequence1 against self.mSequence2 and store the result.

    (Original note: get rid of this and use a method class instead in the
    future.)

    Arguments:
        method: one of the strings "dialign", "blastz", "dialignlgs", "dba",
            "nw" or "sw" ("clustal" is rejected), or any other object, which
            is then called as ``method(s1, s2, map_a2b)`` and is expected to
            fill the alignment itself.
        anchor: number of "A" residues added to both ends of both sequences
            before aligning; the anchored regions are removed afterwards.
        loglevel: accepted but not read by this method.

    Side effects: sets self.strand, self.mMethod, self.mAlignment,
    self.mAlignedSequence1/2, self.mAlignment1/2, self.mAlignmentFrom1/2,
    self.mAlignmentTo1/2, self.mNumGaps, self.mLength and self.mAligned,
    then calls self.SetPercentIdentity() and self.SetBlockSizes(). For
    "blastz" hits reported as reverse complement, self.mSequence2 is
    replaced by Genomics.complement(self.mSequence2) and self.strand is "-".

    Raises:
        NotImplementedError: if method is "clustal".
        AlignmentError: if the resulting alignment is empty.
    """
    map_a2b = alignlib_lite.py_makeAlignmentVector()

    s1 = "A" * anchor + self.mSequence1 + "A" * anchor
    s2 = "A" * anchor + self.mSequence2 + "A" * anchor

    self.strand = "+"

    if method == "dialign":
        dialign = WrapperDialign.Dialign( self.mOptionsDialign )
        dialign.Align( s1, s2, map_a2b )
    elif method == "blastz":
        blastz = WrapperBlastZ.BlastZ( self.mOptionsBlastZ )
        blastz.Align( s1, s2, map_a2b )
        if blastz.isReverseComplement():
            self.strand = "-"
            self.mSequence2 = Genomics.complement( self.mSequence2 )
    elif method == "dialignlgs":
        dialignlgs = WrapperDialign.Dialign( self.mOptionsDialignLGS )
        dialignlgs.Align( s1, s2, map_a2b )
    elif method == "dba":
        dba = WrapperDBA.DBA()
        dba.Align( s1, s2, map_a2b )
    elif method == "clustal":
        # The wrapper construction and call that used to follow this raise
        # could never run.
        raise NotImplementedError( "clustal wrapper needs to be updated")
    elif method == "nw":
        seq1 = alignlib_lite.py_makeSequence( s1 )
        seq2 = alignlib_lite.py_makeSequence( s2 )
        alignator = alignlib_lite.py_makeAlignatorDPFull( alignlib_lite.py_ALIGNMENT_GLOBAL,
                                                          gop=-12.0, gep=-2.0 )
        alignator.align( map_a2b, seq1, seq2 )
    elif method == "sw":
        seq1 = alignlib_lite.py_makeSequence( s1 )
        seq2 = alignlib_lite.py_makeSequence( s2 )
        # alignator_sw and min_score_sw are not defined in this method;
        # presumably module-level names -- TODO confirm.
        alignlib_lite.py_performIterativeAlignment( map_a2b, seq1, seq2, alignator_sw, min_score_sw )
    else:
        ## use callback function
        method(s1, s2, map_a2b)

    if map_a2b.getLength() == 0:
        raise AlignmentError("empty alignment")

    if anchor:
        # strip the anchors and shift coordinates back to the raw sequences
        map_a2b.removeRowRegion( anchor + len(self.mSequence1) + 1, map_a2b.getRowTo() )
        map_a2b.removeRowRegion( 1, anchor)
        map_a2b.removeColRegion( anchor + len(self.mSequence2) + 1, map_a2b.getColTo() )
        map_a2b.removeColRegion( 1, anchor)
        map_a2b.moveAlignment( -anchor, -anchor )

    f = alignlib_lite.py_AlignmentFormatExplicit( map_a2b,
                                                  alignlib_lite.py_makeSequence( self.mSequence1),
                                                  alignlib_lite.py_makeSequence( self.mSequence2) )

    self.mMethod = method
    self.mAlignment = map_a2b
    self.mAlignedSequence1, self.mAlignedSequence2 = f.mRowAlignment, f.mColAlignment

    f = alignlib_lite.py_AlignmentFormatEmissions( map_a2b )
    self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment

    self.mAlignmentFrom1 = map_a2b.getRowFrom()
    self.mAlignmentTo1 = map_a2b.getRowTo()
    self.mAlignmentFrom2 = map_a2b.getColFrom()
    self.mAlignmentTo2 = map_a2b.getColTo()

    self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
    self.mAligned = self.mLength - self.mNumGaps

    self.SetPercentIdentity()
    self.SetBlockSizes()
input_filename_seq2 = None, options = "B=0 C=2") (options, args) = E.Start( parser ) wrapper = BlastZ( options.options ) import alignlib_lite seqs1 = Genomics.ReadPeptideSequences( open(options.input_filename_seq1, "r") ) seqs2 = Genomics.ReadPeptideSequences( open(options.input_filename_seq2, "r") ) seq1 = seqs1[seqs1.keys()[0]] seq2 = seqs2[seqs2.keys()[0]] result = alignlib_lite.py_makeAlignmentVector() wrapper.Align( seq1, seq2, result) print str( alignlib_lite.py_AlignmentFormatExplicit( result, alignlib_lite.py_makeSequence( seq1 ), alignlib_lite.py_makeSequence( seq2 ) ) ) E.Stop()
def main( argv = None ):
    """Script entry point: convert alignment links on stdin to pairwise output.

    Links are read from ``sys.stdin`` with BlastAlignments.iterator_links.
    For each link whose query and sbjct are both present in the sequence
    file, the alignment is rebuilt (optionally scaled from peptide to
    nucleotide coordinates and mapped through ``--map``), checked against
    the sequence lengths, and handed to Write() -- once per link, or once
    per matching exon pair when ``--exons`` is given.

    Command line options are parsed from ``sys.argv`` unless *argv* is given.

    Raises:
        ValueError: if an alignment extends beyond its sequences, or if
            ``--codons`` is set and the number of aligned positions is not a
            multiple of three.
        IndexError: if ``--exons`` is given and a token has no exon entry.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
                             usage = globals()["__doc__"] )

    parser.add_option( "-s", "--sequences", dest="filename_sequences", type="string",
                       help="peptide sequence [Default=%default]" )
    parser.add_option( "-f", "--format", dest="format", type="string",
                       help="output format [Default=%default]" )
    parser.add_option( "-e", "--expand", dest="expand", action="store_true",
                       help="expand positions from peptide to nucleotide alignment [Default=%default]")
    parser.add_option( "-m", "--map", dest="filename_map", type="string",
                       help="map alignments [Default=%default]")
    parser.add_option( "-c", "--codons", dest="require_codons", action="store_true",
                       help="require codons [Default=%default]")
    parser.add_option( "--one-based-coordinates", dest="one_based_coordinates", action="store_true",
                       help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")
    parser.add_option( "--no-identical", dest="no_identical", action="store_true",
                       help="do not output identical pairs [Default=%default]" )
    parser.add_option( "-g", "--no-gaps", dest="no_gaps", action="store_true",
                       help="remove all gaps from aligned sequences [Default=%default]")
    parser.add_option( "-x", "--exons", dest="filename_exons", type="string",
                       help="filename with exon boundaries [Default=%default]")
    parser.add_option( "-o", "--outfile", dest="filename_outfile", type="string",
                       help="filename to save links [Default=%default]")
    parser.add_option( "--min-length", dest="min_length", type="int",
                       help="minimum length of alignment [Default=%default]")
    parser.add_option( "--filter", dest="filename_filter", type="string",
                       help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(
        filename_sequences = None,
        filename_exons = None,
        filename_map = None,
        filename_outfile = None,
        no_gaps = False,
        format = "fasta",
        expand = False,
        require_codons = False,
        no_identical = False,
        min_length = 0,
        report_step = 100,
        one_based_coordinates = False,
        filename_filter = None)

    (options, args) = E.Start( parser, add_mysql_options = True )

    t0 = time.time()

    # ---- read the optional input tables ---------------------------------
    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences( open(options.filename_sequences, "r") )
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i sequences\n" % len(sequences) )
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries( open(options.filename_exons, "r") )
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i exons\n" % len(exons) )
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.read( line )
            map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i maps\n" % len(map_old2new) )
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write( "# reading filtering information.\n" )
            sys.stdout.flush()

        # "query-sbjct" -> list of hashes of previously written pairs
        map_pair2hids = {}

        if os.path.exists( options.filename_filter ):
            infile = open(options.filename_filter, "r")
            iterator = FastaIterator.FastaIterator( infile )

            # records come in consecutive pairs; stop at the first missing one
            while 1:
                record1 = iterator.next()
                if record1 is None:
                    break
                record2 = iterator.next()
                if record2 is None:
                    break

                identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                identifier2 = re.match(r"(\S+)", record2.title).groups()[0]
                pair_id = "%s-%s" % (identifier1, identifier2)
                s = Genomics.GetHID(record1.sequence + ";" + record2.sequence)

                if pair_id not in map_pair2hids:
                    map_pair2hids[pair_id] = []
                map_pair2hids[pair_id].append( s )

            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write( "# read filtering information for %i pairs.\n" % len(map_pair2hids) )
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write( "# finished input in %i seconds.\n" % (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()

    # status returned by Write() -> number of times it was seen
    counts = {}

    iterations = 0
    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    # ---- process the links ----------------------------------------------
    for link in BlastAlignments.iterator_links( sys.stdin ):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write( "# iterations: %i in %i seconds.\n" % (iterations, time.time() - t1) )
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write( "# read link %s\n" % str(link) )

        row_seq = alignlib_lite.py_makeSequence( sequences[link.mQueryToken] )
        col_seq = alignlib_lite.py_makeSequence( sequences[link.mSbjctToken] )

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment( link.mQueryAli, 3 )
            link.mSbjctAli = ScaleAlignment( link.mSbjctAli, 3 )

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli ).copy( map_row2col )

        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write( "# combining in row with %s\n" %\
                                      str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mQueryToken].mMapOld2New ) ))

            alignlib_lite.py_combineAlignment( tmp1_map_row2col,
                                               map_old2new[link.mQueryToken].mMapOld2New,
                                               map_row2col,
                                               alignlib_lite.py_RR )
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment( map_row2col, tmp1_map_row2col )

        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write( "# combining in col with %s\n" %\
                                      str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mSbjctToken].mMapOld2New ) ))

            alignlib_lite.py_combineAlignment( tmp1_map_row2col,
                                               map_row2col,
                                               map_old2new[link.mSbjctToken].mMapOld2New,
                                               alignlib_lite.py_CR )
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment( map_row2col, tmp1_map_row2col )

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError("out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %\
                             (link.mQueryToken,
                              link.mSbjctToken,
                              row_seq.getLength(),
                              col_seq.getLength(),
                              str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write( str( alignlib_lite.py_AlignmentFormatExplicit( map_row2col,
                                                                                 row_seq,
                                                                                 col_seq )) + "\n" )

        ## check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write( "# %s\n" % str(map_row2col) )
                options.stdlog.write( "# %s\n" % str(link) )
                # .get(): a token without a map entry must not replace the
                # ValueError below with a KeyError
                options.stdlog.write( "# %s\n" % str(map_old2new.get(link.mQueryToken)) )
                options.stdlog.write( "# %s\n" % str(map_old2new.get(link.mSbjctToken)) )
                options.stdlog.write( "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit( map_row2col,
                                                                                            row_seq,
                                                                                            col_seq ) )

                raise ValueError("incomplete codons %i in pair %s - %s" % (naligned, link.mQueryToken, link.mSbjctToken))

        ## if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            ## Get overlapping segments
            segments = Exons.MatchExons( map_row2col, exons1, exons2 )

            for a,b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in the input files.
                from1, to1 = GetAdjustedBoundaries( a, exons1 )
                from2, to2 = GetAdjustedBoundaries( b, exons2 )

                alignlib_lite.py_copyAlignment( tmp1_map_row2col, map_row2col,
                                                from1+1, to1, from2+1, to2 )

                # fixed: this call used to pass the undefined name
                # map_pair2hid, a NameError on every exon segment
                mode = Write( tmp1_map_row2col, row_seq, col_seq, link,
                              no_gaps = options.no_gaps,
                              no_identical = options.no_identical,
                              min_length = options.min_length,
                              suffix1="_%s" % str(a),
                              suffix2="_%s" % str(b),
                              outfile = outfile,
                              pair_filter = map_pair2hids,
                              format = options.format )

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write( map_row2col, row_seq, col_seq, link,
                          min_length = options.min_length,
                          no_gaps = options.no_gaps,
                          no_identical = options.no_identical,
                          outfile = outfile,
                          pair_filter = map_pair2hids,
                          format = options.format )

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join( [ "%s=%i" % item for item in counts.items() ] ))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped) )

    E.Stop()
def Write(map_row2col, row_seq, col_seq, link,
          no_gaps=False, no_identical=False,
          min_length=0,
          suffix1="", suffix2="",
          outfile=None,
          pair_filter=None,
          format="fasta"):
    """Write the aligned pair described by *map_row2col* and return its status.

    Arguments:
        map_row2col: alignment between row_seq and col_seq.
        row_seq, col_seq: the two sequence objects.
        link: object providing ``mQueryToken`` and ``mSbjctToken``.
        no_gaps: drop every column in which either row has a "-".
        no_identical: classify pairs with identical aligned rows as
            "identical" instead of writing them.
        min_length: minimum length of the (possibly gap-stripped) row.
        suffix1, suffix2: appended to the query/sbjct token in all output.
        outfile: if given, receives one tab-separated summary line per call.
        pair_filter: optional mapping "query-sbjct" -> collection of hashes
            of pairs written earlier; a pair found there is not printed again.
        format: "fasta" prints the pair to stdout, "dummy" prints nothing.

    Returns:
        One of "empty", "length", "identical", "stops" or "success".

    Raises:
        ValueError: if a pair is to be printed and *format* is unknown.
    """
    status = None
    filter_status = "new"

    if map_row2col.getLength() == 0:
        status = "empty"

    if not status:
        f = alignlib_lite.py_AlignmentFormatExplicit(map_row2col,
                                                     row_seq, col_seq)

        row_from = map_row2col.getRowFrom()
        row_to = map_row2col.getRowTo()
        col_from = map_row2col.getColFrom()
        col_to = map_row2col.getColTo()
        row_ali, col_ali = f.mRowAlignment, f.mColAlignment

        if no_gaps:
            # keep only columns without a gap in either row
            kept = [(r, col_ali[i]) for i, r in enumerate(row_ali)
                    if r != "-" and col_ali[i] != "-"]
            # "".join() replaces string.join(), which Python 3 removed
            row_ali = "".join([r for r, _ in kept])
            col_ali = "".join([c for _, c in kept])

        if len(row_ali) < min_length:
            status = "length"

    if not status and no_identical:
        if row_ali == col_ali:
            status = "identical"

    if not status:
        # the pair passed all filters: classify it and, if new, print it

        if pair_filter:
            pair_id = "%s-%s" % (link.mQueryToken, link.mSbjctToken)
            if pair_id in pair_filter:
                h = Genomics.GetHID(row_ali + ";" + col_ali)
                if h in pair_filter[pair_id]:
                    filter_status = "old"

        translation1 = Genomics.TranslateDNA2Protein(row_ali)
        translation2 = Genomics.TranslateDNA2Protein(col_ali)
        # The check used to be '"X" in translation1 or "x" in translation2',
        # testing a different letter case on each side; both translations
        # are now checked the same way, ignoring case.
        if "X" in translation1.upper() or "X" in translation2.upper():
            status = "stops"
        else:
            status = "success"

        if filter_status == "new":
            if format == "fasta":
                # single-argument print(...) behaves the same on Python 2 and 3
                print(">%s%s %s %s\n%s\n>%s%s %s %s\n%s" % (
                    link.mQueryToken, suffix1, row_from, row_to, row_ali,
                    link.mSbjctToken, suffix2, col_from, col_to, col_ali))
            elif format == "dummy":
                pass
            else:
                raise ValueError("unknown format")

    if outfile:
        outfile.write("%s%s\t%s%s\t%s\t%i\t%s\n" %
                      (link.mQueryToken, suffix1,
                       link.mSbjctToken, suffix2,
                       status, map_row2col.getLength(), filter_status))

    return status
def _read_pair_filter(filename):
    """Collect the hids of previously written pairs from the FASTA file *filename*.

    Records are consumed two at a time. For each pair the first
    whitespace-delimited word of both titles forms the key
    ``"id1-id2"`` and ``Genomics.GetHID`` of the two sequences joined by
    ``";"`` is appended to the list stored under that key.

    Returns an empty dictionary if *filename* does not exist.
    """
    map_pair2hids = {}
    if not os.path.exists(filename):
        return map_pair2hids

    def next_record(iterator):
        # The original loop expected next() to return None at the end of
        # the input, while other callers of FastaIterator stop on
        # StopIteration. Treat both as end of input.
        try:
            return iterator.next()
        except StopIteration:
            return None

    with open(filename, "r") as infile:
        iterator = FastaIterator.FastaIterator(infile)
        while True:
            record1 = next_record(iterator)
            if record1 is None:
                break
            record2 = next_record(iterator)
            if record2 is None:
                break

            identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
            identifier2 = re.match(r"(\S+)", record2.title).groups()[0]
            pair_key = "%s-%s" % (identifier1, identifier2)
            hid = Genomics.GetHID(record1.sequence + ";" + record2.sequence)
            map_pair2hids.setdefault(pair_key, []).append(hid)

    return map_pair2hids


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads links from stdin, rebuilds each alignment, optionally remaps it
    through the alignments given with ``--map`` and writes it with
    :func:`Write`, either as a whole or once per pair of matching exons
    when ``--exons`` is given.

    Raises ValueError for alignments that extend beyond their sequences
    or (with ``--codons``) whose aligned length is not a multiple of
    three, and IndexError if a sequence has no entry in the exon table.
    """
    # NOTE(review): *argv* is not forwarded to E.Start() below.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences", type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output format [Default=%default]")

    parser.add_option("-e", "--expand", dest="expand", action="store_true",
                      help="expand positions from peptide to nucleotide alignment [Default=%default]")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c", "--codons", dest="require_codons", action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option("--one-based-coordinates", dest="one_based_coordinates", action="store_true",
                      help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")

    parser.add_option("--no-identical", dest="no_identical", action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option("-g", "--no-gaps", dest="no_gaps", action="store_true",
                      help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x", "--exons", dest="filename_exons", type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o", "--outfile", dest="filename_outfile", type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option("--filter", dest="filename_filter", type="string",
                      help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(
        filename_sequences=None,
        filename_exons=None,
        filename_map=None,
        filename_outfile=None,
        no_gaps=False,
        format="fasta",
        expand=False,
        require_codons=False,
        no_identical=False,
        min_length=0,
        report_step=100,
        one_based_coordinates=False,
        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()

    # ------------------------------------------------------------------
    # read input tables; files are closed as soon as they have been read
    if options.filename_sequences:
        with open(options.filename_sequences, "r") as infile:
            sequences = Genomics.ReadPeptideSequences(infile)
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        with open(options.filename_exons, "r") as infile:
            exons = Exons.ReadExonBoundaries(infile)
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                m = Map()
                m.read(line)
                map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = _read_pair_filter(options.filename_filter)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    # ------------------------------------------------------------------
    # main loop; the finally clause closes the link table even when a
    # link raises
    try:
        for link in BlastAlignments.iterator_links(sys.stdin):

            iterations += 1
            ninput += 1

            if options.loglevel >= 1:
                if (iterations % options.report_step == 0):
                    options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                         (iterations, time.time() - t1))
                    sys.stdout.flush()

            if link.mQueryToken not in sequences or \
               link.mSbjctToken not in sequences:
                nskipped += 1
                continue

            if options.loglevel >= 3:
                options.stdlog.write("# read link %s\n" % str(link))

            row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
            col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

            if options.one_based_coordinates:
                link.mQueryFrom -= 1
                link.mSbjctFrom -= 1

            if options.expand:
                link.mQueryFrom = link.mQueryFrom * 3
                link.mSbjctFrom = link.mSbjctFrom * 3
                link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
                link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

            map_row2col.clear()

            alignlib_lite.py_AlignmentFormatEmissions(
                link.mQueryFrom, link.mQueryAli,
                link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

            if link.mQueryToken in map_old2new:
                query_map = map_old2new[link.mQueryToken]
                tmp1_map_row2col.clear()
                query_map.expand()
                if options.loglevel >= 3:
                    options.stdlog.write("# combining in row with %s\n" % str(
                        alignlib_lite.py_AlignmentFormatEmissions(
                            query_map.mMapOld2New)))

                alignlib_lite.py_combineAlignment(
                    tmp1_map_row2col,
                    query_map.mMapOld2New,
                    map_row2col,
                    alignlib_lite.py_RR)
                query_map.clear()
                alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

            if link.mSbjctToken in map_old2new:
                sbjct_map = map_old2new[link.mSbjctToken]
                tmp1_map_row2col.clear()
                sbjct_map.expand()
                if options.loglevel >= 3:
                    options.stdlog.write("# combining in col with %s\n" % str(
                        alignlib_lite.py_AlignmentFormatEmissions(
                            sbjct_map.mMapOld2New)))

                alignlib_lite.py_combineAlignment(
                    tmp1_map_row2col,
                    map_row2col,
                    sbjct_map.mMapOld2New,
                    alignlib_lite.py_CR)
                sbjct_map.clear()
                alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

            dr = row_seq.getLength() - map_row2col.getRowTo()
            dc = col_seq.getLength() - map_row2col.getColTo()
            if dr < 0 or dc < 0:
                raise ValueError(
                    "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %
                    (link.mQueryToken,
                     link.mSbjctToken,
                     row_seq.getLength(),
                     col_seq.getLength(),
                     str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

            if options.loglevel >= 2:
                options.stdlog.write(
                    str(alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq)) + "\n")

            # check for incomplete codons
            if options.require_codons:

                naligned = map_row2col.getNumAligned()

                # NOTE(review): the original carried the remark "turned off,
                # while fixing alignlib_lite" here, but the check is active.
                if naligned % 3 != 0:
                    options.stdlog.write("# %s\n" % str(map_row2col))
                    options.stdlog.write("# %s\n" % str(link))
                    # .get(): a sequence without a map entry must not turn
                    # the diagnostic output into a KeyError
                    options.stdlog.write(
                        "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                    options.stdlog.write(
                        "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                    options.stdlog.write(
                        "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(
                            map_row2col, row_seq, col_seq))

                    raise ValueError(
                        "incomplete codons %i in pair %s - %s" %
                        (naligned, link.mQueryToken, link.mSbjctToken))

            # if so desired, write on a per exon level:
            if exons:
                if link.mQueryToken not in exons:
                    raise IndexError("%s not found in exons" %
                                     (link.mQueryToken))
                if link.mSbjctToken not in exons:
                    raise IndexError("%s not found in exons" %
                                     (link.mSbjctToken))
                exons1 = exons[link.mQueryToken]
                exons2 = exons[link.mSbjctToken]

                # Get overlapping segments
                segments = Exons.MatchExons(map_row2col, exons1, exons2)

                for a, b in segments:
                    tmp1_map_row2col.clear()

                    # make sure you got codon boundaries. Note that frameshifts
                    # in previous exons will cause the codons to start at positions
                    # different from mod 3. The problem is that I don't know where
                    # the frameshifts occur exactly. The exon boundaries are given
                    # with respect to the cds, which include the frame shifts.
                    # Unfortunately, phase information seems to be incomplete in
                    # the input files.
                    from1, to1 = GetAdjustedBoundaries(a, exons1)
                    from2, to2 = GetAdjustedBoundaries(b, exons2)

                    alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                                   from1 + 1, to1, from2 + 1, to2)

                    # the original passed the undefined name map_pair2hid
                    # here, which raised NameError on the first exon pair
                    mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                                 no_gaps=options.no_gaps,
                                 no_identical=options.no_identical,
                                 min_length=options.min_length,
                                 suffix1="_%s" % str(a),
                                 suffix2="_%s" % str(b),
                                 outfile=outfile,
                                 pair_filter=map_pair2hids,
                                 format=options.format)

                    counts[mode] = counts.get(mode, 0) + 1

            else:
                mode = Write(map_row2col, row_seq, col_seq, link,
                             min_length=options.min_length,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

            noutput += 1
    finally:
        if outfile:
            outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            ["%s=%i" % item for item in counts.items()]))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
if unaligned_pair and \ unaligned_pair.mToken1 == pair.mToken1 and \ unaligned_pair.mToken2 == pair.mToken2 and \ unaligned_pair.mIntronId1 == pair.mIntronId1: map_a2b = alignlib_lite.py_makeAlignmentVector() f = AlignmentFormatEmissions( pair.mFrom1, pair.mAlignedSequence1, pair.mFrom2, pair.mAlignedSequence2).copy(map_a2b) map_a2b.moveAlignment(-unaligned_pair.mFrom1 + 1, -unaligned_pair.mFrom2 + 1) data = alignlib_lite.py_AlignmentFormatExplicit(map_a2b, alignlib_lite.py_makeSequence( unaligned_pair.mAlignedSequence1), alignlib_lite.py_makeSequence(unaligned_pair.mAlignedSequence2)) from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo pair.mAlignedSequence1 = ali1 pair.mAlignedSequence2 = ali2 else: raise "sequence not found for pair %s" % str(pair) if param_do_gblocks: if param_loglevel >= 4: print "# length before: %i %i" % (len(pair.mAlignedSequence1), pair.mAligned) pair.mAlignedSequence1, pair.mAlignedSequence2 = gblocks.GetBlocks(
def Write(map_row2col, row_seq, col_seq, link,
          no_gaps=False, no_identical=False,
          min_length=0,
          suffix1="", suffix2="",
          outfile=None,
          pair_filter=None,
          format="fasta"):
    """Write the aligned pair described by *map_row2col* and return its status.

    The alignment is rendered through
    ``alignlib_lite.py_AlignmentFormatExplicit``, optionally stripped of
    columns containing a gap, and checked against the length and identity
    filters. Pairs that pass these filters are translated and, unless they
    are already listed in *pair_filter*, printed to stdout.

    Arguments
    ---------
    map_row2col :
        Alignment object; ``getLength``, ``getRowFrom``, ``getRowTo``,
        ``getColFrom`` and ``getColTo`` are called on it.
    row_seq, col_seq :
        Sequence objects handed to ``py_AlignmentFormatExplicit``.
    link :
        Object providing ``mQueryToken`` and ``mSbjctToken``.
    no_gaps : bool
        Drop every column in which either sequence has a ``-``.
    no_identical : bool
        Report pairs whose aligned strings are equal as ``"identical"``.
    min_length : int
        Aligned strings shorter than this are reported as ``"length"``.
    suffix1, suffix2 : str
        Appended to the query/sbjct identifiers in all output.
    outfile : file-like or None
        If given, one tab-separated summary line is written per call.
    pair_filter : dict or None
        Maps ``"query-sbjct"`` to a collection of ``Genomics.GetHID``
        values of pairs that have been written before.
    format : str
        ``"fasta"`` prints the pair, ``"dummy"`` prints nothing.

    Returns
    -------
    str
        One of ``"empty"``, ``"length"``, ``"identical"``, ``"stops"`` or
        ``"success"``.

    Raises
    ------
    ValueError
        If a pair is about to be printed and *format* is not recognised.
    """
    filter_status = "new"

    if map_row2col.getLength() == 0:
        status = "empty"
    else:
        status = None

        explicit = alignlib_lite.py_AlignmentFormatExplicit(
            map_row2col, row_seq, col_seq)

        row_from = map_row2col.getRowFrom()
        row_to = map_row2col.getRowTo()
        col_from = map_row2col.getColFrom()
        col_to = map_row2col.getColTo()
        row_ali, col_ali = explicit.mRowAlignment, explicit.mColAlignment

        if no_gaps:
            # keep only the columns that are ungapped in both sequences
            ungapped = [(r, c) for r, c in zip(row_ali, col_ali)
                        if r != "-" and c != "-"]
            row_ali = "".join([r for r, c in ungapped])
            col_ali = "".join([c for r, c in ungapped])

        if len(row_ali) < min_length:
            status = "length"
        elif no_identical and row_ali == col_ali:
            status = "identical"

    if status is None:
        # the pair passed all filters; row_ali/col_ali are defined here
        if pair_filter:
            pair_key = "%s-%s" % (link.mQueryToken, link.mSbjctToken)
            if pair_key in pair_filter:
                hid = Genomics.GetHID(row_ali + ";" + col_ali)
                if hid in pair_filter[pair_key]:
                    filter_status = "old"

        translation1 = Genomics.TranslateDNA2Protein(row_ali)
        translation2 = Genomics.TranslateDNA2Protein(col_ali)
        # NOTE(review): the first test looks for "X", the second for a
        # lower-case "x". Kept as in the original - confirm whether the
        # asymmetry is intended.
        if "X" in translation1 or "x" in translation2:
            status = "stops"
        else:
            status = "success"

        if filter_status == "new":
            if format == "fasta":
                # single-argument print(...) behaves the same under
                # Python 2 and Python 3
                print(">%s%s %s %s\n%s\n>%s%s %s %s\n%s" % (
                    link.mQueryToken, suffix1, row_from, row_to, row_ali,
                    link.mSbjctToken, suffix2, col_from, col_to, col_ali))
            elif format == "dummy":
                pass
            else:
                raise ValueError("unknown format")

    if outfile:
        outfile.write("%s%s\t%s%s\t%s\t%i\t%s\n" % (
            link.mQueryToken, suffix1,
            link.mSbjctToken, suffix2,
            status, map_row2col.getLength(), filter_status))

    return status
def getAlignmentFull(m, q, t, options):
    """Return the row and column alignment strings of *m*.

    *q* and *t* are converted with ``alignlib_lite.py_makeSequence`` and
    rendered together with *m* through
    ``alignlib_lite.py_AlignmentFormatExplicit``. *options* is accepted
    but not used.

    Returns the tuple ``(mRowAlignment, mColAlignment)`` of the rendered
    alignment.
    """
    query_sequence = alignlib_lite.py_makeSequence(q)
    target_sequence = alignlib_lite.py_makeSequence(t)
    explicit = alignlib_lite.py_AlignmentFormatExplicit(
        m, query_sequence, target_sequence)
    return explicit.mRowAlignment, explicit.mColAlignment
if param_is_compressed: if unaligned_pair and \ unaligned_pair.mToken1 == pair.mToken1 and \ unaligned_pair.mToken2 == pair.mToken2 and \ unaligned_pair.mIntronId1 == pair.mIntronId1: map_a2b = alignlib_lite.py_makeAlignmentVector() f = AlignmentFormatEmissions( pair.mFrom1, pair.mAlignedSequence1, pair.mFrom2, pair.mAlignedSequence2).copy(map_a2b) map_a2b.moveAlignment(-unaligned_pair.mFrom1 + 1, -unaligned_pair.mFrom2 + 1) data = alignlib_lite.py_AlignmentFormatExplicit( map_a2b, alignlib_lite.py_makeSequence( unaligned_pair.mAlignedSequence1), alignlib_lite.py_makeSequence( unaligned_pair.mAlignedSequence2)) from1, ali1, to1 = data.mRowFrom, data.mRowAlignment, data.mRowTo from2, ali2, to2 = data.mColFrom, data.mColAlignment, data.mColTo pair.mAlignedSequence1 = ali1 pair.mAlignedSequence2 = ali2 else: raise "sequence not found for pair %s" % str(pair) if param_do_gblocks: if param_loglevel >= 4: print "# length before: %i %i" % (len(
def read(self, line):
    """Fill this entry from one tab-separated prediction *line*.

    Four layouts are accepted, distinguished by the number of fields:

    * 23 fields: the core fields; ``mAlignmentString`` is set to ``""``
    * 24 fields: core fields + ``mAlignmentString``
    * 25 fields: ``mPredictionId`` + core fields + ``mAlignmentString``
    * 26 fields: as 25, followed by ``mNAssembled``

    After assignment the score/coverage/percentage fields are converted
    to float and the coordinate and count fields to int. ``mRank`` and
    the string fields are left as read. If ``self.mExpand`` is true the
    alignment attributes are built as well.

    Raises ValueError if the line has any other number of fields (or if
    a numeric field cannot be converted).
    """
    # strip only a real line terminator; a blind line[:-1] would chop the
    # last data character of a line without trailing newline
    if line.endswith("\n"):
        line = line[:-1]
    data = line.split("\t")

    core_fields = (
        "mQueryToken", "mSbjctToken", "mSbjctStrand",
        "mRank", "score",
        "mQueryFrom", "mQueryTo", "mQueryAli",
        "mSbjctFrom", "mSbjctTo", "mSbjctAli",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons",
        "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mTranslation",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
    )

    nfields = len(data)
    if nfields == 26:
        names = ("mPredictionId",) + core_fields + \
            ("mAlignmentString", "mNAssembled")
    elif nfields == 25:
        names = ("mPredictionId",) + core_fields + ("mAlignmentString",)
    elif nfields == 24:
        names = core_fields + ("mAlignmentString",)
    elif nfields == 23:
        names = core_fields
    else:
        raise ValueError("unknown format: %i fields in line %s" %
                         (nfields, line))

    for name, value in zip(names, data):
        setattr(self, name, value)
    if nfields == 23:
        self.mAlignmentString = ""

    float_fields = ("score", "mQueryCoverage",
                    "mPercentIdentity", "mPercentSimilarity")
    # mPredictionId and mNAssembled are not part of the shorter layouts;
    # in that case the conversion reads whatever the attribute already
    # holds - presumably a default set by the constructor, verify.
    int_fields = ("mPredictionId",
                  "mQueryFrom", "mQueryTo", "mQueryLength",
                  "mSbjctFrom", "mSbjctTo",
                  "mSbjctGenomeFrom", "mSbjctGenomeTo",
                  "mNGaps", "mNIntrons", "mNSplits", "mNStopCodons",
                  "mNFrameShifts", "mNAssembled")

    for convert, fields in ((float, float_fields), (int, int_fields)):
        # convert the whole group first so that a bad value leaves the
        # group untouched, as the tuple assignment in the original did
        converted = [convert(getattr(self, name)) for name in fields]
        for name, value in zip(fields, converted):
            setattr(self, name, value)

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib_lite.py_AlignmentFormatExplicit(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)
def getAlignmentFull(m, q, t, options):
    """Return the row and column alignment strings of *m*.

    *q* and *t* are converted with ``alignlib_lite.py_makeSequence`` and
    rendered together with *m* through
    ``alignlib_lite.py_AlignmentFormatExplicit``. *options* is accepted
    but not used.

    Returns the tuple ``(mRowAlignment, mColAlignment)`` of the rendered
    alignment.
    """
    query_sequence = alignlib_lite.py_makeSequence(q)
    target_sequence = alignlib_lite.py_makeSequence(t)
    explicit = alignlib_lite.py_AlignmentFormatExplicit(
        m, query_sequence, target_sequence)
    return explicit.mRowAlignment, explicit.mColAlignment
def read(self, line):
    """Fill this entry from one tab-separated prediction *line*.

    Four layouts are accepted, distinguished by the number of fields:

    * 23 fields: the core fields; ``mAlignmentString`` is set to ``""``
    * 24 fields: core fields + ``mAlignmentString``
    * 25 fields: ``mPredictionId`` + core fields + ``mAlignmentString``
    * 26 fields: as 25, followed by ``mNAssembled``

    After assignment the score/coverage/percentage fields are converted
    to float and the coordinate and count fields to int. ``mRank`` and
    the string fields are left as read. If ``self.mExpand`` is true the
    alignment attributes are built as well.

    Raises ValueError if the line has any other number of fields (or if
    a numeric field cannot be converted).
    """
    # strip only a real line terminator; a blind line[:-1] would chop the
    # last data character of a line without trailing newline
    if line.endswith("\n"):
        line = line[:-1]
    data = line.split("\t")

    core_fields = (
        "mQueryToken", "mSbjctToken", "mSbjctStrand",
        "mRank", "score",
        "mQueryFrom", "mQueryTo", "mQueryAli",
        "mSbjctFrom", "mSbjctTo", "mSbjctAli",
        "mQueryLength", "mQueryCoverage",
        "mNGaps", "mNFrameShifts", "mNIntrons",
        "mNSplits", "mNStopCodons",
        "mPercentIdentity", "mPercentSimilarity",
        "mTranslation",
        "mSbjctGenomeFrom", "mSbjctGenomeTo",
    )

    nfields = len(data)
    if nfields == 26:
        names = ("mPredictionId",) + core_fields + \
            ("mAlignmentString", "mNAssembled")
    elif nfields == 25:
        names = ("mPredictionId",) + core_fields + ("mAlignmentString",)
    elif nfields == 24:
        names = core_fields + ("mAlignmentString",)
    elif nfields == 23:
        names = core_fields
    else:
        raise ValueError("unknown format: %i fields in line %s" %
                         (nfields, line))

    for name, value in zip(names, data):
        setattr(self, name, value)
    if nfields == 23:
        self.mAlignmentString = ""

    float_fields = ("score", "mQueryCoverage",
                    "mPercentIdentity", "mPercentSimilarity")
    # mPredictionId and mNAssembled are not part of the shorter layouts;
    # in that case the conversion reads whatever the attribute already
    # holds - presumably a default set by the constructor, verify.
    int_fields = ("mPredictionId",
                  "mQueryFrom", "mQueryTo", "mQueryLength",
                  "mSbjctFrom", "mSbjctTo",
                  "mSbjctGenomeFrom", "mSbjctGenomeTo",
                  "mNGaps", "mNIntrons", "mNSplits", "mNStopCodons",
                  "mNFrameShifts", "mNAssembled")

    for convert, fields in ((float, float_fields), (int, int_fields)):
        # convert the whole group first so that a bad value leaves the
        # group untouched, as the tuple assignment in the original did
        converted = [convert(getattr(self, name)) for name in fields]
        for name, value in zip(fields, converted):
            setattr(self, name, value)

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib_lite.py_AlignmentFormatExplicit(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)