def startUp(self):
    """Open the profile library for the current slice and set its defaults.

    Returns immediately when ``self.isComplete()`` is true.

    In append mode (``self.mAppend``) the library is opened with mode "a",
    the last inserted key is remembered in ``self.mContinueAt`` and reported
    through ``self.info``. Otherwise the library is opened with mode "w",
    passing ``self.mForce`` as ``force``.

    In both cases a Dirichlet log-oddor (built from ``self.mScaleFactor``),
    a precomputed Dirichlet regularizor and a weightor are attached to the
    library, and alignlib's default encoder is set to ``Protein20``.
    """
    if self.isComplete():
        return

    append = self.mAppend
    # library file name: profile file name plus the suffix of this slice
    filename = self.mFilenameProfile + self.getSlice()

    if not append:
        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
            filename, "w", force=self.mForce)
    else:
        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(filename, "a")
        self.mContinueAt = self.mProfileLibrary.getLastInsertedKey()
        self.info("processing will continue after %s" %
                  str(self.mContinueAt))

    # set default values
    library = self.mProfileLibrary
    library.setLogOddor(alignlib.makeLogOddorDirichlet(self.mScaleFactor))
    library.setRegularizor(alignlib.makeRegularizorDirichletPrecomputed())
    library.setWeightor(alignlib.makeWeightor())

    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))
def startUp(self):
    """Create or re-open the profile library and configure its scoring objects.

    Nothing is done if ``self.isComplete()`` reports that the step has
    already finished.

    With ``self.mAppend`` set, the existing library is opened in mode "a"
    and ``self.mContinueAt`` is set to its last inserted key, which is
    also logged via ``self.info``. Without it, a library is opened in mode
    "w" with ``force=self.mForce``.

    Afterwards the library receives a Dirichlet log-oddor (from
    ``self.mScaleFactor``), a precomputed Dirichlet regularizor and a
    weightor, and ``Protein20`` becomes alignlib's default encoder.
    """
    if self.isComplete():
        return

    if self.mAppend:
        opened = ProfileLibrary.ProfileLibrary(
            self.mFilenameProfile + self.getSlice(),
            "a")
        self.mProfileLibrary = opened
        self.mContinueAt = opened.getLastInsertedKey()
        message = "processing will continue after %s" % (
            str(self.mContinueAt))
        self.info(message)
    else:
        opened = ProfileLibrary.ProfileLibrary(
            self.mFilenameProfile + self.getSlice(),
            "w",
            force=self.mForce)
        self.mProfileLibrary = opened

    # set default values
    log_oddor = alignlib.makeLogOddorDirichlet(self.mScaleFactor)
    self.mProfileLibrary.setLogOddor(log_oddor)

    regularizor = alignlib.makeRegularizorDirichletPrecomputed()
    self.mProfileLibrary.setRegularizor(regularizor)

    weightor = alignlib.makeWeightor()
    self.mProfileLibrary.setWeightor(weightor)

    encoder = alignlib.getEncoder(alignlib.Protein20)
    alignlib.setDefaultEncoder(encoder)
def startUp(self):
    """Set up alignment objects, neighbour index, output stream and counters.

    Returns immediately when ``self.isComplete()`` is true. Otherwise:

    * installs the ``Protein20`` encoder, a Dirichlet log-oddor (built from
      ``self.mScaleFactor``), a precomputed Dirichlet regularizor and a
      weightor in alignlib's default toolkit and keeps them on ``self``;
    * opens the profile library read-only from ``self.mFilenameProfiles``
      when ``self.mUsePrebuiltProfiles`` is set, else sets
      ``self.mProfileLibrary`` to ``None``;
    * opens the indexed neighbour graph, selects ``checkLinkZScore`` as the
      link checker and creates a local full dynamic-programming alignator
      with ``self.mGop`` / ``self.mGep``;
    * resets the alignandum cache and the passed/failed/not-found counters;
    * opens the output stream for ``self.mFilenameAlignments`` and writes the
      column header only when ``self.mContinueAt`` is ``None``;
    * records the start time in ``self.mStartTime``.
    """
    if self.isComplete():
        return

    # create objects for algorithm
    toolkit_encoder = alignlib.getEncoder(alignlib.Protein20)
    alignlib.getDefaultToolkit().setEncoder(toolkit_encoder)
    self.mLogOddor = alignlib.makeLogOddorDirichlet(self.mScaleFactor)
    self.mRegularizor = alignlib.makeRegularizorDirichletPrecomputed()
    self.mWeightor = alignlib.makeWeightor()

    alignlib.getDefaultToolkit().setRegularizor(self.mRegularizor)
    alignlib.getDefaultToolkit().setLogOddor(self.mLogOddor)
    alignlib.getDefaultToolkit().setWeightor(self.mWeightor)

    if self.mUsePrebuiltProfiles:
        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
            self.mFilenameProfiles, "r")
        self.mProfileLibrary.setWeightor(self.mWeightor)
        self.mProfileLibrary.setLogOddor(self.mLogOddor)
        self.mProfileLibrary.setRegularizor(self.mRegularizor)
    else:
        self.mProfileLibrary = None

    self.mIndexedNeighbours = cadda.IndexedNeighbours(
        self.mFilenameGraph, self.mFilenameIndex)

    self.mChecker = self.checkLinkZScore
    self.mHeader = ("qdomain",
                    "sdomain",
                    "weight",
                    "passed",
                    "qstart",
                    "qend",
                    "qali",
                    "sstart",
                    "send",
                    "sali",
                    "score",
                    "naligned",
                    "ngaps",
                    "zscore")

    self.mAlignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_LOCAL, self.mGop, self.mGep)

    # the cache to store alignandum objects
    self.mCache = {}

    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))

    # initialize counters
    self.mNPassed, self.mNFailed, self.mNNotFound = 0, 0, 0

    self.mOutfile = self.openOutputStream(self.mFilenameAlignments)

    # Identity test: only a genuine None means "fresh run". An equality
    # test could be satisfied by a key object with a custom __eq__.
    if self.mContinueAt is None:
        self.mOutfile.write("\t".join(self.mHeader) + "\n")
        self.mOutfile.flush()

    self.mStartTime = time.time()
def _readMaliFromFile(filename, mali_format):
    """Read one multiple alignment from *filename* and return it.

    The input handle is closed once ``readFromFile`` returns, also when it
    raises. This assumes ``Mali.readFromFile`` consumes the stream before
    returning -- TODO confirm against the Mali module.
    """
    mali = Mali.Mali()
    infile = IOTools.openFile(filename, "r")
    try:
        mali.readFromFile(infile, format=mali_format)
    finally:
        infile.close()
    return mali


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the two multiple alignments named on the command line, builds a
    profile from each, aligns the two profiles by full dynamic programming
    (local or global, depending on ``--mode``) and adds the second
    alignment to the first one along the resulting profile-profile
    alignment. The combined alignment is written to ``options.stdout``.

    Raises ValueError if the number of positional arguments is not two or
    if the alignment mode is not recognised.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    E.info("read 2 multiple alignments")
    mali1 = _readMaliFromFile(args[0], options.format)
    mali2 = _readMaliFromFile(args[1], options.format)

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    if options.mode == "local":
        mode = alignlib.ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib.ALIGNMENT_GLOBAL
    else:
        # the option parser restricts the choices; fail with a clear
        # message rather than an unbound local if that ever changes
        raise ValueError("unknown alignment mode: %s" % options.mode)

    alignator = alignlib.makeAlignatorDPFull(mode, options.gop, options.gep)

    # profile defaults must be in place before the profiles are built
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.Protein20))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorDirichlet(0.3))
    alignlib.setDefaultRegularizor(
        alignlib.makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib.makeProfile(cmali1)
    cprofile2 = alignlib.makeProfile(cmali2)

    result = alignlib.makeAlignmentVector()
    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib.AlignmentFormatEmissions(result))

    # add the second alignment to the first, using the profile alignment
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(
        cmali1,
        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from the hard-coded path
    :file:`../data/mouse.fasta`. Every sequence in *infile* (FASTA) is
    aligned against a profile built from the reference, once as given and
    once as returned by ``Genomics.complement``. The higher scoring of the
    two alignments is kept; on a tie the complemented one is used.

    Alignments that are empty or that score below *min_score* are
    skipped; a score equal to *min_score* is accepted.

    Output multiple alignment in fasta format to *outfile* and a
    tab-separated per-read table to :file:`outfile.info`.

    All files opened here are closed on exit, also when an error is
    raised part-way through.
    '''

    mali = Mali.Mali()
    with open("../data/mouse.fasta") as reference:
        mali.readFromFile(reference)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two full-width homopolymer rows per nucleotide
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for nucleotide in "ACGT":
        for copy in range(2):
            profile_mali.addSequence(
                "%s%i" % (nucleotide, copy), 0, n, nucleotide * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())
    # no regularizor is installed here; the default one is left untouched

    profile = alignlib.makeProfile(profile_mali)

    alignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_WRAP, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the reference, mapped onto itself
    build_mali = alignlib.makeMultAlignment()
    identity = alignlib.makeAlignmentVector()
    identity.addDiagonal(0, n, 0)
    build_mali.add(src_mali, identity)

    c = E.Counter()

    with open(outfile, "w") as outf, open(outfile + ".info", "w") as outf_log:
        outf_log.write(
            "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n")

        # ungapped sequences and identifiers, in the row order of build_mali
        sequences = alignlib.StringVector()
        ids = []
        for pid in mali.getIdentifiers():
            sequences.append(re.sub("-", "", mali[pid]))
            ids.append(pid)

        with open(infile) as reads:
            for s in FastaIterator.FastaIterator(reads):

                E.debug("adding %s" % s.title)
                c.input += 1
                rsequence = Genomics.complement(s.sequence)
                seq = alignlib.makeSequence(s.sequence)
                rseq = alignlib.makeSequence(rsequence)

                alignator.align(map_seq2profile, seq, profile)
                alignator.align(map_rseq2profile, rseq, profile)

                # keep the better scoring orientation
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    m, sequence = map_seq2profile, s.sequence
                else:
                    m, sequence = map_rseq2profile, rsequence

                if m.getLength() == 0 or m.getScore() < min_score:
                    c.skipped += 1
                    continue

                # each part becomes a row of its own in the output
                r = getParts(m)

                covered = 0
                for mm in r:
                    build_mali.add(mm)
                    sequences.append(sequence)
                    ids.append(s.title)
                    covered += mm.getLength() - mm.getNumGaps()

                mali_covered = m.getColTo() - m.getColFrom()

                outf_log.write("\t".join(map(str, (
                    s.title,
                    len(s.sequence),
                    m.getRowFrom(),
                    m.getRowTo(),
                    len(r),
                    covered,
                    "%5.2f" % (100.0 * covered / len(s.sequence)),
                    m.getScore(),
                    m.getColFrom(),
                    m.getColTo(),
                    mali_covered,
                    "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())))) + "\n")

                c.output += 1

        # one line per row: start, aligned sequence and end, tab-separated
        result = str(alignlib.MultAlignmentFormatPlain(
            build_mali, sequences, alignlib.UnalignedStacked))

        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    The reference multiple alignment is read from the hard-coded path
    :file:`../data/mouse.fasta`. Every sequence in *infile* (FASTA) is
    aligned against a profile built from the reference, once as given and
    once as returned by ``Genomics.complement``. The higher scoring of the
    two alignments is kept; on a tie the complemented one is used.

    Alignments that are empty or that score below *min_score* are
    skipped; a score equal to *min_score* is accepted.

    Output multiple alignment in fasta format to *outfile* and a
    tab-separated per-read table to :file:`outfile.info`.

    All files opened here are closed on exit, also when an error is
    raised part-way through.
    '''

    mali = Mali.Mali()
    with open("../data/mouse.fasta") as reference:
        mali.readFromFile(reference)
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts: two full-width homopolymer rows per nucleotide
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for nucleotide in "ACGT":
        for copy in range(2):
            profile_mali.addSequence(
                "%s%i" % (nucleotide, copy), 0, n, nucleotide * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())
    # no regularizor is installed here; the default one is left untouched

    profile = alignlib.makeProfile(profile_mali)

    alignator = alignlib.makeAlignatorDPFull(
        alignlib.ALIGNMENT_WRAP, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # seed the output alignment with the reference, mapped onto itself
    build_mali = alignlib.makeMultAlignment()
    identity = alignlib.makeAlignmentVector()
    identity.addDiagonal(0, n, 0)
    build_mali.add(src_mali, identity)

    c = E.Counter()

    with open(outfile, "w") as outf, open(outfile + ".info", "w") as outf_log:
        outf_log.write(
            "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n")

        # ungapped sequences and identifiers, in the row order of build_mali
        sequences = alignlib.StringVector()
        ids = []
        for pid in mali.getIdentifiers():
            sequences.append(re.sub("-", "", mali[pid]))
            ids.append(pid)

        with open(infile) as reads:
            for s in FastaIterator.FastaIterator(reads):

                E.debug("adding %s" % s.title)
                c.input += 1
                rsequence = Genomics.complement(s.sequence)
                seq = alignlib.makeSequence(s.sequence)
                rseq = alignlib.makeSequence(rsequence)

                alignator.align(map_seq2profile, seq, profile)
                alignator.align(map_rseq2profile, rseq, profile)

                # keep the better scoring orientation
                if map_seq2profile.getScore() > map_rseq2profile.getScore():
                    m, sequence = map_seq2profile, s.sequence
                else:
                    m, sequence = map_rseq2profile, rsequence

                if m.getLength() == 0 or m.getScore() < min_score:
                    c.skipped += 1
                    continue

                # each part becomes a row of its own in the output
                r = getParts(m)

                covered = 0
                for mm in r:
                    build_mali.add(mm)
                    sequences.append(sequence)
                    ids.append(s.title)
                    covered += mm.getLength() - mm.getNumGaps()

                mali_covered = m.getColTo() - m.getColFrom()

                outf_log.write("\t".join(map(str, (
                    s.title,
                    len(s.sequence),
                    m.getRowFrom(),
                    m.getRowTo(),
                    len(r),
                    covered,
                    "%5.2f" % (100.0 * covered / len(s.sequence)),
                    m.getScore(),
                    m.getColFrom(),
                    m.getColTo(),
                    mali_covered,
                    "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())))) + "\n")

                c.output += 1

        # one line per row: start, aligned sequence and end, tab-separated
        result = str(alignlib.MultAlignmentFormatPlain(
            build_mali, sequences, alignlib.UnalignedStacked))

        for pid, data in zip(ids, result.split("\n")):
            start, sequence, end = data.split("\t")
            outf.write(">%s/%i-%i\n%s\n" %
                       (pid, int(start) + 1, int(end), sequence))

    E.info("%s\n" % str(c))