Exemplo n.º 1
0
    def merge(self):
        """Merge the partial profile libraries written by parallel runs.

        Every file matching ``self.mFilenameProfile + "*"`` (other than the
        target file itself) is treated as belonging to a part.  All profiles
        read from the parts are added to a fresh library opened for writing
        at ``self.mFilenameProfile``.  Nids that are keys of ``self.mFasta``
        but were not supplied by any part are handed to ``self.applyMethod``
        with an empty neighbour list.  Finally the part files are removed.

        Returns ``None`` without doing anything if ``self.isComplete()`` is
        true.  Otherwise returns ``True`` only if no nid was missing,
        duplicated or unknown, and ``False`` otherwise.
        """
        if self.isComplete():
            return

        # Strip the last four characters (the suffix) so that the files
        # belonging to one part collapse into a single base name.
        infiles = sorted(set(
            x[:-4]
            for x in glob.glob("%s*" % self.mFilenameProfile)
            if x != self.mFilenameProfile))

        found = set()
        ninput, noutput, nfound, nunknown, nduplicate = 0, 0, 0, 0, 0
        tokens = set(self.mFasta.keys())

        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
            self.mFilenameProfile, "w")

        for filename in infiles:
            infile = ProfileLibrary.ProfileLibrary(filename, "r")

            for nid, profile in infile.iteritems_sorted():
                ninput += 1

                # Duplicates and unknown nids are only reported and counted;
                # the profile is still copied to the merged library.
                if nid in found:
                    nduplicate += 1
                    self.warn("duplicate nid: %i in file %s" % (nid, filename))
                if nid not in tokens:
                    nunknown += 1
                    self.warn("unknown nid: %i in file %s" % (nid, filename))
                found.add(nid)
                nfound += 1
                self.mProfileLibrary.add(nid, profile)
                noutput += 1

        missing = tokens.difference(found)
        if len(missing) > 0:
            self.warn("the following nids were missing: %s" % str(missing))

        self.info("adding %i missing nids" % len(missing))

        for nid in missing:
            self.applyMethod(AddaIO.NeighboursRecord(nid, []))

        self.info(
            "merging: parts=%i, ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %
            (len(infiles), ninput, noutput, nfound, len(missing), nduplicate, nunknown))

        self.info("deleting %i parts" % len(infiles))
        for part in infiles:
            fn, fi = ProfileLibrary.getFileNames(part)
            os.remove(fn)
            os.remove(fi)

        return len(missing) == 0 and nduplicate == 0 and nunknown == 0
Exemplo n.º 2
0
    def merge(self):
        """Combine the per-part profile libraries into one library.

        Part files are discovered by globbing ``self.mFilenameProfile + "*"``
        and excluding the target file itself.  Each profile read from a part
        is added to a new library opened for writing at
        ``self.mFilenameProfile``.  Keys of ``self.mFasta`` that no part
        provided are passed to ``self.applyMethod`` as a neighbours record
        with an empty list.  The part files are deleted at the end.

        Returns ``None`` immediately if ``self.isComplete()`` is true.
        Otherwise returns ``True`` if there were no missing, duplicate or
        unknown nids, else ``False``.
        """
        if self.isComplete():
            return

        candidates = glob.glob("%s*" % self.mFilenameProfile)
        # Remove the four-character suffix; the set removes the repeated
        # base names that result.
        infiles = list(set(
            [x[:-4] for x in candidates if x != self.mFilenameProfile]))
        infiles.sort()

        found = set()
        ninput, noutput, nfound, nunknown, nduplicate = 0, 0, 0, 0, 0
        tokens = set(self.mFasta.keys())

        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
            self.mFilenameProfile, "w")

        for filename in infiles:
            infile = ProfileLibrary.ProfileLibrary(filename, "r")

            for nid, profile in infile.iteritems_sorted():
                ninput += 1

                # Problems are reported and counted, but the profile is
                # written to the merged library regardless.
                if nid in found:
                    nduplicate += 1
                    self.warn("duplicate nid: %i in file %s" % (nid, filename))
                if nid not in tokens:
                    nunknown += 1
                    self.warn("unknown nid: %i in file %s" % (nid, filename))
                found.add(nid)
                nfound += 1
                self.mProfileLibrary.add(nid, profile)
                noutput += 1

        missing = tokens.difference(found)
        if len(missing) > 0:
            self.warn("the following nids were missing: %s" % str(missing))

        self.info("adding %i missing nids" % len(missing))

        for nid in missing:
            self.applyMethod(AddaIO.NeighboursRecord(nid, []))

        self.info(
            "merging: parts=%i, ninput=%i, noutput=%i, nfound=%i, nmissing=%i, nduplicate=%i, nunknown=%i" %
            (len(infiles), ninput, noutput, nfound, len(missing), nduplicate, nunknown))

        self.info("deleting %i parts" % len(infiles))
        for part in infiles:
            fn, fi = ProfileLibrary.getFileNames(part)
            os.remove(fn)
            os.remove(fi)

        return len(missing) == 0 and nduplicate == 0 and nunknown == 0
Exemplo n.º 3
0
    def startUp(self):
        """Open the profile library for this slice and set its defaults.

        Does nothing if ``self.isComplete()`` is true.  With ``self.mAppend``
        set, the library is opened in append mode and ``self.mContinueAt``
        is set to the last inserted key; otherwise it is opened for writing
        with ``force=self.mForce``.
        """
        if self.isComplete():
            return

        if not self.mAppend:
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
                self.mFilenameProfile + self.getSlice(), "w",
                force=self.mForce)
        else:
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary(
                self.mFilenameProfile + self.getSlice(), "a")
            self.mContinueAt = self.mProfileLibrary.getLastInsertedKey()
            self.info(
                "processing will continue after %s" % (str(self.mContinueAt)))

        # install the default log-oddor, regularizor and weightor
        library = self.mProfileLibrary
        log_oddor = alignlib.makeLogOddorDirichlet(self.mScaleFactor)
        library.setLogOddor(log_oddor)
        regularizor = alignlib.makeRegularizorDirichletPrecomputed()
        library.setRegularizor(regularizor)
        weightor = alignlib.makeWeightor()
        library.setWeightor(weightor)

        encoder = alignlib.getEncoder(alignlib.Protein20)
        alignlib.setDefaultEncoder(encoder)
Exemplo n.º 4
0
    def outputSummaryProfiles(self):
        """Write a short summary of the profile library to ``self.mOutfile``.

        If ``self.mFilenameProfiles`` does not exist nothing is written and
        ``{'nids': 0}`` is returned.  Otherwise the library is opened for
        reading, a header line with the file name is written, followed by
        the number of keys and that number as a percentage of
        ``self.mNNids``.

        Returns a dictionary with the number of keys under ``'nids'``.
        """
        filename = self.mFilenameProfiles
        if not os.path.exists(filename):
            return {'nids': 0}

        self.mProfileLibrary = ProfileLibrary.ProfileLibrary(filename, "r")
        count = len(self.mProfileLibrary.keys())

        out = self.mOutfile
        out.write(">%s\n" % self.mFilenameProfiles)
        # the percentage is computed only after the header line is written
        percent = 100.0 * count / self.mNNids
        out.write("nnids\t%i\t%5.2f\n" % (count, percent))

        return {'nids': count}
Exemplo n.º 5
0
    def startUp( self ):
        """Prepare the objects, counters and output stream for aligning.

        Does nothing if ``self.isComplete()`` is true.  Otherwise:

        * creates the log-oddor, regularizor and weightor and registers them,
          together with the ``Protein20`` encoder, on the default alignlib
          toolkit;
        * opens the prebuilt profile library for reading when
          ``self.mUsePrebuiltProfiles`` is set, otherwise sets
          ``self.mProfileLibrary`` to ``None`` and opens the indexed
          neighbours from ``self.mFilenameGraph``/``self.mFilenameIndex``;
        * sets the checker, output header, alignator, cache and counters;
        * opens the output stream and writes the header line unless
          ``self.mContinueAt`` is set;
        * records the start time in ``self.mStartTime``.
        """

        if self.isComplete(): return

        ###############################################
        # create objects for algorithm 
        alignlib.getDefaultToolkit().setEncoder( alignlib.getEncoder( alignlib.Protein20 ) )
        self.mLogOddor    = alignlib.makeLogOddorDirichlet( self.mScaleFactor )
        self.mRegularizor = alignlib.makeRegularizorDirichletPrecomputed()
        self.mWeightor    = alignlib.makeWeightor()

        alignlib.getDefaultToolkit().setRegularizor( self.mRegularizor )
        alignlib.getDefaultToolkit().setLogOddor( self.mLogOddor )
        alignlib.getDefaultToolkit().setWeightor( self.mWeightor )


        if self.mUsePrebuiltProfiles:
            self.mProfileLibrary = ProfileLibrary.ProfileLibrary( self.mFilenameProfiles, "r" )
            self.mProfileLibrary.setWeightor( self.mWeightor )
            self.mProfileLibrary.setLogOddor( self.mLogOddor )
            self.mProfileLibrary.setRegularizor( self.mRegularizor )

        else:
            self.mProfileLibrary = None
            self.mIndexedNeighbours = cadda.IndexedNeighbours( self.mFilenameGraph, self.mFilenameIndex )

        self.mChecker = self.checkLinkZScore
        # column names of the output table
        self.mHeader = ("qdomain",
                        "sdomain",
                        "weight",
                        "passed",
                        "qstart",
                        "qend",
                        "qali",
                        "sstart",
                        "send",
                        "sali",
                        "score",
                        "naligned",
                        "ngaps",
                        "zscore" )

        self.mAlignator = alignlib.makeAlignatorDPFull( alignlib.ALIGNMENT_LOCAL, 
                                                        self.mGop,
                                                        self.mGep )

        # the cache to store alignandum objects
        self.mCache = {}        
        
        # NOTE(review): the encoder was already set on the default toolkit
        # above; this second call is kept as in the original - confirm
        # whether both are needed.
        alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.Protein20 ) )

        ## initialize counters
        self.mNPassed, self.mNFailed, self.mNNotFound = 0, 0, 0

        self.mOutfile = self.openOutputStream( self.mFilenameAlignments )

        # only a fresh run (nothing to continue from) gets the header line
        if self.mContinueAt is None:
            self.mOutfile.write( "\t".join( self.mHeader ) + "\n" ) 
            self.mOutfile.flush()

        self.mStartTime = time.time()
Exemplo n.º 6
0
    def isComplete(self):
        """Return whether the profile library of this slice is complete.

        The file names for ``self.mFilenameProfile + self.getSlice()`` are
        obtained from ``ProfileLibrary.getFileNames`` and the second of the
        two is checked with ``SegmentedFile.isComplete``.
        """
        library_name = self.mFilenameProfile + self.getSlice()
        _, second_name = ProfileLibrary.getFileNames(library_name)
        return SegmentedFile.isComplete(second_name)
Exemplo n.º 7
0
    def isComplete( self ):
        """Check whether the output for this slice has been completed.

        Looks up the pair of file names belonging to
        ``self.mFilenameProfile + self.getSlice()`` and returns the result
        of ``SegmentedFile.isComplete`` for the second name of the pair.
        """
        sliced = self.mFilenameProfile + self.getSlice()
        names = ProfileLibrary.getFileNames( sliced )
        _unused, checked = names
        return SegmentedFile.isComplete( checked )