Exemplo n.º 1
0
    def merge(self, filenames=None):
        """merge runs from parallel computations.

        Returns True if merging succeeded or was already done,
        False if any sub-merge failed.
        """

        # nothing to do if the final fit file is already in place
        if SegmentedFile.isComplete(self.mFilenameFit):
            return True

        # per-chunk results for these files are recomputed from the
        # merged data file, so discard any stale chunks
        for basename in (self.mFilenameTransfer,
                         self.mFilenameOverhang,
                         self.mFilenameFit):
            for chunk in glob.glob("%s.0*" % basename):
                os.remove(chunk)

        # the details file is optional - merge it only if chunks exist
        if glob.glob("%s.0*" % self.mFilenameDetails):
            if not AddaModuleRecord.merge(self, (self.mFilenameDetails, )):
                return False

        if not AddaModuleRecord.merge(self, (self.mFilenameData, )):
            return False

        # recompute fit/transfer/overhang from the merged data file
        self.mNumChunks = 1
        self.readPreviousData(self.mFilenameData)
        self.finish()

        return True
Exemplo n.º 2
0
    def __init__(self, *args, **kwargs):
        """Read segmentation options from the configuration."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        # output file receiving the computed segments
        self.mFilenameSegments = self.mConfig.get(
            "files", "output_segments", "adda.segments")
        self.mFilenames = (self.mFilenameSegments, )

        # segmentation options
        self.covering_trees = self.mConfig.get(
            "segments", "covering_trees", True)
        self.combine_repeats = self.mConfig.get(
            "segments", "combine_repeats", True)

        # matrix post-processing options
        self.normalize_matrix = self.mConfig.get(
            'segments', 'normalize', False)
        self.add_local_bias = self.mConfig.get(
            'segments', 'matrix_add_local_bias', False)
        self.permute_matrix = self.mConfig.get(
            'segments', 'permute', False)

        # announce non-default matrix processing
        for enabled, message in (
                (self.normalize_matrix, "matrix normalization is turned on"),
                (self.add_local_bias, "adding local bias is turned on"),
                (self.permute_matrix, "matrix permutation is turned on")):
            if enabled:
                E.warn(message)
Exemplo n.º 3
0
    def __init__(self, *args, **kwargs):
        """Read fit filenames and thresholds from the configuration."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        get = self.mConfig.get

        # output filenames
        self.mFilenameFit = get("output", "output_fit", "adda.fit")
        self.mFilenameOverhang = get("output", "output_fit_overhang",
                                     "adda.fit.overhang")
        self.mFilenameTransfer = get("output", "output_fit_transfer",
                                     "adda.fit.transfer")
        self.mFilenameData = get("output", "output_fit_data", "adda.fit.data")
        self.mFilenameDetails = get("output", "output_fit_details",
                                    "adda.fit.details")

        # acceptance thresholds for transfer/overhang values
        self.mMinTransfer = float(get("fit", "min_transfer"))
        self.mMinOverhang = float(get("fit", "min_overhang"))

        self.mFilenameNids = get("output", "output_nids", "adda.nids")
        self.mMaxSequenceLength = get("segment", "max_sequence_length", 10000)
        self.min_counts = get("fit", "min_counts", 10)

        self.mFilenames = (self.mFilenameFit,
                           self.mFilenameTransfer,
                           self.mFilenameOverhang)

        # output streams are opened lazily
        self.mOutfileDetails = None
        self.mOutfileData = None
        self.mDataIsComplete = False
Exemplo n.º 4
0
    def finish(self):
        """Close the output file, log identity statistics and clean up.

        Guards against a ZeroDivisionError when no records were
        processed (total == 0), which the original code would raise.
        """

        self.mOutfile.close()

        total = self.mNIdentical + self.mNDifferent
        if total > 0:
            self.info( "total=%i, identical=%i (%5.2f %%), different=%i (%5.2f %%)" % \
                           (total, self.mNIdentical, 100.0 * self.mNIdentical / total,
                            self.mNDifferent, 100.0 * self.mNDifferent / total ) )
        else:
            # nothing processed - avoid dividing by zero
            self.info("total=0, identical=0, different=0")

        AddaModuleRecord.finish( self )
Exemplo n.º 5
0
 def finish(self):
     """Close the output file, log alignment statistics and finish up."""

     self.mOutfile.close()

     # summarize how the input links fared
     message = "aligned: %i links input, %i links passed, %i links failed, %i links not found" % (
         self.mInput, self.mNPassed, self.mNFailed, self.mNNotFound)
     self.info(message)

     AddaModuleRecord.finish(self)
Exemplo n.º 6
0
 def finish(self):
     """Close the output file, log graph statistics and finish up."""

     self.mOutfile.close()

     # summarize link throughput of the graph stage
     message = "graph: %i links input, %i links output, %i links merged" % (
         self.mNLinksInput, self.mNLinksOutput, self.mNJoined)
     self.info(message)

     AddaModuleRecord.finish(self)
Exemplo n.º 7
0
    def finish(self):
        """Close the output file, log identity statistics and clean up.

        Guards against a ZeroDivisionError when no records were
        processed (total == 0), which the original code would raise.
        """

        self.mOutfile.close()

        total = self.mNIdentical + self.mNDifferent
        if total > 0:
            self.info( "total=%i, identical=%i (%5.2f %%), different=%i (%5.2f %%)" % \
                           (total, self.mNIdentical, 100.0 * self.mNIdentical / total,
                            self.mNDifferent, 100.0 * self.mNDifferent / total ) )
        else:
            # nothing processed - avoid dividing by zero
            self.info("total=0, identical=0, different=0")

        AddaModuleRecord.finish(self)
Exemplo n.º 8
0
    def __init__(self, *args, **kwargs):
        """Set up the realignment output filename from the configuration."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        # file receiving the realignment results
        self.mFilenameRealignment = self.mConfig.get(
            "files", "output_realignment", "adda.realign")
        self.mFilenames = (self.mFilenameRealignment, )
Exemplo n.º 9
0
    def __init__(self, *args, **kwargs):
        """Read graph options from the configuration."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        # output file for the pairwise alignment graph
        self.mFilenameGraph = self.mConfig.get(
            "files", "output_graph", "adda.graph")
        self.mFilenames = (self.mFilenameGraph, )

        self.mMergeRepeats = self.mConfig.get("graph", "merge_repeats", True)
        self.mMinDomainSize = int(self.mConfig.get('adda', 'min_domain_size'))

        # column headers of the output table
        self.mHeaders = ("query_nid", "sbjct_nid", "evalue",
                         "query_start", "query_end",
                         "sbjct_start", "sbjct_end")
Exemplo n.º 10
0
    def __init__(self, *args, **kwargs):
        """Read profile options from the configuration."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        # output file for the computed profiles
        self.mFilenameProfile = self.mConfig.get(
            "files", "output_profiles", "adda.profiles")
        self.mScaleFactor = self.mConfig.get("profiles", "scale_factor", 0.3)
        self.mMaxNumNeighbours = self.mConfig.get(
            "profiles", "max_neighbours", 1000)
        self.mMaxEvalue = self.mConfig.get("profiles", "max_evalue", 0.0)
        self.mPrepareProfile = self.mConfig.get(
            "profiles", "prepare_profile", False)
Exemplo n.º 11
0
    def __init__(self, *args, **kwargs):
        """Read alignment options and set up the profile builder.

        Collects filenames, profile options, alignment thresholds,
        zscore-check parameters and alignment parameters from the
        configuration.
        """

        AddaModuleRecord.__init__(self, *args, **kwargs)

        # input/output filenames
        self.mFilenameAlignments = self.mConfig.get(
            "files", "output_align", "adda.align")
        self.mFilenameGraph = self.mConfig.get(
            "files", "output_graph", "adda.graph")
        self.mFilenameIndex = self.mConfig.get(
            "files", "output_index", "adda.graph.index")
        self.mFilenameProfiles = self.mConfig.get(
            "files", "output_profiles", "adda.profiles")
        self.mFilenameMst = self.mConfig.get(
            "files", "output_mst", "adda.mst")

        # profile options
        self.mUsePrebuiltProfiles = self.mConfig.get(
            "profiles", "use_prebuilt_profiles", False)
        self.mScaleFactor = self.mConfig.get("profiles", "scale_factor", 0.3)
        self.mMaxNumNeighbours = self.mConfig.get(
            "profiles", "max_neighbours", 1000)
        self.mPrepareProfile = self.mConfig.get(
            "profiles", "prepare_profile", False)

        # alignment acceptance thresholds
        self.mMinOverlapResidues = self.mConfig.get(
            "align", "min_overlap_residues", 20)
        self.mMinCoverage = self.mConfig.get("align", "min_coverage", 0.2)
        self.mMinOverlap = self.mConfig.get("align", "min_overlap", 0.2)
        self.mMask = self.mConfig.get("align", "mask", False)
        # BUG FIX: materialize as a list - under Python 3, map() returns
        # a one-shot iterator that would be exhausted after the first use
        self.mMethodsMask = [
            int(x) for x in
            self.mConfig.get("align", "masks", "3,4").split(",")]

        self.mUseCache = self.mConfig.get("align", "use_cache", True)
        self.mCacheSize = self.mConfig.get("align", "cache_size", 100)

        ###############################################
        # options for zscore check
        self.mMinZScore = self.mConfig.get("align", "min_zscore", 5.0)
        self.mNumIterationsZScore = self.mConfig.get(
            "align", "num_iterations_zscore", 50)

        # if score is 5 times the minimum score, do not compute zscore
        self.mSafetyThreshold = self.mConfig.get(
            "align", "safety_threshold", 5)

        ###############################################
        # alignment parameters
        self.mGop = self.mConfig.get("align", "gop", -10.0)
        self.mGep = self.mConfig.get("align", "gep", -1.0)

        # minimum size for using a profile for alignments
        self.mMinProfileSize = self.mConfig.get(
            "align", "min_profile_size", 0)

        # threshold parameters for significance check
        self.mMinAlignmentScore = self.mConfig.get(
            "align", "min_alignment_score", 83.0)
        self.mMinAlignmentMotifLength = self.mConfig.get(
            "align", "min_motif_length", 10)

        self.mFilenames = (self.mFilenameAlignments, )

        self.mProfileBuilder = AddaProfiles.AddaProfiles(*args, **kwargs)

        # the cache to store alignandum objects
        self.mCache = {}
Exemplo n.º 12
0
    def __init__(self, *args, **kwargs):
        """Collect profile options from the configuration."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        get = self.mConfig.get

        # file receiving the computed profiles
        self.mFilenameProfile = get("files", "output_profiles",
                                    "adda.profiles")
        self.mScaleFactor = get("profiles", "scale_factor", 0.3)
        self.mMaxNumNeighbours = get("profiles", "max_neighbours", 1000)
        self.mMaxEvalue = get("profiles", "max_evalue", 0.0)
        self.mPrepareProfile = get("profiles", "prepare_profile", False)
Exemplo n.º 13
0
    def finish(self):
        """finish processing.

        Add profile entries for sequences that never appeared as a
        query, i.e. occur only in the sbjct field.
        """
        if not self.isSubset():
            nadded = 0
            for nid in sorted(self.mFasta.getContigSizes().keys()):
                if nid in self.mProfileLibrary:
                    continue
                # emit an empty-neighbour record for this sequence
                self.applyMethod(AddaIO.NeighboursRecord(nid, []))
                nadded += 1

            self.mOutput += nadded
            self.info( "added %i profiles for sequences without neighbours" % nadded )

        self.mProfileLibrary.close()

        AddaModuleRecord.finish(self)
Exemplo n.º 14
0
    def finish(self):
        """finish processing.

        Add profile entries for sequences that only appear in the
        sbjct field and therefore have no neighbour records of
        their own.
        """
        if not self.isSubset():
            # sequences without a profile yet
            missing = [nid
                       for nid in sorted(self.mFasta.getContigSizes().keys())
                       if nid not in self.mProfileLibrary]

            for nid in missing:
                self.applyMethod(AddaIO.NeighboursRecord(nid, []))

            nadded = len(missing)
            self.mOutput += nadded
            self.info("added %i profiles for sequences without neighbours" %
                      nadded)

        self.mProfileLibrary.close()

        AddaModuleRecord.finish(self)
Exemplo n.º 15
0
    def __init__(self, *args, **kwargs):
        """Read segmentation options from the configuration."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        get = self.mConfig.get

        # output file for the computed segments
        self.mFilenameSegments = get("files", "output_segments",
                                     "adda.segments")
        self.mFilenames = (self.mFilenameSegments, )

        # segmentation options
        self.covering_trees = get("segments", "covering_trees", True)
        self.combine_repeats = get("segments", "combine_repeats", True)

        # matrix post-processing options
        self.normalize_matrix = get('segments', 'normalize', False)
        self.add_local_bias = get('segments', 'matrix_add_local_bias', False)
        self.permute_matrix = get('segments', 'permute', False)

        if self.normalize_matrix:
            E.warn("matrix normalization is turned on")
        if self.add_local_bias:
            E.warn("adding local bias is turned on")
        if self.permute_matrix:
            E.warn("matrix permutation is turned on")
Exemplo n.º 16
0
    def merge(self, filenames=None):
        """merge runs from parallel computations.

        Returns True on success or if the merge was already done.
        """

        # already merged - nothing to do
        if SegmentedFile.isComplete(self.mFilenameFit):
            return True

        # drop stale per-chunk output; it is recomputed from the
        # merged data file below
        for pattern in (self.mFilenameTransfer,
                        self.mFilenameOverhang,
                        self.mFilenameFit):
            for part in glob.glob("%s.0*" % pattern):
                os.remove(part)

        # merge the (optional) details file if chunks are present
        if glob.glob("%s.0*" % self.mFilenameDetails):
            if not AddaModuleRecord.merge(self, (self.mFilenameDetails, )):
                return False

        if not AddaModuleRecord.merge(self, (self.mFilenameData, )):
            return False

        self.mNumChunks = 1
        self.readPreviousData(self.mFilenameData)
        self.finish()

        return True
Exemplo n.º 17
0
    def __init__(self, *args, **kwargs):
        """Collect fit filenames and thresholds from the configuration."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        get = self.mConfig.get

        # output filenames
        self.mFilenameFit = get("output", "fit", "adda.fit")
        self.mFilenameOverhang = get("output", "fit_overhang",
                                     "adda.fit.overhang")
        self.mFilenameTransfer = get("output", "fit_transfer",
                                     "adda.fit.transfer")
        self.mFilenameData = get("output", "fit_data", "adda.fit.data")
        self.mFilenameDetails = get("output", "fit_details",
                                    "adda.fit.details")

        # acceptance thresholds
        self.mMinTransfer = float(get("fit", "min_transfer"))
        self.mMinOverhang = float(get("fit", "min_overhang"))

        self.mFilenameNids = get("output", "nids", "adda.nids")
        self.mMaxSequenceLength = get("segment", "max_sequence_length", 10000)
        self.min_counts = get("fit", "min_counts", 10)

        self.mFilenames = (self.mFilenameFit,
                           self.mFilenameTransfer,
                           self.mFilenameOverhang)

        # output streams are opened lazily
        self.mOutfileDetails = None
        self.mOutfileData = None
        self.mDataIsComplete = False
Exemplo n.º 18
0
    def isComplete(self):
        '''check if files are complete'''

        # all regular output files present?
        if AddaModuleRecord.isComplete(self):
            return True

        # data file for this slice already finished?
        if SegmentedFile.isComplete(
                SegmentedFile.mangle(self.mFilenameData, self.getSlice())):
            return True

        # with a complete full data file, fit, transfer and overhang
        # can be recomputed by merging
        if SegmentedFile.isComplete(self.mFilenameData):
            return self.merge()

        return False
Exemplo n.º 19
0
    def isComplete(self):
        '''check if files are complete'''

        if AddaModuleRecord.isComplete(self):
            return True

        # a finished per-slice data file also counts as complete
        sliced = SegmentedFile.mangle(self.mFilenameData, self.getSlice())
        if SegmentedFile.isComplete(sliced):
            return True

        # a complete full data file allows recomputing fit, transfer
        # and overhang via merge()
        if SegmentedFile.isComplete(self.mFilenameData):
            return self.merge()

        return False
Exemplo n.º 20
0
class AddaFit(AddaModuleRecord):
    """fit domains of a reference set to alignments and compute 
    parameters for ADDA's objective function.

    Briefly, each alignment between a pair of sequences is evaluated
    with respect of the domains in the sequences computing ``overhang``
    and ``transfer``.

    A domain might have ``overhang``, if it overlaps an alignment incompletely. 
    ``overhang`` is measured as the number of residues that are left uncovered. 

    ``transfer`` are the number of residues that the alignment links between any
    pair of domains of the same family in the two sequences. 

    input
       ``files:input_graph``: the pairwise alignment graph

       ``files:input_reference``: a reference domain definition

    output
       ``files:output_fit``: a config file with the estimated parameters

       ``files:output_fit_transfer``: a tab-separated histogram of transfer
          values.

       ``files:output_fit_overhang``: a tab-separated histogram of overhang
          values.

       ``files:output_fit_details``: details of the fitting procedure. This tab-separated
           table reports the transfer and overhang for combination of domains and alignment.

           class 
              domain family
           nid1
              sequence nid1
           dfrom1  
              domain start on nid1
           dto1  
              domain end on nid1
           afrom
              alignment start on nid1
           ato1 
              alignment end on nid1
           nid2
              sequence nid2
           dfrom
              domain start on nid2
           dto2
              domain end on nid2
           afrom2  
              alignment start on nid2
           ato2
              alignment end on nid2
           lali
              alignment length
           lx 
              length of domain on nid1
           ly 
              length of domain on nid2
           trans
              transfer value
           ptran
              percentage transfer (transfer/lali)
           atran
              average percentage transfer (transfer/ sqrt( lx * ly))
           score 
              alignment score (ln(evalue))
    """

    mName = "Fit"

    def __init__(self, *args, **kwargs):
        """Read filenames and fitting thresholds from the configuration."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        config = self.mConfig

        # output filenames
        self.mFilenameFit = config.get("output", "output_fit", "adda.fit")
        self.mFilenameOverhang = config.get(
            "output", "output_fit_overhang", "adda.fit.overhang")
        self.mFilenameTransfer = config.get(
            "output", "output_fit_transfer", "adda.fit.transfer")
        self.mFilenameData = config.get(
            "output", "output_fit_data", "adda.fit.data")
        self.mFilenameDetails = config.get(
            "output", "output_fit_details", "adda.fit.details")

        # acceptance thresholds for transfer/overhang values
        self.mMinTransfer = float(config.get("fit", "min_transfer"))
        self.mMinOverhang = float(config.get("fit", "min_overhang"))

        self.mFilenameNids = config.get(
            "output", "output_nids", "adda.nids")
        self.mMaxSequenceLength = config.get(
            "segment", "max_sequence_length", 10000)
        self.min_counts = config.get("fit", "min_counts", 10)

        self.mFilenames = (self.mFilenameFit,
                           self.mFilenameTransfer,
                           self.mFilenameOverhang)

        # output streams are opened lazily in startUp()
        self.mOutfileDetails = None
        self.mOutfileData = None
        self.mDataIsComplete = False

    #--------------------------------------------------------------------------
    def isComplete(self):
        '''check if files are complete'''

        if AddaModuleRecord.isComplete(self):
            return True

        # If the per-slice data file is complete, this slice is done.
        per_slice = SegmentedFile.mangle(self.mFilenameData, self.getSlice())
        if SegmentedFile.isComplete(per_slice):
            return True

        # If the full data file is complete, re-compute fit, transfer
        # and overhang from it and report the merge result.
        if SegmentedFile.isComplete(self.mFilenameData):
            return self.merge()

        return False

    #--------------------------------------------------------------------------
    def startUp(self):

        if self.isComplete(): return

        #self.mMapId2Nid = AddaIO.readMapId2Nid( open( self.mFilenameNids, "r") )

        #self.info( "reading domains from %s" % self.mConfig.get( "files", "input_reference") )

        #infile = AddaIO.openStream( self.mConfig.get( "files", "input_reference") )
        #rx_include = self.mConfig.get( "fit", "family_include", "")
        #self.mDomainBoundaries = AddaIO.readMapNid2Domains( infile, self.mMapId2Nid, rx_include )
        #infile.close()

        # result containers - histograms
        self.mTransferValues = numpy.array([0] * (self.mMaxSequenceLength + 1),
                                           numpy.int)
        self.mOverhangValues = numpy.array([0] * (self.mMaxSequenceLength + 1),
                                           numpy.int)

        # used to store data from an aborted run
        self.mValues = []
        self.mContinueAt = None

        # extract options
        if self.mLogLevel >= 5:
            self.mOutfileDetails = self.openOutputStream(self.mFilenameDetails,
                                                         register=True)

            if not self.mContinueAt:
                self.mOutfileDetails.write("""# FAMILY:          domain family
# NID1:         sequence nid1
# DFROM1:       domain start on nid1
# DTO1:         domain end on nid1
# AFROM1:       ali start on nid1
# ATO1:         ali end on nid1
# NID2:         sequence nid2
# DFROM2:       domain start on nid2
# DTO2:         domain end on nid2
# AFROM2:       ali start on nid2
# ATO2:         ali end on nid2
# LALI:         alignment length
# LX:           length of domain on nid1
# LY:           length of domain on nid2
# TRANS:        transfer value
# PTRAN:        percentage transfer (transfer/lali)
# ATRAN:        average percentage transfer (transfer/ sqrt( LX * LY))
# SCORE:        score of alignment
class\tnid1\tdfrom1\tdto1\tafrom1\tato1\tdnid2\tdfrom2\tdto2\tafrom2\tato2\tlali\tlx\tly\ttrans\tptran\tatran\tscore\n"""
                                           )
                # flushing is important with multiprocessing - why?
                # if not flushed, the header and the EOF token appear twice.
                self.mOutfileDetails.flush()

        self.mOutfileData = self.openOutputStream(self.mFilenameData,
                                                  register=True)

        if not self.mContinueAt:
            self.mOutfileData.write(
                "class\tquery_nid\tsbjct_nid\ttransfer\tquery_overhang\tsbjct_overhang\n"
            )

    #--------------------------------------------------------------------------
    def registerExistingOutput(self, filename):
        """Resume from existing output by replaying a previous data file."""

        if not os.path.exists(filename):
            return
        self.readPreviousData(filename)
        self.info("processing will continue after %s" %
                  (str(self.mContinueAt)))

    #--------------------------------------------------------------------------
    def processValues(self, values):
        """process data for a single query.

        The results are appended to self.mTransferValues and
        self.mOverhangValues.

        Values are averaged per (family, query, sbjct).  This averages
        over repeats and over query and sbjct at the same time.
        """

        values.sort()

        # group tuples by (family, query, sbjct)
        for key, group in itertools.groupby(values, key=lambda x: x[:3]):

            transfers, overhangs = [], []
            for family, query, sbjct, transfer, overhang1, overhang2 in group:

                # skip pairs with too little transfer
                if transfer < self.mMinTransfer:
                    continue

                transfers.append(transfer)
                overhangs.extend((overhang1, overhang2))

            if transfers:
                mean_transfer = int(round(
                    sum(transfers) / float(len(transfers))))
                self.mTransferValues[mean_transfer] += 1

            if overhangs:
                mean_overhang = int(math.floor(
                    sum(overhangs) / float(len(overhangs))))
                if mean_overhang >= self.mMinOverhang:
                    self.mOverhangValues[mean_overhang] += 1

    #--------------------------------------------------------------------------
    def readPreviousData(self, filename=None):
        """Rebuild histograms from a previous data file.

        Processes existing output in *filename* (default:
        self.mFilenameData) so that computation can continue at the
        correct point.
        """

        if filename is None:
            filename = self.mFilenameData

        self.info("reading previous data from %s" % filename)

        if not os.path.exists(filename):
            self.warn("file %s does not exist" % filename)
            return

        # numpy.int was removed in numpy 1.24; builtin int is equivalent
        self.mTransferValues = numpy.array([0] * (self.mMaxSequenceLength + 1),
                                           int)
        self.mOverhangValues = numpy.array([0] * (self.mMaxSequenceLength + 1),
                                           int)

        values = []

        with open(filename, "r") as infile:
            for line in infile:
                # skip comments and the header line
                if line.startswith("#"):
                    continue
                if line.startswith("class"):
                    continue

                try:
                    (family, query_token, sbjct_token, transfer, overhang1,
                     overhang2) = line[:-1].split("\t")
                except ValueError:
                    self.warn("parsing error in line %s\n" % line[:-1])
                    continue

                # BUG FIX: the original swapped overhang1/overhang2 in
                # this conversion (harmless downstream only because both
                # overhangs are treated symmetrically); convert in
                # field order.
                transfer, overhang1, overhang2 = map(
                    int, (transfer, overhang1, overhang2))

                if transfer < self.mMinTransfer:
                    continue

                values.append((family, query_token, sbjct_token, transfer,
                               overhang1, overhang2))

        self.processValues(values)

        self.info("read previous data from %s: transfer=%i, overhang=%i" %
                  (filename, len(self.mTransferValues),
                   len(self.mOverhangValues)))

    #--------------------------------------------------------------------------
    def readPreviousDataFromDetails(self, filename=None):
        """Rebuild histograms from a previous details file.

        Processes existing output in *filename* (default:
        self.mFilenameDetails) so that computation can continue at the
        correct point; records the last (query, sbjct) pair seen in
        self.mContinueAt.
        """

        if filename is None:
            filename = self.mFilenameDetails

        self.info("reading previous data from %s" % filename)

        if not os.path.exists(filename):
            self.warn("file %s does not exist" % filename)
            return

        # numpy.int was removed in numpy 1.24; builtin int is equivalent
        self.mTransferValues = numpy.array([0] * (self.mMaxSequenceLength + 1),
                                           int)
        self.mOverhangValues = numpy.array([0] * (self.mMaxSequenceLength + 1),
                                           int)

        infile = open(filename, "r")

        def iterate_per_query(infile):
            """Yield one list of value tuples per query."""

            last_query = None
            values = []

            for line in infile:
                if line.startswith("#"): continue
                if line.startswith("class"): continue

                try:
                    (family, query_token, xfrom, xto, query_from, query_to,
                     sbjct_token, yfrom, yto, sbjct_from, sbjct_to, lali, lx,
                     ly, transfer, A, B, evalue) = line[:-1].split("\t")
                except ValueError:
                    self.warn("parsing error in line %s\n" % line[:-1])
                    continue

                if query_token != last_query:
                    if last_query: yield values
                    values = []
                    last_query = query_token

                transfer, lx, ly = map(int, (transfer, lx, ly))

                if transfer >= 0:
                    values.append((family, query_token, sbjct_token, transfer,
                                   lx - transfer, ly - transfer))

            if last_query:
                yield values
                self.mContinueAt = (query_token, sbjct_token)

            # BUG FIX: the original ended with ``raise StopIteration``,
            # which PEP 479 (Python 3.7+) converts into a RuntimeError;
            # falling off the end terminates the generator correctly.

        for values in iterate_per_query(infile):
            self.processValues(values)

        self.info("read previous data from %s: transfer=%i, overhang=%i" %
                  (filename, len(self.mTransferValues),
                   len(self.mOverhangValues)))

        infile.close()

    #--------------------------------------------------------------------------
    def applyMethod(self, neighbours):
        """estimate ADDA penalties.

        This method calculates the distribution of::

           lali / sqrt( d1 * d2 ) 

        The computation only concerns domains of the same class.
            
        For each class:
        get all nids that have that class
        for each pair of nids, check if there is a link

        Repeats cause some counts to be inflated (most pronounced with the immunoglobulins)

        For example:
          * nid1: 3 domains
          * nid2: 2 domains
        
        Alignments: depending on domain arrangement 1 to 3 * 2, or:
          * nid1: 2 domains
          * nid2: 1 domain
          * alignments: 1 or 2

        If you want to eliminate repeats: which one?
            
        This method normalizes per family and per sequence pair.
        """

        # collected (family, query, sbjct, transfer, overhang1, overhang2)
        # tuples; averaged per group in processValues() below
        values = []

        for n in neighbours.mMatches:

            # ignore links to self and those between nids without domains
            if n.mQueryToken == n.mSbjctToken or \
                    str(n.mQueryToken) not in self.mMapNid2Domains or \
                    str(n.mSbjctToken) not in self.mMapNid2Domains:
                continue

            # when resuming an aborted run, skip pairs until the recorded
            # continuation point has been reached, then clear the marker
            if self.mContinueAt:
                if (n.mQueryToken, n.mSbjctToken) == self.mContinueAt:
                    self.info("continuing processing at pair %s" %
                              str(self.mContinueAt))
                    self.mContinueAt = None
                continue

            qdomains = self.mMapNid2Domains[str(n.mQueryToken)]
            sdomains = self.mMapNid2Domains[str(n.mSbjctToken)]

            # only compare domains of families present on BOTH sequences
            for family in set(qdomains.keys()).intersection(
                    set(sdomains.keys())):
                xdomains = qdomains[family]
                ydomains = sdomains[family]

                # NOTE(review): these accumulators are never read below -
                # presumably left over from an earlier version
                total_transfer = 0
                ntransfer = 0
                total_overhang = 0
                noverhang = 0

                for xfrom, xto in xdomains:

                    ovlx = min(xto, n.mQueryTo) - max(xfrom, n.mQueryFrom)
                    # no overlap between domain and alignment on query
                    if ovlx < 0: continue
                    lx = xto - xfrom

                    for yfrom, yto in ydomains:

                        # no overlap between domain and alignment on sbjct
                        ovly = min(yto, n.mSbjctTo) - max(yfrom, n.mSbjctFrom)
                        if ovly < 0: continue
                        ly = yto - yfrom

                        # alignment length: the shorter of the two spans
                        lali = min(n.mSbjctTo - n.mSbjctFrom,
                                   n.mQueryTo - n.mQueryFrom)

                        # map domain from query to sbjct coordinates
                        # (assumes an ungapped offset mapping)
                        zfrom = max(xfrom - n.mQueryFrom + n.mSbjctFrom,
                                    n.mSbjctFrom)
                        zto = min(xto - n.mQueryFrom + n.mSbjctFrom,
                                  n.mSbjctTo)
                        # residues linked between the two domains
                        transfer = max(0, min(zto, yto) - max(zfrom, yfrom))

                        # A: percentage transfer (relative to alignment)
                        # B: transfer relative to geometric mean of lengths
                        A = float(transfer) / float(lali)
                        B = float(transfer) / math.sqrt(float(lx * ly))

                        # per-domain-pair record (verbose output only)
                        if self.mOutfileDetails:
                            self.mOutfileDetails.write( "\t".join( \
                                    map(str, (family,
                                              n.mQueryToken, xfrom, xto, n.mQueryFrom, n.mQueryTo,
                                              n.mSbjctToken, yfrom, yto, n.mSbjctFrom, n.mSbjctTo,
                                              lali, lx, ly, transfer, A, B, n.mEvalue) )) + "\n" )
                            self.mOutfileDetails.flush()

                        # raw data record used by merge()/readPreviousData()
                        if self.mOutfileData:
                            self.mOutfileData.write( "\t".join( \
                                    map(str, (family, n.mQueryToken, n.mSbjctToken,
                                              transfer, lx-transfer, ly-transfer) ) ) + "\n")
                            self.mOutfileData.flush()

                        if transfer >= 0:
                            values.append(
                                (family, n.mQueryToken, n.mSbjctToken,
                                 transfer, lx - transfer, ly - transfer))

        values.sort()
        self.processValues(values)

    #--------------------------------------------------------------------------
    def writeHistogram(self, outfile, bins, frequencies):
        '''write a histogram'''
        # emit one "<bin>\t<frequency>" line per bin
        rows = ["%i\t%f\n" % pair for pair in zip(bins, frequencies)]
        outfile.write("".join(rows))

    #--------------------------------------------------------------------------
    def truncateCounts(self, counts):
        '''truncate counts.

        Strips leading and trailing zero bins and returns
        ``(bins, counts)`` where ``bins`` are the surviving bin indices.

        Raises ValueError if all counts are zero.
        '''

        if sum(counts) == 0:
            raise ValueError("no counts")

        # find the last non-zero bin (exclusive upper bound)
        upper = len(counts) - 1
        while upper > 0 and counts[upper] == 0:
            upper -= 1
        upper += 1

        # find the first non-zero bin
        lower = 0
        while lower < len(counts) and counts[lower] == 0:
            lower += 1

        return numpy.arange(lower, upper), counts[lower:upper]

    #--------------------------------------------------------------------------
    def getCumulativeHistogram(self, counts, reverse=False):
        '''return a normalized and cumulative histogram for histogram.

        also truncates (leading/trailing zero bins are removed via
        :meth:`truncateCounts`).

        :param counts: sequence of per-bin counts.
        :param reverse: if True, cumulate right-to-left and return the
            values in the original bin order.
        :return: tuple ``(bins, y)`` where ``y`` is the cumulative
            distribution normalized so its maximum is 1.0.
        :raises ValueError: if all counts are zero (from truncateCounts).
        '''

        bins, histogram = self.truncateCounts(counts)

        # cumulate
        # BUG FIX: numpy.float was removed in NumPy 1.20+/1.24; the
        # builtin ``float`` is the documented drop-in replacement.
        if reverse:
            c = numpy.add.accumulate(numpy.array(histogram[::-1], float))
        else:
            c = numpy.add.accumulate(numpy.array(histogram, float))

        # normalize so the total (maximum cumulative count) becomes 1.0
        total = max(c)
        y = c / total

        # restore original bin order for the reverse-cumulative case
        if reverse: y = y[::-1].copy()

        return bins, y

    #--------------------------------------------------------------------------
    def finish(self):
        '''fit the transfer and overhang distributions and write the
        resulting parameters to the fit file.

        Skips the fit (with a warning) if fewer than ``self.min_counts``
        transfer or overhang values were collected.
        '''

        self.info("number of values: transfer=%i, overhang=%i" %
                  (len(self.mTransferValues), len(self.mOverhangValues)))

        if sum(self.mTransferValues) < self.min_counts or sum(
                self.mOverhangValues) < self.min_counts:
            self.warn(
                "no transfer or overhang values - no parameters computed")
            return

        self.mOutfile = self.openOutputStream(self.mFilenameFit,
                                              register=False)

        self.mOutfile.write("[optimise]\n")

        try:
            A, B, C, K = self.fitTransfer()
            self.mOutfile.write("sigmoid_min=%f\n" % A())
            self.mOutfile.write("sigmoid_max=%f\n" % B())
            self.mOutfile.write("sigmoid_k=%f\n" % K())
            self.mOutfile.write("sigmoid_c=%f\n" % C())
        # BUG FIX: use the py2.6+/py3-compatible ``as`` syntax, and report
        # the correct distribution - this handler covers the transfer fit.
        except ValueError as msg:
            self.warn("could not compute transfer values: %s" % msg)

        try:
            # local names avoid clobbering the module alias ``E`` used
            # elsewhere in this file (E.warn in __init__).
            paramE, paramF = self.fitOverhang()
            self.mOutfile.write("exponential_E=%f\n" % paramE())
            self.mOutfile.write("exponential_F=%f\n" % paramF())
        # BUG FIX: bind the exception - previously ``msg`` was either
        # undefined here (NameError) or stale from the first handler.
        except ValueError as msg:
            self.warn("could not compute overhang values: %s" % msg)

        self.mOutfile.close()

        ## close here, so that all is flushed before merge is called
        if self.mOutfileDetails: self.mOutfileDetails.close()

        AddaModuleRecord.finish(self)
Exemplo n.º 21
0
 def finish(self):
     """Close the output stream, then run the base-class cleanup."""
     stream = self.mOutfile
     stream.close()
     AddaModuleRecord.finish(self)
Exemplo n.º 22
0
    def __init__(self, *args, **kwargs):
        """Initialise the module and register the realignment output file."""

        AddaModuleRecord.__init__(self, *args, **kwargs)

        # output file for realignment results; configurable in the
        # [files] section, defaulting to "adda.realign".
        filename = self.mConfig.get("files", "output_realignment", "adda.realign")
        self.mFilenameRealignment = filename
        self.mFilenames = (filename, )
Exemplo n.º 23
0
 def finish(self):
     """Finalise the module: close output and delegate to the base class."""
     self.mOutfile.close()
     # base-class finish performs the shared bookkeeping
     AddaModuleRecord.finish(self)