Example #1
 def create(self):
     fd, self.mFilename = tempfile.mkstemp()
     outfile = SegmentedFile.openfile( self.mFilename, "w", slice="00-10" )
     for x in range(10): outfile.write( "%i\n" % x )
     outfile.close()
     outfile = SegmentedFile.openfile( self.mFilename, "w", slice="10-20" )
     for x in range(10,20): outfile.write( "%i\n" % x )
     outfile.close()
Example #2
 def create(self):
     fd, self.mFilename = tempfile.mkstemp()
     outfile = SegmentedFile.openfile(self.mFilename, "w", slice="00-10")
     for x in range(10):
         outfile.write("%i\n" % x)
     outfile.close()
     outfile = SegmentedFile.openfile(self.mFilename, "w", slice="10-20")
     for x in range(10, 20):
         outfile.write("%i\n" % x)
     outfile.close()
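The two create() variants above write the same temporary file in two slices ("00-10" and "10-20"). The sketch below strings that together with the merge and read-back steps used by the checkContents() tests further down; it only assumes the SegmentedFile.openfile() and SegmentedFile.merge() calls shown in these examples, and the assertions mirror the Python 2 style of the original tests.

import tempfile

import SegmentedFile

# write the data in two slices, as in create() above
fd, filename = tempfile.mkstemp()
outfile = SegmentedFile.openfile(filename, "w", slice="00-10")
for x in range(10):
    outfile.write("%i\n" % x)
outfile.close()

outfile = SegmentedFile.openfile(filename, "w", slice="10-20")
for x in range(10, 20):
    outfile.write("%i\n" % x)
outfile.close()

# assumed behaviour, taken from the checkContents() tests below:
# merge() combines the slice files into a single file that can then
# be opened and read back as one stream
assert SegmentedFile.merge(filename) == True

infile = SegmentedFile.openfile(filename, "r")
data = [int(x) for x in infile]
assert data == range(20)    # Python 2, as in the original tests
infile.close()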
Example #3
    def outputSummaryGraph( self ):
        """analyse the alignments."""

        # NOTE: this early return disables the summary code below;
        # the method currently always reports an empty summary.
        return {}

        infile = SegmentedFile.openfile( self.mFilenameGraph, "r" )

        nlinks = 0
        queries, sbjcts = set(), set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith( "query_nid"): continue
            
            nlinks += 1
            query, sbjct = line[:-1].split("\t")[:2]
            queries.add( query )
            sbjcts.add( sbjct )
        infile.close()

        self.mOutfile.write( ">%s\n" % self.mFilenameGraph )
        self.mOutfile.write( "nlinks\t%i\n" % nlinks )
        self.mOutfile.write( "nqueries\t%i\t%5.2f\n" % (len(queries), 100.0 * len(queries) / self.mNNids ) )
        self.mOutfile.write( "nsbjcts\t%i\t%5.2f\n" % (len(sbjcts), 100.0 * len(sbjcts) / self.mNNids ) )
        nids = queries.union( sbjcts )
        self.mOutfile.write( "nnids\t%i\t%5.2f\n" % (len(nids), 100.0 * len(nids) / self.mNNids ) )
        
        return { 'nids' : len(nids), 'links' : nlinks }
Example #4
    def outputSummaryAlignments(self):
        """analyse the alignments."""

        infile = SegmentedFile.openfile(self.mFilenameAlignments, "r")

        ninput, naccepted = 0, 0
        nids, domains = set(), set()

        for line in infile:
            if line[0] == "#": continue
            if line.startswith("passed"): continue

            ninput += 1
            (code, query, sbjct, estimate,
             qstart, qend, qali, sstart, send, sali,
             score, naligned, ngaps, zscore) =\
             line[:-1].split("\t")

            nids.add(query.split("_")[0])
            nids.add(sbjct.split("_")[0])
            domains.add(query)
            domains.add(sbjct)

            if code == "+": naccepted += 1

        infile.close()

        self.mOutfile.write(">%s\n" % self.mFilenameAlignments)
        self.mOutfile.write("ntotal\t%i\n" % ninput)
        self.mOutfile.write("naccepted\t%i\n" % naccepted)
        self.mOutfile.write("nrejected\t%i\n" % (ninput - naccepted))

        return {'nids': len(nids), 'domains': len(domains)}
Example #5
    def outputSummaryMst(self):
        """analyse the alignments."""

        infile = SegmentedFile.openfile(self.mFilenameMst, "r")

        nlinks = 0
        nids, domains = set(), set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith("nid"): continue

            nlinks += 1
            query, sbjct = line[:-1].split("\t")[:2]
            nids.add(query.split("_")[0])
            nids.add(sbjct.split("_")[0])
            domains.add(query)
            domains.add(sbjct)

        infile.close()

        self.mOutfile.write(">%s\n" % self.mFilenameMst)
        self.mOutfile.write("nlinks\t%i\n" % nlinks)
        self.mOutfile.write("ndomains\t%i\n" % len(domains))
        self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                            (len(nids), 100.0 * len(nids) / self.mNNids))

        return {'nids': len(nids), 'domains': len(domains)}
Example #6
    def outputSummaryGraph(self):
        """analyse the alignments."""

        # NOTE: this early return disables the summary code below;
        # the method currently always reports an empty summary.
        return {}

        infile = SegmentedFile.openfile(self.mFilenameGraph, "r")

        nlinks = 0
        queries, sbjcts = set(), set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith("query_nid"): continue

            nlinks += 1
            query, sbjct = line[:-1].split("\t")[:2]
            queries.add(query)
            sbjcts.add(sbjct)
        infile.close()

        self.mOutfile.write(">%s\n" % self.mFilenameGraph)
        self.mOutfile.write("nlinks\t%i\n" % nlinks)
        self.mOutfile.write("nqueries\t%i\t%5.2f\n" %
                            (len(queries), 100.0 * len(queries) / self.mNNids))
        self.mOutfile.write("nsbjcts\t%i\t%5.2f\n" %
                            (len(sbjcts), 100.0 * len(sbjcts) / self.mNNids))
        nids = queries.union(sbjcts)
        self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                            (len(nids), 100.0 * len(nids) / self.mNNids))

        return {'nids': len(nids), 'links': nlinks}
Example #7
    def outputSummaryResult(self):
        """analyse the alignments."""

        infile = SegmentedFile.openfile(self.mFilenameResult, "r")

        ndomains = 0
        nids, families = set(), set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith("nid"): continue

            ndomains += 1
            nid, start, end, family = line[:-1].split("\t")
            nids.add(nid)
            families.add(family)

        infile.close()

        self.mOutfile.write(">%s\n" % self.mFilenameResult)
        self.mOutfile.write("ndomains\t%i\n" % ndomains)
        self.mOutfile.write("nfamilies\t%i\n" % len(families))
        self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                            (len(nids), 100.0 * len(nids) / self.mNNids))

        return {
            'nids': len(nids),
            'domains': ndomains,
            'families': len(families)
        }
Example #8
    def outputSummaryMst( self ):
        """analyse the alignments."""

        infile = SegmentedFile.openfile( self.mFilenameMst, "r" )

        nlinks = 0
        nids, domains = set(), set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith( "nid"): continue
            
            nlinks += 1
            query, sbjct = line[:-1].split("\t")[:2]
            nids.add( query.split("_")[0])
            nids.add( sbjct.split("_")[0])
            domains.add( query )
            domains.add( sbjct )

        infile.close()

        self.mOutfile.write( ">%s\n" % self.mFilenameMst )
        self.mOutfile.write( "nlinks\t%i\n" % nlinks )
        self.mOutfile.write( "ndomains\t%i\n" % len(domains) )
        self.mOutfile.write( "nnids\t%i\t%5.2f\n" % (len(nids), 100.0 * len(nids) / self.mNNids ) )
        
        return { 'nids' : len(nids), 'domains' : len(domains) }
Example #9
    def openOutputStream(self, filename, register = False ):
        """opens an output stream.
        
        If the output filename exists an error is raised unless
        1. mForce is set: the existing file will be overwritten
        2. mAppend is set: data will be appended. The registerExistingOutput
            method is called to give the module the chance to advance 
            the input stream to the appropriate point for continuation.
        
        If mSlice is set, the name will be mangled to reflect the slice.
        If register is true, registerExistingOutput will be called.
        """


        if self.mAppend:
            mode = "a"
        else:
            mode = "w"

        self.debug( "%s%s opening with mode %s" % (filename, self.getSlice(), mode ))
        return SegmentedFile.openfile( filename, 
                                       mode,
                                       slice = self.getSlice(),
                                       force = self.mForce,
                                       append_callback = self.readPreviousData,
                                       )
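A small usage sketch of the call pattern above, outside the module: the filename and slice string are placeholders, the no-op callback only stands in for self.readPreviousData (whose real signature is not shown in these examples), and the force/append semantics are as described in the docstring, not verified here.

import SegmentedFile

filename = "step_output.table"      # hypothetical output file
myslice = "00-10"                   # slice string, as in the write examples above

def read_previous_data(*args, **kwargs):
    # placeholder for self.readPreviousData; the real callback signature
    # is not shown in these examples
    pass

# fresh run (mAppend is False): mode "w"; force corresponds to mForce and
# allows an existing output file to be overwritten
outfile = SegmentedFile.openfile(filename, "w",
                                 slice=myslice,
                                 force=True,
                                 append_callback=read_previous_data)
outfile.close()

# resumed run (mAppend is True): mode "a"; the callback gives the caller a
# chance to read the partial output and continue at the right point
outfile = SegmentedFile.openfile(filename, "a",
                                 slice=myslice,
                                 force=False,
                                 append_callback=read_previous_data)
outfile.close()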
Example #10
    def outputSummaryAlignments( self ):
        """analyse the alignments."""

        infile = SegmentedFile.openfile( self.mFilenameAlignments, "r" )

        ninput, naccepted = 0, 0
        nids, domains = set(), set()

        for line in infile:
            if line[0] == "#": continue
            if line.startswith( "passed"): continue
            
            ninput += 1
            (code, query, sbjct, estimate, 
             qstart, qend, qali, sstart, send, sali, 
             score, naligned, ngaps, zscore) =\
             line[:-1].split("\t")

            nids.add( query.split("_")[0])
            nids.add( sbjct.split("_")[0])
            domains.add( query )
            domains.add( sbjct )
            
            if code == "+": naccepted += 1
            
        infile.close()

        self.mOutfile.write( ">%s\n" % self.mFilenameAlignments )
        self.mOutfile.write( "ntotal\t%i\n" % ninput )
        self.mOutfile.write( "naccepted\t%i\n" % naccepted )
        self.mOutfile.write( "nrejected\t%i\n" % (ninput - naccepted) )

        return { 'nids' : len(nids), 'domains' : len(domains) }
Example #11
 def checkContents(self):
     self.checkToken(self.mFilename)
     infile = SegmentedFile.openfile(self.mFilename,
                                     "r",
                                     has_header=self.mHasHeader)
     data = [int(x) for x in infile]
     self.assertEqual(data, range(20))
Example #12
 def checkContents(self):
     self.create()
     self.assertEqual( SegmentedFile.merge( self.mFilename ), True )
     self.checkToken( self.mFilename )
     infile = SegmentedFile.openfile( self.mFilename, "r" )
     data = [ x for x in infile ]
     self.assertEqual( data[1], "header1\n" )
     self.assertEqual( data[0], "#comment1\n" )
     self.assertEqual( data[12], "#comment2\n" )
     self.assertEqual( [int(x) for x in data[2:12] + data[13:]], range( 20 ) )
Example #13
 def checkContents(self):
     self.create()
     self.assertEqual(SegmentedFile.merge(self.mFilename), True)
     self.checkToken(self.mFilename)
     infile = SegmentedFile.openfile(self.mFilename, "r")
     data = [x for x in infile]
     self.assertEqual(data[1], "header1\n")
     self.assertEqual(data[0], "#comment1\n")
     self.assertEqual(data[12], "#comment2\n")
     self.assertEqual([int(x) for x in data[2:12] + data[13:]], range(20))
Example #14
    def getComponents( self ):
        '''return components.'''

        componentor = Components.SComponents()

        infile = SegmentedFile.openfile( self.mFilenameInput, "r" )

        ninput = 0
        for line in infile:
            if line[0] == "#": continue
            
            qdomain, sdomain = line[:-1].split("\t")[:2]
            componentor.add( qdomain, sdomain )
            ninput += 1

        self.info( "computing components with %i links" % ninput)

        return componentor.getComponents()
Example #15
    def getComponents(self):
        '''return components.'''

        componentor = Components.SComponents()

        infile = SegmentedFile.openfile(self.mFilenameInput, "r")

        ninput = 0
        for line in infile:
            if line[0] == "#": continue

            qdomain, sdomain = line[:-1].split("\t")[:2]
            componentor.add(qdomain, sdomain)
            ninput += 1

        self.info("computing components with %i links" % ninput)

        return componentor.getComponents()
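Components.SComponents itself is not shown in any of these examples; getComponents() only relies on an add(a, b) call to record a link and a getComponents() call to retrieve the groups, which suggests a connected-components container. The class below is a hypothetical stand-in with that two-method interface (a plain union-find), purely to illustrate what the snippet assumes; it is not the real Components module.

class SimpleComponents(object):
    """Hypothetical stand-in for Components.SComponents."""

    def __init__(self):
        self.mParent = {}

    def _find(self, x):
        # find the representative of x, with path halving
        self.mParent.setdefault(x, x)
        while self.mParent[x] != x:
            self.mParent[x] = self.mParent[self.mParent[x]]
            x = self.mParent[x]
        return x

    def add(self, a, b):
        # record a link: merge the components containing a and b
        self.mParent[self._find(a)] = self._find(b)

    def getComponents(self):
        # group all ids seen so far by their representative
        components = {}
        for x in list(self.mParent):
            components.setdefault(self._find(x), []).append(x)
        return components.values()

# usage mirroring the snippet above
componentor = SimpleComponents()
componentor.add("A_1", "B_1")
componentor.add("B_1", "C_1")
print(componentor.getComponents())

Whether the real SComponents returns components in this exact form is an assumption; the snippets here only depend on add() and getComponents() existing.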
Example #16
    def outputSummarySegments( self ):
        """analyse the alignments."""

        infile = SegmentedFile.openfile( self.mFilenameSegments, "r" )

        ndomains = 0
        nids = set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith( "nid"): continue
            
            ndomains += 1
            nid, node, parent, level, start, end = line[:-1].split("\t")
            nids.add(nid)

        infile.close()

        self.mOutfile.write( ">%s\n" % self.mFilenameSegments )
        self.mOutfile.write( "ndomains\t%i\n" % ndomains )
        self.mOutfile.write( "nnids\t%i\t%5.2f\n" % (len(nids), 100.0 * len(nids) / self.mNNids ) )

        return { 'nids' : len(nids), 'domains' : ndomains }
Example #17
    def outputSummaryNids( self ):
        
        infile = SegmentedFile.openfile( self.mFilenameNids, "r" )

        ndomains = 0
        nids = set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith( "nid"): continue
            
            nid, pid, hid, length, sequence = line[:-1].split("\t")
            nids.add(nid)

        infile.close()

        self.mNids = nids
        self.mNNids = len(self.mNids)

        self.mOutfile.write( ">%s\n" % self.mFilenameNids )
        self.mOutfile.write( "nnids\t%i\t%5.2f\n" % (len(nids), len(nids) / self.mNNids ) )

        return { 'nids' : len(nids) }
Example #18
    def outputSummaryNids(self):

        infile = SegmentedFile.openfile(self.mFilenameNids, "r")

        ndomains = 0
        nids = set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith("nid"): continue

            nid, pid, hid, length, sequence = line[:-1].split("\t")
            nids.add(nid)

        infile.close()

        self.mNids = nids
        self.mNNids = len(self.mNids)

        self.mOutfile.write(">%s\n" % self.mFilenameNids)
        self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                            (len(nids), len(nids) / self.mNNids))

        return {'nids': len(nids)}
Example #19
    def outputSummarySegments(self):
        """analyse the alignments."""

        infile = SegmentedFile.openfile(self.mFilenameSegments, "r")

        ndomains = 0
        nids = set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith("nid"): continue

            ndomains += 1
            nid, node, parent, level, start, end = line[:-1].split("\t")
            nids.add(nid)

        infile.close()

        self.mOutfile.write(">%s\n" % self.mFilenameSegments)
        self.mOutfile.write("ndomains\t%i\n" % ndomains)
        self.mOutfile.write("nnids\t%i\t%5.2f\n" %
                            (len(nids), 100.0 * len(nids) / self.mNNids))

        return {'nids': len(nids), 'domains': ndomains}
Example #20
    def outputSummaryClusters( self ):
        """analyse the alignments."""

        infile = SegmentedFile.openfile( self.mFilenameClusters, "r" )

        ndomains = 0
        nids, families = set(), set()
        for line in infile:
            if line[0] == "#": continue
            if line.startswith( "nid"): continue
            
            ndomains += 1
            nid, start, end, family = line[:-1].split("\t")
            nids.add(nid)
            families.add(family)

        infile.close()

        self.mOutfile.write( ">%s\n" % self.mFilenameClusters )
        self.mOutfile.write( "ndomains\t%i\n" % ndomains )
        self.mOutfile.write( "nfamilies\t%i\n" % len(families) )
        self.mOutfile.write( "nnids\t%i\t%5.2f\n" % (len(nids), 100.0 * len(nids) / self.mNNids ) )

        return { 'nids' : len(nids), 'domains' : ndomains, 'families': len(families) }
Example #21
 def checkContents(self):
     infile = SegmentedFile.openfile(self.mFilename, "r")
     data = [int(x) for x in infile]
     self.assertEqual(data, range(10))
Example #22
 def checkContents( self ):
     infile = SegmentedFile.openfile( self.mFilename, "r" )
     data = [int(x) for x in infile ]
     self.assertEqual( data, range( 10 ) )
Example #23
 def checkContents(self):
     self.checkToken(self.mFilename)
     infile = SegmentedFile.openfile(self.mFilename, "r")
     data = [x for x in infile]
     self.assertEqual(data[0], "header\n")
     self.assertEqual([int(x) for x in data[1:]], range(20))
Example #24
 def create(self):
     outfile = SegmentedFile.openfile(self.mFilename, "w")
     for x in range(10):
         outfile.write("%i\n" % x)
     outfile.close()
Example #25
 def checkContents( self ):
     self.checkToken( self.mFilename )
     infile = SegmentedFile.openfile( self.mFilename, "r", has_header = self.mHasHeader )
     data = [ x for x in infile ]
     self.assertEqual( data[0], "header\n" )
     self.assertEqual( [int(x) for x in data[1:]], range( 20 ) )
Example #26
    def applyMethod(self ):
        """apply the method.
        """

        infile = SegmentedFile.openfile( self.mFilenameClusters, "r" )

        family2domains = collections.defaultdict( list )
        nid2domains = collections.defaultdict( list )

        ndomains = 0
        for line in infile:
            if line[0] == "#": continue
            if line.startswith("nid"): continue
            nid, start, end, family = line[:-1].split("\t")
            nid = int(nid)
            nid2domains[nid].append( (int(start),int(end),family) )
            family2domains[family].append( (nid,int(end)-int(start) ) )
            ndomains += 1

        self.info( "collected: nsequences=%i, ndomains=%i, nfamilies=%i" %\
                       (len(nid2domains), ndomains, len(family2domains) ) )
        
        family_id = len(family2domains) 

        self.mOutfile.write( "nid\tstart\tend\tfamily\n" )

        # output domains per nid
        seqs = self.mFasta.getContigSizes()
        nids = sorted(seqs.keys())

        nfull_singletons = 0
        npartial_singletons = 0
        ndomains = 0

        # compute stats at the same time
        seq_lengths = seqs.values()
        max_length = max(seq_lengths)
        # compute summary per family
        # and compute full histograms of length distributions
        hist_domains_mst = numpy.zeros( max_length + 1, numpy.float)
        hist_domains_full_singletons = numpy.zeros( max_length + 1, numpy.float)
        hist_domains_partial_singletons = numpy.zeros( max_length + 1, numpy.float)
        hist_sequences = numpy.zeros( max_length + 1, numpy.float)
        for x in seq_lengths: hist_sequences[x] += 1

        for nid in nids:
             length = self.mFasta.getLength( nid )
             id = self.mMapNid2Id[ nid ]

             if nid not in nid2domains:
                 family_id += 1
                 self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                          ( id, 0, length, self.mPatternFamily % family_id ) )
                 family2domains[ self.mPatternFamily % family_id ].append( (nid, length) )
                 nfull_singletons += 1
                 hist_domains_full_singletons[length] += 1
                 continue

             domains = nid2domains[nid]
             domains.sort()

             last = 0
             for start, end, family in domains:
                 hist_domains_mst[end-start] += 1

                 if start - last > self.mMinDomainSize:
                     family_id += 1
                     self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                              ( id, last, start, self.mPatternFamily % family_id ) )

                     npartial_singletons += 1
                     family2domains[ self.mPatternFamily % family_id ].append( (nid, start-last) )
                     ndomains += 1
                     hist_domains_partial_singletons[start-last] += 1

                 self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                          ( id, start, end, family ) )
                 
                 last = end
                 ndomains += 1

             if length - last > self.mMinDomainSize:
                 family_id += 1
                 self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                                     ( id, last, length, self.mPatternFamily % family_id ) )
                 npartial_singletons += 1
                 family2domains[ self.mPatternFamily % family_id ].append( (nid, length-last) )
                 hist_domains_partial_singletons[length-last] += 1
                 ndomains += 1

        self.info( "output: nsequences=%i, ndomains=%i,nfamilies=%i, nfull_singletons=%i, npartial_singletons=%i" % (len(nids), ndomains, len(family2domains), npartial_singletons, nfull_singletons))

        self.mOutfileFamilies.write( "family\tnunits\tnsequences\tnresidues\tlength\tlength_median\tlength_stddev\n" )

        family_size_sequences, family_size_domains = [], []

        for family in sorted(family2domains.keys()):
            nids = set()
            lengths = []

            for nid, length in family2domains[family]:
                lengths.append( length )
                nids.add(nid)

            ndomains = len(lengths)
                
            self.mOutfileFamilies.write( "\t".join( (family,
                                                     str(ndomains),
                                                     str(len(nids)),
                                                     str(sum(lengths)),
                                                     "%5.2f" % numpy.mean(lengths),
                                                     "%5.2f" % numpy.median(lengths),
                                                     "%5.2f" % numpy.std(lengths) ) ) + "\n" )

            family_size_sequences.append( len(nids) )
            family_size_domains.append( ndomains )


        if PLOT:
            ## output length distributions
            lines, legends = [], []
            for title, vals in (
                ("sequences", hist_sequences), 
                ("domains", hist_domains_mst), 
                ("partial singletons", hist_domains_full_singletons),
                ("full singletons", hist_domains_partial_singletons), ):

                vv = numpy.zeros( max_length )
                for x in range( 0, max_length, 10 ):
                    vv[x] = sum( vals[x:x+10] )
                x = numpy.flatnonzero( vv > 0 )
                s = sum(vals)
                if s > 0: vv /= s

                lines.append( pylab.plot( x, vv[x] ) )
                legends.append( title )

            pylab.xlabel( "sequence or domain length / residues" )
            pylab.ylabel( "relative frequency" )
            pylab.legend( lines, legends )
            pylab.savefig( os.path.expanduser( self.mFilenameDomains + "_domainsizes_all.png" ) )

            pylab.xlim( 0, 2000 )
            pylab.savefig( os.path.expanduser( self.mFilenameDomains + "_domainsizes_small.png" ) )

            pylab.xlim( max_length - max_length // 4, max_length + 1 )
            pylab.savefig( os.path.expanduser( self.mFilenameDomains + "_domainsize_large.png" ) )

            pylab.clf()

            ## output domain family sizes
            lines = []
            (yvals, xvals) = numpy.histogram( family_size_sequences, bins=50, new = True)
            lines.append( pylab.loglog( xvals[:-1], yvals ) )
            (yvals, xvals) = numpy.histogram( family_size_domains, bins=50, new = True)
            lines.append( pylab.loglog( xvals[:-1], yvals ) )

            pylab.legend( lines, ( "sequeces", "domains") )
            pylab.xlabel( "sequences/domains per family" )
            pylab.ylabel( "relative frequency" )
            pylab.savefig( os.path.expanduser( self.mFilenameDomains + "_familysizes.png" ) )
Example #27
    def applyMethod(self):
        """apply the method.
        """

        infile = SegmentedFile.openfile(self.mFilenameClusters, "r")

        family2domains = collections.defaultdict(list)
        nid2domains = collections.defaultdict(list)

        ndomains = 0
        for line in infile:
            if line[0] == "#": continue
            if line.startswith("nid"): continue
            nid, start, end, family = line[:-1].split("\t")
            nid = int(nid)
            nid2domains[nid].append((int(start), int(end), family))
            family2domains[family].append((nid, int(end) - int(start)))
            ndomains += 1

        self.info( "collected: nsequences=%i, ndomains=%i, nfamilies=%i" %\
                       (len(nid2domains), ndomains, len(family2domains) ) )

        family_id = len(family2domains)

        self.mOutfile.write("nid\tstart\tend\tfamily\n")

        # output domains per nid
        seqs = self.mFasta.getContigSizes()
        nids = sorted(seqs.keys())

        nfull_singletons = 0
        npartial_singletons = 0
        ndomains = 0

        # compute stats at the same time
        seq_lengths = seqs.values()
        max_length = max(seq_lengths)
        # compute summary per family
        # and compute full histograms of length distributions
        hist_domains_mst = numpy.zeros(max_length + 1, numpy.float)
        hist_domains_full_singletons = numpy.zeros(max_length + 1, numpy.float)
        hist_domains_partial_singletons = numpy.zeros(max_length + 1,
                                                      numpy.float)
        hist_sequences = numpy.zeros(max_length + 1, numpy.float)
        for x in seq_lengths:
            hist_sequences[x] += 1

        for nid in nids:
            length = self.mFasta.getLength(nid)
            id = self.mMapNid2Id[nid]

            if nid not in nid2domains:
                family_id += 1
                self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                         ( id, 0, length, self.mPatternFamily % family_id ) )
                family2domains[self.mPatternFamily % family_id].append(
                    (nid, length))
                nfull_singletons += 1
                hist_domains_full_singletons[length] += 1
                continue

            domains = nid2domains[nid]
            domains.sort()

            last = 0
            for start, end, family in domains:
                hist_domains_mst[end - start] += 1

                if start - last > self.mMinDomainSize:
                    family_id += 1
                    self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                             ( id, last, start, self.mPatternFamily % family_id ) )

                    npartial_singletons += 1
                    family2domains[self.mPatternFamily % family_id].append(
                        (nid, start - last))
                    ndomains += 1
                    hist_domains_partial_singletons[start - last] += 1

                self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                         ( id, start, end, family ) )

                last = end
                ndomains += 1

            if length - last > self.mMinDomainSize:
                family_id += 1
                self.mOutfile.write( "%s\t%s\t%s\t%s\n" % \
                                                    ( id, last, length, self.mPatternFamily % family_id ) )
                npartial_singletons += 1
                family2domains[self.mPatternFamily % family_id].append(
                    (nid, length - last))
                hist_domains_partial_singletons[length - last] += 1
                ndomains += 1

        self.info(
            "output: nsequences=%i, ndomains=%i, nfamilies=%i, nfull_singletons=%i, npartial_singletons=%i"
            % (len(nids), ndomains, len(family2domains), nfull_singletons,
               npartial_singletons))

        self.mOutfileFamilies.write(
            "family\tnunits\tnsequences\tnresidues\tlength\tlength_median\tlength_stddev\n"
        )

        family_size_sequences, family_size_domains = [], []

        for family in sorted(family2domains.keys()):
            nids = set()
            lengths = []

            for nid, length in family2domains[family]:
                lengths.append(length)
                nids.add(nid)

            ndomains = len(lengths)

            self.mOutfileFamilies.write("\t".join(
                (family, str(ndomains), str(len(nids)), str(sum(lengths)),
                 "%5.2f" % numpy.mean(lengths), "%5.2f" %
                 numpy.median(lengths), "%5.2f" % numpy.std(lengths))) + "\n")

            family_size_sequences.append(len(nids))
            family_size_domains.append(ndomains)

        if PLOT:
            ## output length distributions
            lines, legends = [], []
            for title, vals in (
                ("sequences", hist_sequences),
                ("domains", hist_domains_mst),
                ("partial singletons", hist_domains_full_singletons),
                ("full singletons", hist_domains_partial_singletons),
            ):

                vv = numpy.zeros(max_length)
                for x in range(0, max_length, 10):
                    vv[x] = sum(vals[x:x + 10])
                x = numpy.flatnonzero(vv > 0)
                s = sum(vals)
                if s > 0: vv /= s

                lines.append(pylab.plot(x, vv[x]))
                legends.append(title)

            pylab.xlabel("sequence or domain length / residues")
            pylab.ylabel("relative frequency")
            pylab.legend(lines, legends)
            pylab.savefig(
                os.path.expanduser(self.mFilenameDomains +
                                   "_domainsizes_all.png"))

            pylab.xlim(0, 2000)
            pylab.savefig(
                os.path.expanduser(self.mFilenameDomains +
                                   "_domainsizes_small.png"))

            pylab.xlim(max_length - max_length // 4, max_length + 1)
            pylab.savefig(
                os.path.expanduser(self.mFilenameDomains +
                                   "_domainsize_large.png"))

            pylab.clf()

            ## output domain family sizes
            lines = []
            (yvals, xvals) = numpy.histogram(family_size_sequences,
                                             bins=50,
                                             new=True)
            lines.append(pylab.loglog(xvals[:-1], yvals))
            (yvals, xvals) = numpy.histogram(family_size_domains,
                                             bins=50,
                                             new=True)
            lines.append(pylab.loglog(xvals[:-1], yvals))

            pylab.legend(lines, ("sequeces", "domains"))
            pylab.xlabel("sequences/domains per family")
            pylab.ylabel("relative frequency")
            pylab.savefig(
                os.path.expanduser(self.mFilenameDomains + "_familysizes.png"))
Example #28
 def create(self):
     outfile = SegmentedFile.openfile( self.mFilename, "w" )
     for x in range(10):
         outfile.write( "%i\n" % x )
     outfile.close()