Пример #1
0
    def maskSequences(self, sequences):
        '''Mask a collection of sequences by running an external command.

        The sequences are written in FASTA format to a temporary file,
        each one titled with its position in ``sequences``.  The command
        template ``self.mCommand`` is filled in from the local variables
        of this method and executed through the shell; whatever the
        command prints on stdout is parsed as FASTA.

        Arguments
        ---------
        sequences : iterable
            Sequences to mask; each is formatted with ``%s``.

        Returns
        -------
        list
            The ``sequence`` attribute of every FASTA entry read from
            the command's standard output, in output order.

        Raises
        ------
        RuntimeError
            If the command exits with a non-zero return code.
        '''

        # NOTE: ``outfile`` (open file descriptor) and ``infile`` (path of
        # the temporary file) must keep these names: self.mCommand is
        # interpolated from locals() below and may refer to them.
        outfile, infile = tempfile.mkstemp()

        try:
            # Always release the descriptor, even if a write fails.
            try:
                for x, s in enumerate(sequences):
                    os.write(outfile, ">%i\n%s\n" % (x, s))
            finally:
                os.close(outfile)

            statement = self.mCommand % locals()

            E.debug("statement: %s" % statement)

            # ``s`` is re-bound only after the template has been filled in.
            s = subprocess.Popen(statement,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 close_fds=True)

            (out, err) = s.communicate()

            if s.returncode != 0:
                raise RuntimeError(
                    "Error in running %s \n%s\nTemporary directory" %
                    (statement, err))

            result = [
                entry.sequence
                for entry in FastaIterator.iterate(StringIO.StringIO(out))
            ]
        finally:
            # Remove the temporary file on success AND on failure; the
            # original only cleaned up when everything went well.
            os.remove(infile)

        return result
Пример #2
0
def buildPFAMDomains(infiles, outfile):
    '''Map PFAM domains onto the current sequence collection.

    The mapping is done by ID lookup: the table ``nrdb50.fasta.tsv``
    (read from the current working directory) supplies a ``repid`` to
    ``nid`` map.  Every FASTA title in the first input file is expected
    to look like ``<pid>/<start>-<end> <pfam_id>;<description>;``.

    Arguments
    ---------
    infiles : sequence
        Only ``infiles[0]``, a FASTA file of PFAM domains, is used.
    outfile : string
        Tab-separated output without header; columns are the mapped
        nid, ``start - 1``, ``end`` and the PFAM id.

    Entries whose ``pid`` is not in the lookup table are skipped and
    counted as ``missed``.  A title that does not match the expected
    pattern is not handled and raises ``AttributeError``.
    '''

    infile = infiles[0]

    with IOTools.openFile("nrdb50.fasta.tsv") as inf:
        reader = csv.DictReader(inf, dialect='excel-tab')
        map_id2nid = {}
        for row in reader:
            map_id2nid[row['repid']] = row['nid']

    # Raw string: same pattern as before, without relying on Python
    # passing unknown escapes such as "\S" through unchanged.
    rx = re.compile(r"(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);")

    c = E.Counter()

    # Both handles are managed by ``with`` so that the output file is
    # closed (and flushed) even if parsing fails half-way through.
    with IOTools.openFile(outfile, "w") as outf:
        with IOTools.openFile(infile) as inf:
            for entry in FastaIterator.iterate(inf):
                c.input += 1
                pid, start, end, pfam_id, description = \
                    rx.match(entry.title).groups()

                # Keep the try body down to the lookup that can miss.
                try:
                    nid = map_id2nid[pid]
                except KeyError:
                    c.missed += 1
                    continue

                outf.write("%s\t%i\t%i\t%s\n" %
                           (nid, int(start) - 1, int(end), pfam_id))
                c.output += 1

    E.info(c)
Пример #3
0
    def maskSequences(self, sequences):
        '''Mask a collection of sequences by running an external command.

        Each sequence is written as a FASTA record, titled with its
        index in ``sequences``, to a temporary file.  ``self.mCommand``
        is then interpolated with this method's local variables, run
        through the shell, and its standard output parsed as FASTA.

        Arguments
        ---------
        sequences : iterable
            Sequences to mask; each is formatted with ``%s``.

        Returns
        -------
        list
            The ``sequence`` attribute of each FASTA entry found in the
            command's standard output, in output order.

        Raises
        ------
        RuntimeError
            If the command finishes with a non-zero return code.
        '''

        # ``outfile`` is the open descriptor and ``infile`` the path of
        # the temporary file.  These names are visible to self.mCommand
        # through locals() and therefore must not be renamed.
        outfile, infile = tempfile.mkstemp()

        try:
            try:
                for x, s in enumerate(sequences):
                    os.write(outfile, ">%i\n%s\n" % (x, s))
            finally:
                # Close the descriptor whether or not writing succeeded.
                os.close(outfile)

            statement = self.mCommand % locals()

            E.debug("statement: %s" % statement)

            # From here on ``s`` is the child process, not a sequence.
            s = subprocess.Popen(statement,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 close_fds=True)

            (out, err) = s.communicate()

            if s.returncode != 0:
                raise RuntimeError(
                    "Error in running %s \n%s\nTemporary directory" %
                    (statement, err))

            result = [
                entry.sequence
                for entry in FastaIterator.iterate(StringIO.StringIO(out))
            ]
        finally:
            # Previously the temporary file was left behind whenever the
            # command failed; now it is removed on every exit path.
            os.remove(infile)

        return result
def collectGenomeSizes(infile, outfile):
    '''Write the sequence length of each FASTA entry in a genome file.

    Arguments
    ---------
    infile : string
        FASTA file; its basename with the ``.fna`` suffix snipped off is
        used as the genome name.
    outfile : string
        Tab-separated output with the header ``genome``/``length`` and
        one row per FASTA entry found in ``infile``.
    '''
    # NOTE(review): not read anywhere in this function; presumably a flag
    # picked up by the pipeline framework - confirm before removing.
    to_cluster = True

    # Context managers close both files even if parsing fails; the
    # original never closed the input handle at all.
    with open(outfile, "w") as outf:
        outf.write("genome\tlength\n")
        with iotools.openFile(infile) as inf:
            # assume single fasta entry
            for fasta in FastaIterator.iterate(inf):
                name = P.snip(os.path.basename(infile), ".fna")
                # len() directly on the sequence; no throwaway list copy
                outf.write("%s\t%s\n" % (name, len(fasta.sequence)))
def collectGenomeSizes(infile, outfile):
    '''Output the sequence length for each FASTA entry of a genome.

    Arguments
    ---------
    infile : string
        FASTA file.  The genome name is the basename of this file with
        the ``.fna`` suffix removed.
    outfile : string
        Tab-separated table with columns ``genome`` and ``length``; one
        row is written per FASTA entry in ``infile``.
    '''
    # NOTE(review): unused within this function; presumably inspected by
    # the pipeline framework - verify before deleting.
    to_cluster = True

    # Both handles are released by ``with`` on every exit path; before,
    # the input handle was never closed and the output only on success.
    with open(outfile, "w") as outf:
        outf.write("genome\tlength\n")
        with IOTools.openFile(infile) as inf:
            # assume single fasta entry
            for fasta in FastaIterator.iterate(inf):
                name = P.snip(os.path.basename(infile), ".fna")
                # measure the sequence directly instead of copying it
                # into a list first
                outf.write("%s\t%s\n" % (name, len(fasta.sequence)))
Пример #6
0
def buildNrdb50(infile, outfile):
    '''Build nrdb50 by renumbering sequences.

    Every FASTA entry of ``infile`` receives a consecutive numeric id
    (``nid``), starting at 1.

    Arguments
    ---------
    infile : string
        FASTA file whose titles look like
        ``<cluster> <description> n=<size> Tax=<taxon> RepID=<repid>``.
    outfile : string
        FASTA output in which each entry is titled with its nid.  A
        companion table ``outfile + ".tsv"`` is written with the columns
        nid, pid, hid, description, cluster_size, taxon and repid, where
        hid is the result of ``computeHID`` on the sequence.

    A title that does not match the expected pattern is not handled and
    raises ``AttributeError``.
    '''

    # Raw string: identical pattern, but no reliance on Python leaving
    # unknown escapes such as "\S" untouched.
    rx = re.compile(r"(\S+) (.*) n=(\d+) Tax=(.*) RepID=(\S+)")

    # All three files are closed on every exit path; previously the
    # input was never closed and the outputs only on success.
    with IOTools.openFile(outfile, "w") as outf_fasta, \
            IOTools.openFile(outfile + ".tsv", "w") as outf_table:
        outf_table.write(
            "nid\tpid\thid\tdescription\tcluster_size\ttaxon\trepid\n")

        with IOTools.openFile(infile) as inf:
            for nid, entry in enumerate(FastaIterator.iterate(inf), 1):
                outf_fasta.write(">%i\n%s\n" % (nid, entry.sequence))
                cluster_name, description, cluster_size, taxon, repid = \
                    rx.match(entry.title).groups()
                hid = computeHID(entry.sequence)
                outf_table.write("\t".join(
                    (str(nid), cluster_name, hid, description,
                     cluster_size, taxon, repid)) + "\n")
Пример #7
0
def checkBlastRun(infiles, outfile):
    '''Build summary statistics on a BLAST run.

    Arguments
    ---------
    infiles : tuple
        ``(pairsdbfile, seqfile)``.  ``seqfile`` is a FASTA file whose
        titles are integer ids (nids).  ``pairsdbfile`` is a
        tab-separated file of links whose first two columns are the
        integer query and subject ids; lines starting with ``#//`` are
        skipped.
    outfile : string
        Table with the columns ``category``/``counts`` holding the
        number of nids, links, self links, distinct queries and distinct
        subjects.  Two further files, ``outfile + ".missing_queries.gz"``
        and ``outfile + ".missing_sbjcts.gz"``, list the nids that never
        occur as query or as subject, respectively.
    '''

    pairsdbfile, seqfile = infiles

    with IOTools.openFile(seqfile) as inf:
        nids = set(int(r.title) for r in FastaIterator.iterate(inf))

    query_ids, sbjct_ids = set(), set()
    total_results, self_links = 0, 0
    with IOTools.openFile(pairsdbfile) as inf:
        # Iterate over the file only.  The original additionally called
        # inf.readline() inside the loop, which consumed a second line
        # per iteration: every other record was dropped and the empty
        # string returned at end of file broke the unpacking below.
        for l in inf:
            if l.startswith("#//"):
                continue
            query_id, sbjct_id = l.split("\t")[:2]
            query_id, sbjct_id = int(query_id), int(sbjct_id)
            query_ids.add(query_id)
            sbjct_ids.add(sbjct_id)
            # Compare the parsed ids so that the self-link test agrees
            # with what is stored in the sets (a raw field may carry a
            # trailing newline when the line has only two columns).
            if query_id == sbjct_id:
                self_links += 1
            total_results += 1

    summary = (('nids', len(nids)),
               ('links', total_results),
               ('self', self_links),
               ('queries', len(query_ids)),
               ('sbjcts', len(sbjct_ids)))

    with IOTools.openFile(outfile, "w") as outf:
        outf.write("category\tcounts\n")
        for category, count in summary:
            outf.write("%s\t%i\n" % (category, count))

    # One listing per role: nids never seen as query / as subject.
    for suffix, seen_ids in (('.missing_queries.gz', query_ids),
                             ('.missing_sbjcts.gz', sbjct_ids)):
        with IOTools.openFile(outfile + suffix, 'w') as outf:
            outf.write('nid\n')
            outf.write(
                "\n".join(map(str, sorted(nids.difference(seen_ids)))) +
                "\n")
Пример #8
0
def buildSCOPDomains(infiles, outfile):
    '''Reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least 90%
      of its length.

    * domains overlapping on the same sequence with the same
      superfamily classification are merged.

    Arguments
    ---------
    infiles : tuple
        ``(linksfile, fastafile)``.  ``fastafile`` holds the domain
        sequences with titles of the form ``<id> <class> <description>``
        where ``<class>`` has four dot-separated levels
        (class.fold.superfamily.family).  ``linksfile`` holds one link
        per line with twelve whitespace-separated fields; header lines
        (starting with ``query_nid``) and comment lines (``#``) are
        skipped.
    outfile : string
        Tab-separated output with the columns nid, start, end and
        family, where family is the superfamily code built as ``00``
        followed by the class letter and the zero-padded fold and
        superfamily numbers.
    '''

    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9
    # only take first four fold classes
    classes = 'abcd'

    # Raw string: same pattern as before, without relying on Python
    # passing unknown escapes such as "\S" through unchanged.
    rx = re.compile(r'(\S+)\s(\S+)\s(.*)')
    id2class = {}
    with IOTools.openFile(fastafile) as inf:
        for x in FastaIterator.iterate(inf):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence))

    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    nid2domains = collections.defaultdict(list)
    with IOTools.openFile(linksfile) as inf:
        for line in inf:
            if line.startswith('query_nid'):
                continue
            if line.startswith('#'):
                continue
            counter.links += 1

            domain_id, nid, evalue, domain_start, domain_end, \
                sbjct_start, sbjct_end, \
                block_sizes, domain_starts, sbjct_starts, \
                bitscore, pid = line[:-1].split()

            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(
                int,
                (nid, domain_start, domain_end, sbjct_start, sbjct_end))

            family, length = id2class[domain_id]

            cls, fold, superfamily, family = family.split('.')
            if cls not in classes:
                continue
            # fraction of the domain sequence covered by this link
            if float(domain_end - domain_start) / length < min_coverage:
                continue
            counter.unmerged_domains += 1
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))

            nid2domains[nid].append((superfamily, sbjct_start, sbjct_end))

    counter.sequences = len(nid2domains)

    E.info('merging %i domains in %i sequences' %
           (counter.unmerged_domains, counter.sequences))

    # ``with`` closes the output even if merging raises.
    with IOTools.openFile(outfile, 'w') as outf:
        outf.write('nid\tstart\tend\tfamily\n')
        for nid, dd in sorted(nid2domains.items()):
            # itertools.groupby only groups ADJACENT items, so the
            # domains have to be ordered by superfamily first.  Without
            # the sort, two domains of the same superfamily separated by
            # a domain of another one ended up in different groups and
            # were never merged.
            by_family = sorted(dd, key=lambda x: x[0])
            for family, domains in itertools.groupby(
                    by_family, key=lambda x: x[0]):
                unmerged_domains = [(x[1], x[2]) for x in domains]
                merged_domains = Intervals.combine(unmerged_domains)
                for start, end in merged_domains:
                    counter.domains += 1
                    outf.write('%i\t%i\t%i\t%s\n' %
                               (nid, start, end, family))

    E.info(counter)