def maskSequences(self, sequences):
    '''mask a collection of sequences.'''

    # write sequences to a temporary FASTA file, numbered 0..n-1 so that
    # the masked output can be matched back to the input order.
    # note: outfile is an OS-level file descriptor; infile is the path
    # (self.mCommand is expected to reference %(infile)s via locals()).
    outfile, infile = tempfile.mkstemp()
    for x, s in enumerate(sequences):
        os.write(outfile, ">%i\n%s\n" % (x, s))
    os.close(outfile)

    statement = self.mCommand % locals()

    E.debug("statement: %s" % statement)

    s = subprocess.Popen(statement,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)

    (out, err) = s.communicate()

    if s.returncode != 0:
        raise RuntimeError(
            "error in running %s\n%s\ntemporary file: %s" %
            (statement, err, infile))

    result = [
        x.sequence for x in FastaIterator.iterate(StringIO.StringIO(out))]

    os.remove(infile)

    return result
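# A minimal usage sketch, assuming a concrete Masker subclass (the name
# "MaskerSeg" and its command line are illustrative, not taken from this
# file) whose self.mCommand references %(infile)s:
#
#   masker = MaskerSeg()
#   masked = masker.maskSequences(["MKTAYIAKQR", "AAAAAAAAAA"])
#   # masked[0] corresponds to the first input sequence: the temporary
#   # FASTA numbers the entries 0..n-1 and the order is preserved.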
def buildPFAMDomains(infiles, outfile):
    '''map PFAM domains onto current sequence collection.

    The mapping is done by ID lookup.'''

    infile = infiles[0]

    # build a lookup from representative sequence id (repid) to nid
    with IOTools.openFile("nrdb50.fasta.tsv") as inf:
        reader = csv.DictReader(inf, dialect='excel-tab')
        map_id2nid = {}
        for row in reader:
            map_id2nid[row['repid']] = row['nid']

    rx = re.compile(r"(\S+)/(\d+)-(\d+)\s+(\S+);(.*);")

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    with IOTools.openFile(infile) as inf:
        for entry in FastaIterator.iterate(inf):
            c.input += 1
            pid, start, end, pfam_id, description = rx.match(
                entry.title).groups()
            try:
                # output coordinates are 0-based, half-open
                outf.write("%s\t%i\t%i\t%s\n" %
                           (map_id2nid[pid], int(start) - 1,
                            int(end), pfam_id))
            except KeyError:
                c.missed += 1
                continue
            c.output += 1
    outf.close()

    E.info(c)
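# A quick sanity check for the title regex above; the header is a made-up
# illustration of the expected "PID/start-end PFAM_ID;description;" format,
# not real data:
#
#   rx = re.compile(r"(\S+)/(\d+)-(\d+)\s+(\S+);(.*);")
#   rx.match("Q12345/10-120 PF00069;Protein kinase domain;").groups()
#   # -> ('Q12345', '10', '120', 'PF00069', 'Protein kinase domain')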
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True

    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(fasta.sequence)
        outf.write("%s\t%i\n" % (name, length))
    outf.close()
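# Expected input/output, assuming a hypothetical file "genome1.fna"
# containing a single 4000 bp contig:
#
#   collectGenomeSizes("genome1.fna", "genome1.sizes")
#   # genome1.sizes:
#   # genome   length
#   # genome1  4000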
def buildNrdb50(infile, outfile):
    '''build nrdb50

    Renumber sequences.'''

    outf_fasta = IOTools.openFile(outfile, "w")
    outf_table = IOTools.openFile(outfile + ".tsv", "w")
    outf_table.write(
        "nid\tpid\thid\tdescription\tcluster_size\ttaxon\trepid\n")

    rx = re.compile("(\S+) (.*) n=(\d+) Tax=(.*) RepID=(\S+)")

    nid = 1
    for entry in FastaIterator.iterate(IOTools.openFile(infile)):
        outf_fasta.write(">%i\n%s\n" % (nid, entry.sequence))
        cluster_name, description, cluster_size, taxon, repid = rx.match(
            entry.title).groups()
        hid = computeHID(entry.sequence)
        outf_table.write("\t".join((str(nid), cluster_name, hid,
                                    description, cluster_size,
                                    taxon, repid)) + "\n")
        nid += 1

    outf_fasta.close()
    outf_table.close()
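# The regex above expects UniRef50-style headers. A made-up example of a
# title it parses (not real data):
#
#   rx.match("UniRef50_P12345 Histidine kinase n=42 "
#            "Tax=Bacteria RepID=P12345_ECOLI").groups()
#   # -> ('UniRef50_P12345', 'Histidine kinase', '42',
#   #     'Bacteria', 'P12345_ECOLI')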
def checkBlastRun(infiles, outfile):
    '''build summary stats on file.'''

    pairsdbfile, seqfile = infiles

    nids = set()
    with IOTools.openFile(seqfile) as inf:
        for r in FastaIterator.iterate(inf):
            nids.add(int(r.title))

    with IOTools.openFile(pairsdbfile) as inf:
        query_ids, sbjct_ids = set(), set()
        total_results, self_links = 0, 0
        # iterate over the file directly; "#//" lines are record
        # separators and carry no data
        for l in inf:
            if l.startswith("#//"):
                continue
            query_id, sbjct_id = l.split("\t")[:2]
            query_ids.add(int(query_id))
            sbjct_ids.add(int(sbjct_id))
            if query_id == sbjct_id:
                self_links += 1
            total_results += 1

    outf = IOTools.openFile(outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("\t".join(map(str, ('nids', len(nids)))) + "\n")
    outf.write("\t".join(map(str, ('links', total_results))) + "\n")
    outf.write("\t".join(map(str, ('self', self_links))) + "\n")
    outf.write("\t".join(map(str, ('queries', len(query_ids)))) + "\n")
    outf.write("\t".join(map(str, ('sbjcts', len(sbjct_ids)))) + "\n")
    outf.close()

    outf = IOTools.openFile(outfile + '.missing_queries.gz', 'w')
    outf.write('nid\n')
    outf.write("\n".join(
        map(str, sorted(nids.difference(query_ids)))) + "\n")
    outf.close()

    outf = IOTools.openFile(outfile + '.missing_sbjcts.gz', 'w')
    outf.write('nid\n')
    outf.write("\n".join(
        map(str, sorted(nids.difference(sbjct_ids)))) + "\n")
    outf.close()
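# Input format assumed above: the pairsdb file is a tab-separated link
# table with query and sbjct nids in the first two columns; "#//" lines
# act as record separators and are skipped. Illustrative lines only:
#
#   1<TAB>2<TAB>...    counted as a link
#   1<TAB>1<TAB>...    counted as a link and as a self link
#   #//                skipped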
def buildSCOPDomains(infiles, outfile):
    '''reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least
      90% of its length.

    * domains overlapping on the same sequence with the same
      superfamily classification are merged.
    '''

    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9
    # only take first four fold classes
    classes = 'abcd'

    rx = re.compile(r'(\S+)\s(\S+)\s(.*)')

    id2class = {}
    with IOTools.openFile(fastafile) as inf:
        for x in FastaIterator.iterate(inf):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence))

    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    with IOTools.openFile(linksfile) as inf:
        nid2domains = collections.defaultdict(list)
        for line in inf:
            if line.startswith('query_nid'):
                continue
            if line.startswith('#'):
                continue
            counter.links += 1

            (domain_id, nid, evalue, domain_start, domain_end,
             sbjct_start, sbjct_end, block_sizes, domain_starts,
             sbjct_starts, bitscore, pid) = line[:-1].split()

            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(
                int, (nid, domain_start, domain_end,
                      sbjct_start, sbjct_end))

            family, length = id2class[domain_id]

            cls, fold, superfamily, family = family.split('.')
            if cls not in classes:
                continue
            # remove fragments
            if float(domain_end - domain_start) / length < min_coverage:
                continue
            counter.unmerged_domains += 1
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))

            nid2domains[nid].append((superfamily, sbjct_start, sbjct_end))

        counter.sequences = len(nid2domains)

    E.info('merging %i domains in %i sequences' %
           (counter.unmerged_domains, counter.sequences))

    outf = IOTools.openFile(outfile, 'w')
    outf.write('nid\tstart\tend\tfamily\n')
    for nid, dd in sorted(nid2domains.iteritems()):
        # sort by (family, start, end) so that groupby collects all
        # domains of a family and Intervals.combine sees ordered intervals
        for family, domains in itertools.groupby(
                sorted(dd), key=lambda x: x[0]):
            unmerged_domains = [(x[1], x[2]) for x in domains]
            merged_domains = Intervals.combine(unmerged_domains)
            for start, end in merged_domains:
                counter.domains += 1
                outf.write('%i\t%i\t%i\t%s\n' % (nid, start, end, family))
    outf.close()

    E.info(counter)
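# Illustration of the merge step, assuming Intervals.combine collapses
# overlapping (start, end) pairs into maximal non-overlapping intervals
# (as its use above implies):
#
#   Intervals.combine([(10, 50), (40, 90), (100, 120)])
#   # -> [(10, 90), (100, 120)]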