def blastContextGenes(self, njobs=2):
    """Classify the candidate context genes using ``njobs`` batch jobs.

    The records of ``self.cand_context_genes_fasta`` are dealt round-robin
    into ``njobs`` FASTA chunks named ``<self.intermediate>/context.<i>``.
    One ``context_gene.py`` command is submitted per chunk through
    ``quorum.Popen`` (presumably a batch-scheduler wrapper -- confirm
    against the quorum module), each writing to
    ``<self.intermediate>/contextout.<i>``.  After every job has been
    waited on, the per-job outputs that exist are concatenated, in job
    order, into ``self.blast_context_out``.

    Side effects: each generated batch file path is appended to
    ``self.batch_files`` and each submitted process to ``self.jobs``.

    Parameters
    ----------
    njobs : int
        Number of chunks / jobs to create (default 2).
    """
    # Parenthesised so the statement prints the same text on Python 2 and 3.
    print("Blasting Context Genes")

    # First split up the main bacteriocin file into a bunch of smaller files.
    split_fastafiles = ["%s/context.%d" % (self.intermediate, i)
                        for i in range(njobs)]
    out_classes = ["%s/contextout.%d" % (self.intermediate, i)
                   for i in range(njobs)]

    split_fastahandles = []
    try:
        for path in split_fastafiles:
            split_fastahandles.append(open(path, 'w'))
        index = 0
        for record in SeqIO.parse(self.cand_context_genes_fasta, "fasta"):
            if len(record.seq) <= 1:
                continue  # To weed out weird entries
            split_fastahandles[index].write(
                ">%s\n%s\n" % (str(record.id),
                               fasta.format(str(record.seq))))
            index = (index + 1) % njobs
    finally:
        # Close every chunk even if parsing or writing failed, so no
        # descriptor leaks and the data is flushed before jobs read it.
        for handle in split_fastahandles:
            handle.close()

    context_cmd = ' '.join([
        "module load anaconda; module load blast;module load blast+;",
        "python %s/src/genome/context_gene.py",
        "--training-directory=%s",
        "--training-labels=%s",
        "--query=%s",
        "--intermediate=%s",
        "--num-threads=%d",
        "--output=%s",
    ])

    # Release jobs.
    jobs = []
    for i in range(njobs):
        cmd = context_cmd % (self.rootdir,
                             self.training_directory,
                             self.training_labels,
                             split_fastafiles[i],
                             self.intermediate,
                             self.numThreads,
                             out_classes[i])
        batch_file = "%s/context_blast%i.%d.job" % (os.getcwd(), i,
                                                    os.getpid())
        self.batch_files.append(batch_file)
        proc = quorum.Popen(cmd, shell=True, batch_file=batch_file,
                            stdin=quorum.PIPE, stdout=quorum.PIPE,
                            threads=self.numThreads)
        proc.submit()
        jobs.append(proc)
        self.jobs.append(proc)

    # Submit everything first, then block, so the jobs can run concurrently.
    for job in jobs:
        job.wait()

    # Collect all of the results from the jobs.  A job that produced no
    # output file is skipped rather than treated as an error.
    with open(self.blast_context_out, 'w') as context_out:
        for out_class in out_classes:
            if os.path.exists(out_class):
                with open(out_class) as job_out:
                    shutil.copyfileobj(job_out, context_out)
def reformat(orgfaa, faafile):
    """Rewrite a protein FASTA file so each header is only the protein id.

    Every record id in ``orgfaa`` must split on ``|`` into exactly five
    fields; the fourth is kept as the new header (the original code named
    the fields ``gi, num, dbtype, protid, _``, i.e. ids shaped like
    ``gi|<num>|<dbtype>|<protid>|``).  Sequences are re-wrapped with
    ``fasta.format``.

    Parameters
    ----------
    orgfaa : str
        Path of the FASTA file to read.
    faafile : str
        Path of the FASTA file to write (overwritten if it exists).

    Raises
    ------
    ValueError
        If a record id does not contain exactly five ``|``-separated fields.
    """
    # The input is opened first so a missing source file does not leave a
    # freshly truncated output behind; both handles are closed on any exit.
    with open(orgfaa, 'r') as inhandle, open(faafile, 'w') as outhandle:
        for record in SeqIO.parse(inhandle, 'fasta'):
            _, _, _, protid, _ = record.id.split("|")
            outhandle.write(
                ">%s\n%s\n" % (protid, fasta.format(str(record.seq))))
def getFasta(txt, fastadb, fastaindex, fastaout):
    """Extract the envelope region of each listed hit into a FASTA file.

    Each non-blank line of ``txt`` must hold eight ``|``-separated fields:
    ``acc|clrname|full_evalue|hmm_st|hmm_end|env_st|env_end|description``.
    For every line the span ``env_st``..``env_end`` of sequence ``acc`` is
    fetched through ``fasta.Indexer(fastadb, fastaindex)`` and written to
    ``fastaout`` under the header ``>acc:<n> description``, where ``<n>``
    counts the records written so far, starting at 0.

    Parameters
    ----------
    txt : str
        Path of the ``|``-delimited hits file.
    fastadb : str
        Path of the FASTA database handed to ``fasta.Indexer``.
    fastaindex : str
        Path of the index handed to ``fasta.Indexer``.
    fastaout : str
        Path of the FASTA file to write (overwritten if it exists).

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly eight fields, or its
        envelope coordinates are not integers.
    """
    indexer = fasta.Indexer(fastadb, fastaindex)
    indexer.load()
    # Context managers guarantee the output is flushed and closed; the
    # original never closed it.
    with open(txt, 'r') as handle, open(fastaout, 'w') as outhandle:
        i = 0
        for ln in handle:
            ln = ln.rstrip()
            if not ln:
                # A blank (e.g. trailing) line carries no hit; skipping it
                # avoids failing the eight-way unpack below.
                continue
            (acc, _clrname, _full_evalue, _hmm_st, _hmm_end,
             env_st, env_end, description) = ln.split('|')
            # Only the envelope coordinates are needed to fetch the sequence.
            env_st, env_end = int(env_st), int(env_end)
            seq = fasta.format(indexer.fetch(acc, env_st, env_end))
            outhandle.write(">%s:%d %s\n%s\n" % (acc, i, description, seq))
            i += 1
def writeClusters(self, clusters, seq_dict, outhandle):
    """Write every gene of every cluster to ``outhandle`` as FASTA.

    ``clusters`` is an iterable of clusters, each an iterable of gene
    strings with ten ``|``-separated fields:
    ``acc|clrname|full_evalue|hmm_st|hmm_end|env_st|env_end|description|strand|protid``.
    The sequence of a gene is looked up in ``seq_dict`` under the tuple of
    those ten fields, with the e-value converted to ``float`` and the four
    coordinates to ``int``.  The header records the accession, the function
    (the part of ``clrname`` before its first ``.``), the envelope
    coordinates, strand, score, protein id, the zero-based cluster number
    and the description.
    """
    header_and_seq = ">accession=%s|function=%s|start=%s|end=%s|strand=%s|score=%s|protein_id=%s|cluster_%d|%s\n%s\n"
    for cluster_id, members in enumerate(clusters):
        for entry in members:
            (acc, clrname, evalue_txt, hmm_st_txt, hmm_end_txt,
             env_st_txt, env_end_txt, description, strand,
             protid) = entry.split("|")
            hmm_st = int(hmm_st_txt)
            hmm_end = int(hmm_end_txt)
            env_st = int(env_st_txt)
            env_end = int(env_end_txt)
            full_evalue = float(evalue_txt)
            function = clrname.split('.')[0]
            lookup_key = (acc, clrname, full_evalue, hmm_st, hmm_end,
                          env_st, env_end, description, strand, protid)
            sequence = seq_dict[lookup_key]
            outhandle.write(header_and_seq % (
                acc, function, env_st, env_end, strand, str(full_evalue),
                protid, cluster_id, description, fasta.format(sequence)))