Пример #1
0
    def blastContextGenes(self,njobs=2):
        """Blast the candidate context genes using njobs parallel jobs.

        The records of self.cand_context_genes_fasta are dealt round-robin
        into njobs smaller FASTA files under self.intermediate, one
        context_gene.py job is submitted through quorum for each of them,
        and once every job has been waited on the per-job outputs that
        exist are concatenated into self.blast_context_out.

        njobs: number of split files, and therefore jobs, to create.

        Side effects: every batch file name is appended to
        self.batch_files and every submitted job to self.jobs.
        """
        # A parenthesised single string prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("Blasting Context Genes")
        split_fastafiles = ["%s/context.%d"%(self.intermediate,i)
                            for i in xrange(njobs)]
        out_classes = ["%s/contextout.%d"%(self.intermediate,i)
                       for i in xrange(njobs)]

        # First split up the main context gene file into a bunch of smaller
        # files.  The handles are closed in a finally clause so that a
        # parsing or write error cannot leave them open.
        split_fastahandles = []
        try:
            for f in split_fastafiles:
                split_fastahandles.append(open(f,'w'))
            index=0
            for record in SeqIO.parse(self.cand_context_genes_fasta,"fasta"):
                if len(record.seq)<=1: continue #To weed out weird entries
                split_fastahandles[index].write(">%s\n%s\n"%(str(record.id),
                                                             fasta.format(str(record.seq))))
                index=(index+1)%njobs
        finally:
            for handle in split_fastahandles: handle.close()

        context_cmd = ' '.join([
                                 """module load anaconda; module load blast;module load blast+;""",
                                 """python %s/src/genome/context_gene.py""",
                                 """--training-directory=%s""",
                                 """--training-labels=%s""",
                                 """--query=%s""",
                                 """--intermediate=%s""",
                                 """--num-threads=%d""",
                                 """--output=%s"""
                                 ])

        # Release jobs
        jobs = []
        for i in xrange(njobs):
            cmd = context_cmd%(self.rootdir,
                               self.training_directory,
                               self.training_labels,
                               split_fastafiles[i],
                               self.intermediate,
                               self.numThreads,
                               out_classes[i]
                               )

            # The process id keeps batch file names from concurrent runs apart.
            batch_file = "%s/context_blast%i.%d.job"%(os.getcwd(),i,os.getpid())
            self.batch_files.append(batch_file)

            proc = quorum.Popen( cmd,shell=True,batch_file=batch_file,
                                 stdin=quorum.PIPE,stdout=quorum.PIPE ,threads=self.numThreads)
            proc.submit()
            jobs.append(proc)
            self.jobs.append(proc)
        for job in jobs: job.wait()

        # Collect all of the results from the jobs.  An output file that is
        # missing is skipped silently; every file that is opened is closed
        # again by its with-block.
        with open(self.blast_context_out,'w') as context_out:
            for out_class in out_classes:
                if os.path.exists(out_class):
                    with open(out_class) as result:
                        shutil.copyfileobj(result,context_out)
Пример #2
0
def reformat(orgfaa,faafile):
    """Rewrite a FASTA file so that each record is named by its protein id.

    Every record id read from orgfaa must consist of exactly five
    '|'-separated fields; the fourth one is used as the new record name.
    Sequences are passed through fasta.format before being written.

    orgfaa: path of the FASTA file to read.
    faafile: path of the FASTA file to write (overwritten if it exists).

    Raises ValueError when a record id does not have exactly five fields.
    """
    # Both files are managed by with-blocks so they are closed even when a
    # record id is malformed.
    with open(faafile,'w') as outhandle:
        with open(orgfaa,'r') as inhandle:
            for record in SeqIO.parse(inhandle,'fasta'):
                # Only the fourth field is needed; the unpacking still
                # enforces the five-field layout.
                _,_,_,protid,_ = record.id.split("|")
                outhandle.write(">%s\n%s\n"%(protid,fasta.format(str(record.seq))))
    
    
    

    
    
    
Пример #3
0
def getFasta(txt,fastadb,fastaindex,fastaout):
    """Write the envelope region of every hit listed in txt as FASTA.

    Each line of txt must hold eight '|'-separated fields: accession,
    cluster name, full e-value, hmm start, hmm end, envelope start,
    envelope end and description.  For every line the region
    env_st..env_end of the accession is fetched through a fasta.Indexer
    built from fastadb and fastaindex, and written to fastaout under the
    header ">accession:line_number description".

    txt: path of the '|'-delimited hit table.
    fastadb: FASTA database handed to fasta.Indexer.
    fastaindex: index file handed to fasta.Indexer.
    fastaout: path of the FASTA file to write (overwritten if it exists).

    Raises ValueError when a line does not have eight fields or when a
    numeric field cannot be parsed.
    """
    # The output file is opened first, as before, but inside a with-block so
    # that it is flushed and closed on success as well as on error.
    with open(fastaout,'w') as outhandle:
        indexer = fasta.Indexer(fastadb,fastaindex)
        indexer.load()
        with open(txt,'r') as handle:
            # i is the zero-based line number and makes each header unique.
            for i,ln in enumerate(handle):
                toks = ln.rstrip().split('|')
                acc,clrname,full_evalue,hmm_st,hmm_end,env_st,env_end,description=toks
                # Only the envelope coordinates are used below; the other
                # conversions are kept because they reject malformed lines.
                full_evalue = float(full_evalue)
                hmm_st,hmm_end,env_st,env_end = map(int,[hmm_st,hmm_end,env_st,env_end])
                seq = fasta.format(indexer.fetch(acc,env_st,env_end))
                outhandle.write(">%s:%d %s\n%s\n"%(acc,i,description,seq))
Пример #4
0
 def writeClusters(self,clusters,seq_dict,outhandle):
     """Write every gene of every cluster to outhandle as a FASTA record.

     clusters: iterable of clusters, each an iterable of gene strings
         made of ten '|'-separated fields (accession, cluster name, full
         e-value, hmm start, hmm end, envelope start, envelope end,
         description, strand, protein id).
     seq_dict: maps the parsed ten-field tuple (e-value as float,
         coordinates as int) to the sequence of that gene.
     outhandle: object with a write method that receives the records.

     Clusters are numbered from 0 in iteration order, and the number is
     embedded in the header of each of their genes.
     """
     header = ">accession=%s|function=%s|start=%s|end=%s|strand=%s|score=%s|protein_id=%s|cluster_%d|%s\n%s\n"
     for cluster_id,members in enumerate(clusters):
         for entry in members:
             (acc,clrname,evalue,hmm_st,hmm_end,
              env_st,env_end,description,strand,protid) = entry.split("|")
             # Convert the numeric fields so the tuple matches the seq_dict key.
             hmm_st,hmm_end,env_st,env_end = map(int,[hmm_st,hmm_end,env_st,env_end])
             evalue = float(evalue)
             key = (acc,clrname,evalue,hmm_st,hmm_end,
                    env_st,env_end,description,strand,protid)
             sequence = seq_dict[key]
             # The function label is the cluster name up to its first dot.
             function = clrname.split('.')[0]
             record = header%(acc,function,env_st,env_end,strand,
                              str(evalue),protid,cluster_id,description,
                              fasta.format(sequence))
             outhandle.write(record)