Пример #1
0
 def endFork(self,fdict):   ### Ends fork, tidies and sets new one running
     '''
     Ends fork, tidies and sets new one running.

     Any per-fork results files and the fork log are merged into the main outputs, the end of the fork is reported,
     and self.nextFork() is then called whether or not the tidying succeeded.
     :param fdict: dict = fork data. Keys read here: 'ResFile' (list of results file extensions), 'FID' (fork file
         prefix), 'Log' (fork log file), 'cmd' (presence selects partial log transfer) and 'PID' (used in messages).
     :raise IOError: re-raised once the self.int['IOError'] allowance has counted down to 1.
     '''
     try:### ~ [1] ~ End and tidy current job ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if 'ResFile' in fdict:
             for resfile in fdict['ResFile']:
                 fromfile = '%s.%s' % (fdict['FID'],resfile)
                 if not rje.exists(fromfile): continue   # Missing fork results are skipped without a warning
                 tofile = '%s.%s' % (self.baseFile(),resfile)
                 if rje.exists(tofile):
                     # Append all but the first line (presumably a header row already present in tofile - confirm)
                     with open(fromfile,'r') as FROM: reslines = FROM.readlines()[1:]
                     with open(tofile,'a') as TO: TO.writelines(reslines)
                     os.unlink(fromfile)     # Content is merged, so do not leave the fork file behind
                 else: rje.fileTransfer(fromfile,tofile)
         if 'Log' in fdict:
             if 'cmd' in fdict:
                 # Only lines 6 to penultimate of the fork log are copied across
                 with open(fdict['Log'],'r') as FORKLOG: loglines = FORKLOG.readlines()[5:-1]
                 with open(self.log.info['LogFile'],'a') as MAINLOG: MAINLOG.writelines(loglines)
                 os.unlink(fdict['Log'])
             else: rje.fileTransfer(fdict['Log'],self.log.info['LogFile'])
             if self.getBool('LogFork'):
                 self.printLog('#END','Fork %s ended: log content transferred' % fdict['PID'])
                 self.printLog('#~~#','#~~#',timeout=False)
         elif 'PID' in fdict and string.split('%s' % fdict['PID'])[0] == 'WAIT': pass    # No message for WAIT forks
         else: self.printLog('#END','Fork %s ended.' % fdict['PID'])
     except IOError:
         # IOErrors are tolerated until the counter reaches 1, then passed up to the caller
         if self.getInt('IOError') == 1: self.errorLog('Forker.endFork IOError limit reached'); raise
         else: self.int['IOError'] -= 1; self.errorLog('Forker.endFork')
     except: self.errorLog('Forker.endFork error')
     self.nextFork()   # Carry on regardless
Пример #2
0
 def restFullOutput(self,maxparsesize=0):   ### Returns full REST output from file
     '''
     Returns full REST output as a single text string.

     If the RestIn file exists and force is not set, its content is returned as is. Otherwise the text is assembled
     from self.dict['Output']['intro'] followed by one delimited section per key in self.list['RestKeys']. When a
     jobid is set and the first line of an output value is an existing file, that file's content is used in place of
     the value (except for *.png files, where the file name is given, and files exceeding maxparsesize).
     :param maxparsesize: int [0] = maximum file size (bytes) to include in the output. No limit if 0.
     :return: str = full REST output text.
     '''
     restin = self.getStr('RestIn')
     if rje.exists(restin) and not self.force():
         with open(restin,'r') as RESTIN: return RESTIN.read()
     try: jobid = self.dict['Output']['jobid']
     except KeyError: jobid = None
     rparts = ['%s\n' % self.dict['Output']['intro']]
     for rkey in self.list['RestKeys']:
         rparts.append('###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n')
         rparts.append('# %s: %s\n' % (rkey,self.dict['Outfile'][rkey]))
         #?# Q. Why only if jobid? Is it not always good to replace with content?
         rfile = None
         if jobid: rfile = string.split(self.dict['Output'][rkey],'\n')[0]
         if rfile is None or not rje.exists(rfile):     ### Content given directly
             rparts.append('%s\n' % self.dict['Output'][rkey])
             continue
         ### File given instead of content
         nbytes = os.path.getsize(rfile)
         if nbytes > maxparsesize > 0:   # Too large to parse
             otext = '%s is too large to return (%s > %s)' % (os.path.basename(rfile),rje.humanByteSize(nbytes),rje.humanByteSize(maxparsesize))
             resturl = '%sretrieve&jobid=%s&rest=%s[&password=X]' % (self.getStr('RestURL'),jobid,rkey)
             # Newline-terminated so that the next section delimiter starts on its own line
             rparts.append('%s in full output. Try %s.\n' % (otext,resturl))
         elif rfile.endswith('.png'):    # Graphics are referenced by file name only
             rparts.append('%s\n' % rfile)
         else:
             with open(rfile,'r') as RFILE: outtxt = RFILE.read()
             if not outtxt.endswith('\n'): outtxt += '\n'
             rparts.append(outtxt)
     return ''.join(rparts)
Пример #3
0
 def sgd2sp(self):   ### Reformats yeast sequence names and outputs new data for GOPHER
     '''
     Reformats yeast sequence names and outputs new data for GOPHER.

     Sequences in self.obj['SeqList'] with SpecCode 'YEAST' are renamed using the 'XRef' database table and UniProt
     entries. Where a cross-referenced UniProt entry has an identical sequence, the first word of the name becomes
     UniprotID__AccNum and the SGD-to-UniProt accession mapping is stored in self.dict['Rename']. The renamed
     sequences are then saved to Basefile.ygob.fas (all sequences) and Basefile.yeast.fas (yeast only) unless those
     files already exist, and the yeast accession list is stored in self.list['YeastSeq'].
     Any error is logged and then re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         inseq = self.obj['SeqList']
         uni = rje_uniprot.UniProt(self.log,self.cmd_list+['datout=None'])
         xref = self.db('XRef')
         self.dict['Rename'] = {}    # Filled below with {SGD accession: UniProt accession}
         ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # Reuse an existing Basefile.dat unless Force is set; otherwise read the XRef UniProt accessions and save them
         ufile = '%s.dat' % self.info['Basefile']
         if os.path.exists(ufile) and not self.opt['Force']: uni.readUniProt(ufile,clear=True,cleardata=False)
         else:
             uni.readUniProt(clear=True,acclist=rje.sortKeys(xref.index('UniProt')),cleardata=False)
             uni.saveUniProt(ufile)
         ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniseq = {}     # {UniProt AccNum: sequence object}
         for entry in uni.entries():
             seq = entry.obj['Sequence']
             uniseq[seq.info['AccNum']] = seq
         self.printLog('\r#USEQ','%s UniProt Sequences extracted (%s Ensembl AccNum)' % (rje.iStr(len(uniseq)), rje.iStr(len(xref.index('UniProt')))))
         ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         yseq = []       # List of YEAST sequence objects
         (sx,stot) = (0.0,inseq.seqNum())    # sx goes up by 100 per sequence, so sx/stot is a percentage
         for seq in inseq.seqs():
             self.progLog('\r#SEQ','Reformatting sequence names: %.2f%%' % (sx/stot)); sx += 100.0
             if seq.info['SpecCode'] != 'YEAST': continue    # Non-yeast sequences keep their names
             yseq.append(seq)
             sgd = seq.info['AccNum']; newname = seq.info['Name']
             try:
                 # Try each XRef entry indexed under this accession in the EnsG field, stopping at the first full match
                 for x in xref.indexEntries('EnsG',sgd):
                     acc = x['UniProt']
                     if acc: newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (seq.info['Name'],x['Gene'],x['EnsG'],x['SGD'],acc)
                     else: newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (seq.info['Name'],x['Gene'],x['EnsG'],x['SGD']); continue
                     if acc not in uniseq: self.printLog('\r#UNIERR','Unable to find UniProt sequence %s (%s)' % (acc,sgd)); continue
                     useq = uniseq[acc]
                     if useq.info['Sequence'] != seq.info['Sequence']: self.printLog('\r#SEQERR','%s sequence <> %s sequence' % (sgd,acc)); continue
                     # Identical sequence: replace the first word of the name with UniprotID__AccNum
                     nsplit = string.split(newname)
                     nsplit[0] = '%s__%s' % (x['UniprotID'],acc)
                     newname = string.join(nsplit)
                     self.dict['Rename'][sgd] = acc
                     break
             except: self.errorLog('%s problem' % sgd)   # Logged only; newname keeps whatever value it had reached
             # If no entry fully matched, newname is the original name or the annotated name from the last entry tried
             seq.info['Name'] = newname
             seq.extractDetails(gnspacc=True)
         self.printLog('\r#SEQ','Reformatting sequence names complete.')
         ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # Existing output files are never overwritten here, even if Force is set
         if not rje.exists('%s.ygob.fas' % self.info['Basefile']):
             inseq.saveFasta(seqfile='%s.ygob.fas' % self.info['Basefile'])
         if not rje.exists('%s.yeast.fas' % self.info['Basefile']):
             inseq.saveFasta(seqs=yseq,seqfile='%s.yeast.fas' % self.info['Basefile'])
         self.list['YeastSeq'] = inseq.accList(yseq)
     except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Log, then pass the error up to the caller
Пример #4
0
    def enrichment(self):   ### Performs final enrichment analysis on SLiMDIP and Random datasets.
        '''
        Final enrichment analysis of the SLiMDIP predictions against randomised PPI datasets.

        The analysis needs the "real" predicted DMI from slimDIP() and the randomised PPI datasets made by
        randomisePPI(). The random datasets are meant to be run through slimDIP() too, giving a background
        distribution for enrichment "p-values" and a summary file for plotting with slimdip.R.

        Requirements:
        - slimdip table (or *.slimdip.tdt output file to load).
        - randbase.XX.tdt files.

        Only the prerequisite checks are implemented so far; the analysis sections themselves are still empty.
        :return: True if all prerequisites are in place; False if a preceding step failed or an error was logged.
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [0a] Check for required data else run preceding step(s) ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Real predicted DMI: use the loaded table, else try to generate/load it with slimDIP()
            if not self.db('slimdip'):
                if not self.slimDIP(): return False
            # Randomised PPI datasets, named randbase.XXX.tdt
            for randx in range(self.getInt('RandPPI')):
                randbase = self.getStr('RandBase')
                randnum = rje.preZero(randx,self.getInt('RandPPI')-1)
                if rje.exists('%s.%s.tdt' % (randbase,randnum)): continue
                # First missing file triggers randomisePPI(); no further files are checked after that
                if self.randomisePPI(): break
                return False

            ### ~ [1] Perform Enrichment Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Perform SLiMDIP analysis of each random PPI dataset ~~~~~~~~~~~~~~~~~~~~~~~~~ ##

            return True
        except:
            self.errorLog('%s.enrichment() error' % self.prog())
            return False
Пример #5
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method. Prepares the RestIn setting for use.

     If RestIn starts with 'http:' it is split on '&' and every OPT=file:FILE element whose FILE exists has the file
     content substituted for 'file:FILE'; any '&' in the substituted element is converted to '+'. A missing FILE
     triggers a warning with the option to quit. Any other RestIn value is passed through rje.makePath().
     :return: True if setup completed; False if an error was caught and logged.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] Check and modify URL if required ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.getStr('RestIn').startswith('http:'):
             #!# Check for rest URL and add if missing
             restcmd = string.split(self.getStr('RestIn'),'&')
             for (i,cmd) in enumerate(restcmd):
                 if '=' not in cmd: continue
                 (opt,value) = string.split(cmd,'=',1)
                 if not value.startswith('file:'): continue
                 # Conversion of cmd=file:FILE into cmd=CONTENT
                 rfile = string.split(value,':',1)[1]
                 #!# Consider adding max size constraint. Probably a URL size limit.
                 if not rje.exists(rfile):
                     self.warnLog('File "%s" not found.' % rfile,quitchoice=True); continue
                 with open(rfile,'r') as RFILE: rlines = RFILE.readlines()
                 # Lines are joined with a literal backslash-n sequence, not a newline character
                 restcmd[i] = '%s=%s' % (opt,rje.chomp(string.join(rlines,'\\n')))
                 if '&' in restcmd[i]:   # '&' would be read as an element separator in the rebuilt URL
                     self.warnLog('%s "&" => "+" conversions for %s.' % (rje.iStr(restcmd[i].count('&')),rfile))
                     restcmd[i] = string.replace(restcmd[i],'&','+')
             self.setStr({'RestIn':string.join(restcmd,'&')})
         ## ~ [1b] Direct Parsing of output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:   # Convert to file
             self.setStr({'RestIn':rje.makePath(self.getStr('RestIn'),True)})
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Пример #6
0
 def restOutput(self,
                outfmt=None,
                maxparsesize=0):  ### Returns rest output for outfmt
     '''
     Returns rest output for outfmt.

     If outfmt is a key of self.dict['Output'], the stored value is returned, unless its first line names an
     existing file, in which case the file content is returned (or a "too large" message if the file exceeds
     maxparsesize). 'parse' and 'format' return the intro output wrapped in <pre> tags; 'default' and 'full' return
     self.restFullOutput(maxparsesize).
     :param outfmt: str [None] = output format/key to return. Falls back on the lower-case Rest setting.
     :param maxparsesize: int [0] = maximum file size (bytes) to return. No limit if 0.
     :return: str = REST output text, or a message if there is no such output.
     '''
     if not outfmt: outfmt = self.getStrLC('Rest')
     if not outfmt: return 'No REST output'
     if outfmt in self.dict['Output']:
         rfile = string.split(self.dict['Output'][outfmt], '\n')[0]
         if not rje.exists(rfile):   # Content stored directly rather than as a file
             return self.dict['Output'][outfmt]
         nbytes = os.path.getsize(rfile)
         if not nbytes > maxparsesize > 0:
             with open(rfile, 'r') as RFILE:
                 return RFILE.read()
         # Too large to parse
         otext = '%s is too large to return (%s)' % (
             os.path.basename(rfile), rje.humanByteSize(nbytes))
         try:
             jobid = self.dict['Output']['jobid']
         except KeyError:
             jobid = None
         resturl = '%sretrieve&jobid=%s&rest=%s[&password=X]' % (
             self.getStr('RestURL'), jobid, outfmt)
         # No retrieval URL is suggested without a jobid, or when this is already the requested Rest output
         if not jobid or outfmt == self.getStrLC('Rest'):
             return 'ERROR: %s' % (otext)
         return '%s in full output. Try %s.' % (otext, resturl)
     elif outfmt in ['parse', 'format']:
         return '<pre>%s</pre>\n\n' % self.restOutput('intro')
     elif outfmt in ['default', 'full']:
         return self.restFullOutput(maxparsesize)
     return 'No %s output generated.' % outfmt
Пример #7
0
 def makePPI(self):  ### Generates files for Human-HIV PPI analysis
     '''
     Generates files for Human-HIV PPI analysis.

     For each HIV accession indexed in the 'HHPIDMap' table, a directory named after the HIV gene is made and, for
     every table entry with that accession, the human PPI fasta file PPIDir/SYMBOL.ppi.fas is copied to
     GENE/gene.SYMBOL.ppi.fas with the HIV sequence added as the first record. Missing PPI files are logged as errors.
     :return: False if no HIV sequences were loaded or an error was caught and logged. Nothing is returned explicitly
         on success. NOTE(review): callers testing the return value will see success as false - confirm intent.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % self.getStr('HIVSeq'),'autoload=T'])
         if not seqlist.seqs(): return False
         seqmap = seqlist.seqNameDic('Max')
         mdb = self.db('HHPIDMap')
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for hivacc in mdb.index('AccHIV'):
             # map HIV accession numbers on to sequences seqNameDic (version suffix after '.' is dropped)
             accnum = string.split(hivacc,'.')[0]
             hivseq = seqmap[accnum]
             # extract short HIV name from sequence ID
             hivgene = string.split(hivseq.shortName(),'_')[0].upper()
             # create directory named after HIV gene
             rje.mkDir(self,'%s/' % hivgene,log=True)
             # copy human PPI files into directories, adding HIV gene
             etot = len(mdb.index('AccHIV')[hivacc])
             for entry in mdb.indexEntries('AccHIV',hivacc):
                 self.progLog('\r#PPI','Generating human-HIV PPI fasta files for %s %s PPI' % (rje.iStr(etot),hivgene))
                 pfile = self.getStr('PPIDir') + entry['Symbol'] + '.ppi.fas'
                 if rje.exists(pfile):
                     # Read the input before creating the output, so a failed read cannot leave a partial file
                     with open(pfile,'r') as PPI: ppifas = PPI.read()
                     with open('%s/%s.%s.ppi.fas' % (hivgene,hivgene.lower(),entry['Symbol']),'w') as FAS:
                         FAS.write('>%s\n%s\n' % (hivseq.info['Name'],hivseq.getSequence()))
                         FAS.write(ppifas)
                 else: self.errorLog('Cannot find human PPI file for %s interactor "%s"' % (entry['HIV'],entry['Symbol']),printerror=False)
             # The summary takes the HIV name from the last entry processed for this accession
             self.printLog('\r#PPI','Generated human-HIV PPI fasta files for %s %s (%s) PPI.' % (rje.iStr(etot),entry['HIV'],hivgene))
     except: self.errorLog('%s.makePPI error' % self); return False
Пример #8
0
 def loadXRef(self):  ### Load Identifier XRef Data
     '''
     Loads the identifier cross-reference data into the database as table "XRef".

     A previously saved Basefile.xref.tdt is loaded directly unless Force is set. Otherwise the XRef file is loaded,
     its known long field headers are renamed to short names, and the table is saved as Basefile.xref.tdt.
     :return: the XRef table (as returned by addTable), or False if the XRef file is not found.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         savefile = '%s.xref.tdt' % self.info['Basefile']
         if rje.exists(savefile) and not self.opt['Force']:
             return self.db().addTable(savefile, mainkeys=['#'], datakeys='All', name='XRef')
         if not rje.checkForFile(self.info['XRef']):
             return False
         # Long source headers and the short field names that replace them
         changehead = {}
         changehead['Ensembl Gene ID'] = 'EnsG'
         changehead['Ensembl Protein ID'] = 'EnsP'
         changehead['Associated Gene Name'] = 'Gene'
         changehead['Associated Gene DB'] = 'GeneDB'
         changehead['UniProt/SwissProt ID'] = 'UniprotID'
         changehead['UniProt/SwissProt Accession'] = 'UniProt'
         changehead['SGD Gene'] = 'SGD'
         ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         xref = self.db().addTable(self.info['XRef'], mainkeys='All', datakeys='All', name='XRef')
         for (oldname, newname) in changehead.items():
             if oldname not in xref.fields():
                 continue
             xref.renameField(oldname, newname)
         xref.saveToFile(savefile)
         return xref
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Log, then pass the error up to the caller
Пример #9
0
 def seqinObj(self, summarise=True, gapstats=True):  ### Returns the SeqList object for the SeqIn file
     '''
     Returns the SeqList object for the SeqIn file, creating and loading it first if self.obj['SeqIn'] is not set.
     :param summarise: bool [True] = whether to add summarise=T dna=T raw=F to the SeqList commands.
     :param gapstats: bool [True] = whether to add the gapstats command. Only applied if force is set or the
         *.gaps.tdt file for the SeqIn base name does not exist.
     :return: self.obj['SeqIn']
     :raise ValueError: any ValueError during loading is logged and re-raised. Other errors are logged only.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.obj['SeqIn']:
             seqbase = rje.baseFile(self.getStr('SeqIn'), strip_path=True)
             # Work on a copy: extending self.cmd_list in place would leak these options into every object
             # subsequently built from the shared command list.
             seqcmd = list(self.cmd_list)
             if summarise: seqcmd += ['summarise=T', 'dna=T', 'raw=F']
             gapstats = gapstats and (
                 self.force() or not rje.exists('%s.gaps.tdt' % seqbase))
             if gapstats: seqcmd += ['gapstats']
             self.obj['SeqIn'] = rje_seqlist.SeqList(
                 self.log,
                 seqcmd + ['autoload=T', 'seqmode=file', 'autofilter=F'])
             # NOTE: a check for pipe "|" characters in sequence names used to run here and is currently disabled.
     except ValueError:
         self.printLog('\r#CHECK', 'Checking sequences names aborted.')
         self.errorLog('DepthCharge input sequence error')
         raise
     except:
         self.errorLog('DepthCharge.seqinObj() error')
     return self.obj['SeqIn']
Пример #10
0
    def enrichment(self):  ### Performs final enrichment analysis on SLiMDIP and Random datasets.
        '''
        Final enrichment analysis of the SLiMDIP predictions against randomised PPI datasets.

        Needs the "real" predicted DMI from slimDIP() plus the randomised PPI datasets from randomisePPI(). The
        random datasets are intended to go through slimDIP() as well, to give a background distribution for
        enrichment "p-values" and a summary output file for making histograms etc. with slimdip.R.

        Requirements:
        - slimdip table (or *.slimdip.tdt output file to load).
        - randbase.XX.tdt files.

        Only the prerequisite checks exist at present; the analysis sections are empty placeholders.
        :return: True if the prerequisites are in place; False if a preceding step failed or an error was logged.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [0a] Check for required data else run preceding step(s) ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # SLiMDIP table of real predicted DMI: fall back on slimDIP() if not already a database table
            have_slimdip = self.db('slimdip') or self.slimDIP()
            if not have_slimdip:
                return False
            # Randomised PPI datasets, named randbase.XXX.tdt: regenerate once at the first missing file
            for rnum in range(self.getInt('RandPPI')):
                fileparts = (self.getStr('RandBase'),
                             rje.preZero(rnum, self.getInt('RandPPI') - 1))
                if rje.exists('%s.%s.tdt' % fileparts):
                    continue
                if self.randomisePPI():
                    break
                return False

            ### ~ [1] Perform Enrichment Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Perform SLiMDIP analysis of each random PPI dataset ~~~~~~~~~~~~~~~~~~~~~~~~~ ##

            return True
        except:
            self.errorLog('%s.enrichment() error' % self.prog())
            return False
Пример #11
0
 def endFork(self, fdict):  ### Ends fork, tidies and sets new one running
     '''
     Ends fork, tidies and sets new one running.

     Any per-fork results files and the fork log are merged into the main outputs, the end of the fork is reported,
     and self.nextFork() is then called whether or not the tidying succeeded.
     :param fdict: dict = fork data. Keys read here: 'ResFile' (list of results file extensions), 'FID' (fork file
         prefix), 'Log' (fork log file), 'cmd' (presence selects partial log transfer) and 'PID' (used in messages).
     :raise IOError: re-raised once the self.int['IOError'] allowance has counted down to 1.
     '''
     try:  ### ~ [1] ~ End and tidy current job ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if 'ResFile' in fdict:
             for resfile in fdict['ResFile']:
                 fromfile = '%s.%s' % (fdict['FID'], resfile)
                 if not rje.exists(fromfile):
                     continue  # Missing fork results are skipped without a warning
                 tofile = '%s.%s' % (self.baseFile(), resfile)
                 if rje.exists(tofile):
                     # Append all but the first line (presumably a header row already present in tofile - confirm)
                     with open(fromfile, 'r') as FROM:
                         reslines = FROM.readlines()[1:]
                     with open(tofile, 'a') as TO:
                         TO.writelines(reslines)
                     os.unlink(fromfile)
                 else:
                     rje.fileTransfer(fromfile, tofile)
         if 'Log' in fdict:
             if 'cmd' in fdict:
                 # Only lines 6 to penultimate of the fork log are copied across
                 with open(fdict['Log'], 'r') as FORKLOG:
                     loglines = FORKLOG.readlines()[5:-1]
                 with open(self.log.info['LogFile'], 'a') as MAINLOG:
                     MAINLOG.writelines(loglines)
                 os.unlink(fdict['Log'])
             else:
                 rje.fileTransfer(fdict['Log'], self.log.info['LogFile'])
             if self.getBool('LogFork'):
                 self.printLog(
                     '#END', 'Fork %s ended: log content transferred' %
                     fdict['PID'])
                 self.printLog('#~~#', '#~~#', timeout=False)
         elif 'PID' in fdict and string.split(
                 '%s' % fdict['PID'])[0] == 'WAIT':
             pass  # No message for WAIT forks
         else:
             # Written to the log only if LogFork is set; shown on screen if LogFork is set or verbosity > 1
             self.printLog('#END',
                           'Fork %s ended.' % fdict['PID'],
                           log=self.getBool('LogFork'),
                           screen=self.getBool('LogFork') or self.v() > 1)
     except IOError:
         # IOErrors are tolerated until the counter reaches 1, then passed up to the caller
         if self.getInt('IOError') == 1:
             self.errorLog('Forker.endFork IOError limit reached')
             raise
         else:
             self.int['IOError'] -= 1
             self.errorLog('Forker.endFork')
     except:
         self.errorLog('Forker.endFork error')
     self.nextFork()  # Carry on regardless
Пример #12
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method.

     Creates the database with an empty 'ProDigIS' table and a 'Source' table (loaded from the Source file if that
     exists, otherwise empty), loads the sequences of every file in self.list['SeqFiles'] into self.obj['SeqList'],
     records the source file base name and protein molecular weight per accession in the 'Source' table, and adds
     the expectation fields to 'ProDigIS' if self._peptideProbabilities() succeeds.
     :return: None on success; False if an error was caught and logged.
         NOTE(review): other setup methods return True on success - confirm whether callers test this value.
     '''
     try:### ~ [1] Setup Database ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
         db = self.db().addEmptyTable('ProDigIS',['AccNum','Protease','PepCount'],['AccNum','Protease'])
         if self.getInt('MinPepLen') > 0: db.addField('MinPepLen')
         if self.getBool('NRPep'): db.addField('NRPep')
         if rje.exists(self.getStr('Source')):
             fdb = self.db().addTable(self.getStr('Source'),mainkeys=['AccNum'],name='Source')
             fdb.addField('File')
             fdb.addField('ProtMWt')
         else: fdb = self.db().addEmptyTable('Source',['AccNum','File','ProtMWt'],['AccNum'])
         # One field per peptide length 1..MaxPepLen, plus one per 100.0 step (100.0..MaxPepLen*100.0) if PepMWt
         for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i)
         if self.getBool('PepMWt'):
             for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i*100.0)
         ### ~ [2] Load Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autoload=F'])
         self.obj['SeqList'].seq = fullseq = []     # Same list object, so additions to fullseq reach the SeqList
         for seqfile in self.list['SeqFiles']:
             filebase = rje.baseFile(seqfile,True)
             seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % seqfile,'autoload=T'])
             fullseq += seqlist.seqs()
             for seq in seqlist.seqs():
                 accnum = seq.getStr('AccNum')
                 try:
                     entry = fdb.data()[accnum]
                     if 'File' in entry and entry['File']: self.errorLog('%s found in %s AND %s!' % (accnum,entry['File'],filebase),printerror=False)
                     entry['File'] = filebase
                     entry['ProtMWt'] = seq.MWt()
                 except:     # Any failure updating an existing entry falls back on adding a new one
                     entry = {'AccNum':accnum,'File':filebase,'ProtMWt':seq.MWt()}
                     fdb.addEntry(entry)
                 self.deBug(fdb.dict['Data'][accnum])
         self.printLog('#SEQ','%s sequences to analyse in total' % rje.iLen(fullseq))
         fdb.fillBlanks()
         ### ~ [3] Setup Peptide Probabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self._peptideProbabilities():
             # The second addField argument is presumably the field to insert after - confirm against rje_db
             db.addField('LenExp','PepCount')
             if self.getBool('PepMWt'): db.addField('MWtExp','LenExp'); db.addField('Len7Exp','MWtExp')
             else: db.addField('Len7Exp','LenExp')
             db.addField('Len37','Len7Exp')
             if self.getBool('PepMWt'):
                 db.addField('Len5','MWtExp'); db.addField('MWt5','Len5')
                 db.addField('Len3','MWtExp'); db.addField('MWt3','Len3')
             else: db.addField('Len5','LenExp'); db.addField('Len3','LenExp')
         # A temporary GABLAM table-merging section used to follow an unconditional return here. It could never
         # run, so it has been dropped.
         return
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Пример #13
0
 def haqBatch(self,force=False): ### Generates Batch and INI files for HAQESAC runs
     '''
     Generates Batch and INI files for HAQESAC runs.

     Writes HaqDir/haqesac.ini (the current commands minus any ini=X command) and HaqDir/haqesac.bat (one
     'python haqesac.py' line per sequence). Nothing is regenerated if both files exist and neither the force
     argument nor the force setting is set. Errors are logged with the option to quit.
     :param force: bool [False] = whether to regenerate the files even if both already exist.
     :return: the result of self.printLog() if existing files are kept; otherwise nothing is returned explicitly.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batfile = rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True)
         inifile = rje.makePath('%shaqesac.ini' % self.info['HaqDir'],wholepath=True)
         # rje.backup() presumably moves or clears any existing copy so the append below starts clean - confirm
         if force or self.force() or not rje.exists(batfile) or not rje.exists(inifile): rje.backup(self,batfile); rje.backup(self,inifile)
         else: return self.printLog('#HAQBAT','HAQESAC Batch files found.')
         ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         haqcmd = [cmd for cmd in self.cmd_list if cmd[:4].lower() != 'ini=']
         if self.opt['MultiHAQ']: haqcmd += ['multihaq=T','force=F']
         with open(inifile,'w') as INI: INI.write(string.join(haqcmd,'\n'))
         ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batlines = []
         for seq in self.seqs():
             acc = seq.info['AccNum']
             seqcmd = ['seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc]
             batlines.append('python %shaqesac.py %s\n' % (self.info['Path'],string.join(seqcmd)))
         if batlines:    # As before, no batch file is created when there are no sequences
             with open(batfile,'a') as BAT: BAT.writelines(batlines)
         self.printLog('#HAQBAT','HAQESAC Batch file output to %s' % batfile)
     except: self.errorLog('Major problem with MultiHAQ.haqBatch',quitchoice=True)
Пример #14
0
 def setup(self):  ### Main class setup method.
     '''
     Main class setup method. Sets up the database, loads the input sequences into a SeqList object, loads the gaps
     into a 'gaps' database table, and checks for (or generates) the PAF file of mapped long reads.
     :return: True if setup completed; False if any error (including missing input) was caught and logged.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log, self.cmd_list)
         ## ~ [1a] Input sequence file ~~ ##
         if not self.getStrLC('SeqIn'):
             raise ValueError('seqin=FILE must be set')
         if not rje.exists(self.getStr('SeqIn')):
             badfile = self.getStr('SeqIn')
             raise IOError('Unable to read seqin=FILE: "{0}"'.format(badfile))
         seqbase = rje.baseFile(self.getStr('SeqIn'), strip_path=True)
         if not self.getStrLC('Basefile'):
             self.baseFile(seqbase)  # Output base name defaults to that of the input
         ## ~ [1b] Gap statistics: only regenerate if the gaps table is missing or force is set ~~ ##
         gapcmd = 'gapstats=T'
         if rje.checkForFiles(filelist=['.gaps.tdt'], basename=seqbase, log=self.log) and not self.force():
             gapcmd = 'gapstats=F'
         self.cmd_list.append(gapcmd)
         self.seqinObj()  # Loads the sequences (returned object is not needed here)
         gapdb = self.db().addTable('%s.gaps.tdt' % seqbase,
                                    mainkeys=['seqname', 'start', 'end'],
                                    name='gaps', ignore=[], expect=True)
         gapdb.dataFormat({'start': 'int', 'end': 'int'})
         ### ~ [2] PAF File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.getStrLC('PAF'):
             self.setStr({'PAF': self.baseFile() + '.paf'})
         pfile = self.getStr('PAF')
         makepaf = self.force() or not rje.exists(pfile)
         if makepaf:
             rje_paf.PAF(self.log, self.cmd_list).longreadMinimapPAF(pfile)
         if rje.exists(self.getStr('PAF')):
             return True
         raise IOError('Unable to read or create PAF file: {0}'.format(pfile))
     except:
         self.errorLog('Problem during %s setup.' % self.prog())
         return False  # Setup failed
Пример #15
0
 def haqBatch(self, force=False):  ### Generates Batch and INI files for HAQESAC runs
     '''
     Generates Batch and INI files for HAQESAC runs.

     Writes HaqDir/haqesac.ini (the current commands minus any ini=X command) and HaqDir/haqesac.bat (one
     'python haqesac.py' line per sequence). Nothing is regenerated if both files exist and neither the force
     argument nor the force setting is set. Errors are logged with the option to quit.
     :param force: bool [False] = whether to regenerate the files even if both already exist.
     :return: the result of self.printLog() if existing files are kept; otherwise nothing is returned explicitly.
     '''
     try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batfile = rje.makePath('%shaqesac.bat' % self.info['HaqDir'],
                                wholepath=True)
         inifile = rje.makePath('%shaqesac.ini' % self.info['HaqDir'],
                                wholepath=True)
         if force or self.force(
         ) or not rje.exists(batfile) or not rje.exists(inifile):
             # rje.backup() presumably moves or clears any existing copy so the append below starts clean - confirm
             rje.backup(self, batfile)
             rje.backup(self, inifile)
         else:
             return self.printLog('#HAQBAT', 'HAQESAC Batch files found.')
         ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         haqcmd = [cmd for cmd in self.cmd_list if cmd[:4].lower() != 'ini=']
         if self.opt['MultiHAQ']: haqcmd += ['multihaq=T', 'force=F']
         with open(inifile, 'w') as INI:
             INI.write(string.join(haqcmd, '\n'))
         ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batlines = []
         for seq in self.seqs():
             acc = seq.info['AccNum']
             seqcmd = [
                 'seqin=%s.fas' % acc,
                 'query=%s' % acc,
                 'basefile=%s' % acc
             ]
             batlines.append('python %shaqesac.py %s\n' %
                             (self.info['Path'], string.join(seqcmd)))
         if batlines:  # As before, no batch file is created when there are no sequences
             with open(batfile, 'a') as BAT:
                 BAT.writelines(batlines)
         self.printLog('#HAQBAT',
                       'HAQESAC Batch file output to %s' % batfile)
     except:
         self.errorLog('Major problem with MultiHAQ.haqBatch',
                       quitchoice=True)
Пример #16
0
 def makePPI(self):  ### Generates files for Human-HIV PPI analysis
     '''
     Generates files for Human-HIV PPI analysis.

     For each HIV accession in the 'AccHIV' index of the HHPIDMap table, a directory named after the HIV gene is
     made. For every mapped entry with an existing human PPI fasta file, a combined fasta file (the HIV sequence
     followed by the content of the human PPI file) is written to <GENE>/<gene>.<Symbol>.ppi.fas.
     << False if no HIV sequences are loaded or an error is logged. Otherwise None (there is no explicit success value).
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(
             self.log, self.cmd_list +
             ['seqin=%s' % self.getStr('HIVSeq'), 'autoload=T'])
         if not seqlist.seqs(): return False
         seqmap = seqlist.seqNameDic('Max')
         mdb = self.db('HHPIDMap')
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for hivacc in mdb.index('AccHIV'):
             # Map the HIV accession (text before the first '.') onto its sequence object.
             # A missing accession raises KeyError, which is handled by the outer except.
             accnum = hivacc.split('.')[0]
             hivseq = seqmap[accnum]
             # Short HIV gene name = upper-cased text before the first '_' of the sequence short name
             hivgene = hivseq.shortName().split('_')[0].upper()
             # Create the directory named after the HIV gene
             rje.mkDir(self, '%s/' % hivgene, log=True)
             # Copy human PPI files into the gene directory, prefixing each with the HIV sequence
             etot = len(mdb.index('AccHIV')[hivacc])
             for entry in mdb.indexEntries('AccHIV', hivacc):
                 self.progLog(
                     '\r#PPI',
                     'Generating human-HIV PPI fasta files for %s %s PPI' %
                     (rje.iStr(etot), hivgene))
                 pfile = self.getStr(
                     'PPIDir') + entry['Symbol'] + '.ppi.fas'
                 if rje.exists(pfile):
                     outfile = '%s/%s.%s.ppi.fas' % (
                         hivgene, hivgene.lower(), entry['Symbol'])
                     # Context managers guarantee both handles are closed even if a read/write fails
                     with open(outfile, 'w') as FAS:
                         FAS.write('>%s\n%s\n' %
                                   (hivseq.info['Name'], hivseq.getSequence()))
                         with open(pfile, 'r') as PPI:
                             FAS.write(PPI.read())
                 else:
                     self.errorLog(
                         'Cannot find human PPI file for %s interactor "%s"'
                         % (entry['HIV'], entry['Symbol']),
                         printerror=False)
             # NOTE: entry is the last entry processed for this accession (its 'HIV' field labels the summary)
             self.printLog(
                 '\r#PPI',
                 'Generated human-HIV PPI fasta files for %s %s (%s) PPI.' %
                 (rje.iStr(etot), entry['HIV'], hivgene))
     except:
         self.errorLog('%s.makePPI error' % self)
         return False
Пример #17
0
 def loadXRef(self):     ### Load Identifier XRef Data
     '''
     Load Identifier XRef Data.

     A previously saved <Basefile>.xref.tdt table is reused unless the Force option is set. Otherwise the file named
     by info['XRef'] is loaded, known long headers are renamed to short field names, and the table is saved.
     << XRef table object, or False if the XRef source file check fails. Errors are logged and re-raised.
     '''
     try:### ~ [1] Reuse saved table where possible ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         savefile = '%s.xref.tdt' % self.info['Basefile']
         if rje.exists(savefile) and not self.opt['Force']:
             return self.db().addTable(savefile,mainkeys=['#'],datakeys='All',name='XRef')
         if not rje.checkForFile(self.info['XRef']):
             return False
         ### ~ [2] Load raw data and standardise field names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         xref = self.db().addTable(self.info['XRef'],mainkeys='All',datakeys='All',name='XRef')
         shortnames = {
             'Ensembl Gene ID':'EnsG',
             'Ensembl Protein ID':'EnsP',
             'Associated Gene Name':'Gene',
             'Associated Gene DB':'GeneDB',
             'UniProt/SwissProt ID':'UniprotID',
             'UniProt/SwissProt Accession':'UniProt',
             'SGD Gene':'SGD',
         }
         for longname, shortname in shortnames.items():
             # Field list is re-queried each time because renaming alters it
             if longname in xref.fields():
                 xref.renameField(longname,shortname)
         xref.saveToFile(savefile)
         return xref
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Пример #18
0
 def depthChargeForker(self):  ### Main DepthCharge forking method
     '''
     Work through each sequence and fork it out for DepthCharge analysis.

     With force=T any existing <basefile>.depthcharge.tdt results file is backed up first. With force=F, sequences
     whose short names are returned by the results table lookup on type 'all' are dropped from the fork list.
     << ddb: table loaded from the results file. None is returned implicitly if an error is logged.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqin = self.seqinObj()
         self.list['ToFork'] = seqin.list['Seq'][0:]    # Work on a copy of the sequence list
         resfile = '{0}.depthcharge.tdt'.format(self.baseFile())
         # rje.backup() takes the calling object as its first argument (as in the other rje.backup(self, ...) calls)
         if self.force(): rje.backup(self, resfile, appendable=False)
         elif rje.exists(resfile):
             ddb = self.db().addTable(resfile,
                                      ['seqname', 'start', 'end', 'type'])
             ddb.dataFormat({'start': 'int', 'end': 'int'})
             complete = ddb.indexDataList('type', 'all', 'seqname')
             if complete:
                 cx = 0
                 # Iterate over a copy so that removal from the live list is safe
                 for seq in self.list['ToFork'][0:]:
                     if seqin.shortName(seq) in complete:
                         self.list['ToFork'].remove(seq)
                         cx += 1
                 if cx:
                     self.printLog(
                         '#SKIP',
                         'Skipping {0} previously processed sequences (force=F)'
                         .format(rje.iStr(cx)))
             if not self.list['ToFork']:
                 self.printLog(
                     '#CHARGE',
                     'All sequences previously processed (force=F)')
                 return ddb
         # Start the initial batch of forks, up to the 'Forks' limit
         while len(self.list['Forked']) < self.getNum(
                 'Forks') and self.list['ToFork']:
             self.nextFork()
         ### ~ [2] ~ Work through each sequence and fork out ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.forking()
         self.printLog('#FORK',
                       'Forking of %s jobs completed.' %
                       (rje.iStr(seqin.seqNum())),
                       log=self.getBool('LogFork'))
         # Reload the results table (replace=True is passed to addTable for the 'force=F' case where it was loaded above)
         ddb = self.db().addTable(resfile,
                                  ['seqname', 'start', 'end', 'type'],
                                  replace=True)
         ddb.dataFormat({'start': 'int', 'end': 'int'})
         return ddb
     except:
         self.errorLog('%s.depthChargeForker error' % self.prog())
Пример #19
0
 def parse(self):    ### Parse REST file into dictionaries
     '''
     Parse REST file into dictionaries.

     The REST text is read from the RestIn file or, when RestIn is purely numeric, retrieved from RestURL as a job ID.
     It is split on the '###~~~...~~~###' divider line: the first chunk provides the intro, program and job ID, and
     each later chunk must start with a '# key: [file]' header line. Results go into self.dict['Output'],
     self.dict['Outfile'] and self.list['RestKeys'].
     << True if parsed; False if an error was logged. In PureAPI mode with a numeric RestIn, the raw retrieved text
        is returned instead.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['RestKeys'] = []
         rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
         if rje.exists(self.getStr('RestIn')):
             # Context manager so the input handle is always closed
             with open(self.getStr('RestIn'),'r') as RESTIN: restin = RESTIN.read()
         elif rje.matchExp(r'^(\d+)$',self.getStr('RestIn')):
             url = '%sretrieve&jobid=%s&password=%s' % (self.getStr('RestURL'),self.getStr('RestIn'),self.getStr('Password'))
             if self.getBool('PureAPI') and self.getStrLC('Rest'): url += '&rest=%s' % (self.getStr('Rest'))
             else: url += '&rest=full'
             restin = urllib2.urlopen(url).read()
             if self.getBool('PureAPI'): return restin
         else: raise IOError('%s not found!' % self.getStr('RestIn'))
         jobid = None
         ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for restdata in restin.split('###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'):
             if not jobid:   # First chunk = introductory text with program name and job ID
                 self.dict['Output']['intro'] = restdata
                 prog = rje.matchExp(r'Output for (\S+)',restdata)[0]
                 self.dict['Output']['prog'] = prog
                 jobid = rje.matchExp(r'JobID: (\d+)',restdata)[0]
                 self.dict['Output']['jobid'] = jobid
                 if not self.getStrLC('RestBase'): rbase = '%s%s' % (self.getStr('RestOutDir'),jobid)
                 self.dict['Outfile']['jobid'] =  '%s.jobid' % (rbase)
                 continue
             restlines = restdata.split('\n')
             rparse = restlines.pop(0).split()
             # Header must be '# key: [file]'. The length check stops an empty or truncated header line from
             # raising IndexError and aborting the whole parse.
             if len(rparse) < 2 or rparse[0] != '#' or rparse[1][-1] != ':':
                 self.errorLog('REST output format error: %s' % ' '.join(rparse),printerror=False); continue
             rkey = rparse[1][:-1]
             # Output file name comes from the optional third header element, else from the key itself
             try:
                 rfile = '%s.%s' % (rbase,rje.baseFile(rparse[2],strip_path=True,keepext=True))
             except Exception: rfile = ''
             if not rfile: rfile = '%s.%s' % (rbase,rkey)
             rfile = rfile.replace('%s.%s.' % (jobid,jobid),'%s.' % jobid)   # Avoid doubled job ID in file names
             self.dict['Output'][rkey] = '\n'.join(restlines)
             self.dict['Outfile'][rkey] = rfile
             self.list['RestKeys'].append(rkey)
         self.printLog('#PARSE','Parsed %s: %d REST outputs.' % (self.getStr('RestIn'),len(self.dict['Output'])))
         return True
     except: self.errorLog('%s.parse error' % self); return False
Пример #20
0
 def restOutput(self,outfmt=None,maxparsesize=0,asjson=False):    ### Returns rest output for outfmt
     '''
     Returns rest output for outfmt.
     >> outfmt:str [None] = output key to return. Defaults to the lower-case 'Rest' setting.
     >> maxparsesize:int [0] = if > 0, files larger than this number of bytes are not returned.
     >> asjson:bool [False] = passed to self.jsonText(); delimited (comma/tab) files become a JSON list of rows.
     << output text for outfmt, or a message (via self.jsonText()) if there is nothing to return.
     '''
     if not outfmt: outfmt = self.getStrLC('Rest')
     # Return was missing here: the message was built, discarded, and a different one returned further down
     if not outfmt: return self.jsonText('No REST output',asjson)
     if outfmt in self.dict['Output']:
         # First line of the stored output is tested as a possible file name
         rfile = self.dict['Output'][outfmt].split('\n')[0]
         if rje.exists(rfile):
             fext = rfile.split('.')[-1]
             if fext in ['png']:
                 self.debug(rfile)
                 # NOTE(review): result is discarded and processing continues with the file content below.
                 # This looks like a missing return, but is left unchanged in case callers rely on it - confirm.
                 self.jsonText(rfile,asjson)
             nbytes = os.path.getsize(rfile)
             if nbytes > maxparsesize > 0:   # Too large to parse
                 otext = '%s is too large to return (%s > %s)' % (os.path.basename(rfile),rje.humanByteSize(nbytes),rje.humanByteSize(maxparsesize))
                 try: jobid = self.dict['Output']['jobid']
                 except: jobid = None
                 resturl = '%sretrieve&jobid=%s&rest=%s[&password=X]' % (self.getStr('RestURL'),jobid,outfmt)
                 if not jobid or outfmt == self.getStrLC('Rest'): return self.jsonText('ERROR: %s' % (otext),asjson)
                 else: return self.jsonText('%s in full output. Try %s.' % (otext,resturl),asjson)
             else:
                 delimit = rje.delimitFromExt(filename=rfile,write=False)
                 if asjson and delimit in [',','\t']:
                     # One JSON list per delimited line; the handle is closed on exit from the with block
                     with open(rfile,'r') as RFILE:
                         jtext = [json.dumps(rje.readDelimit(rline,delimit)) for rline in RFILE]
                     return '[%s]' % ',\n        '.join(jtext)
                 #!# Add json parsing of fasta files?
                 else:
                     with open(rfile,'r') as RFILE: outtxt = RFILE.read()
                     if not outtxt.endswith('\n'): outtxt += '\n'
                     return self.jsonText(outtxt,asjson)
         elif asjson and outfmt in self.dict['Outfile']:
             pass    #!# Sort out json formatting here based on file extension!
         return self.dict['Output'][outfmt]
     elif outfmt in ['parse','format']:
         intro = '<pre>%s</pre>\n\n' % self.restOutput('intro')
         return self.jsonText(intro,asjson)
     elif outfmt in ['default','full']: return self.jsonText(self.restFullOutput(maxparsesize),asjson)
     elif outfmt in ['restkeys','outputs']: return '\n'.join(self.list['RestKeys']+[''])
     return self.jsonText('No %s output generated.' % outfmt,asjson)
Пример #21
0
 def rmdKnit(self, rmdfile, document='html', stdout=False):  ### Knit Rmd to HTML/PDF file
     '''
     Knit Rmd to HTML/PDF file by calling rmarkdown::render() through Rscript.
     >> rmdfile:str = R markdown file to knit
     >> document:str ['html'] = type of document to knit into
     >> stdout:bool [False] = whether to run via os.system() rather than capturing output, even when self.v() < 2
     << success:bool = whether output is generated
     '''
     try:  ### ~ [1] Build and run the Rscript command ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         knitted = '%s.%s' % (rje.baseFile(rmdfile), document)
         command = 'Rscript -e \'library(rmarkdown); rmarkdown::render("%s", "%s_document")\'' % (rmdfile, document)
         self.printLog('#RCMD', command)
         command += ' 2>&1'   # stderr is merged into stdout after the command has been logged
         quiet = self.v() < 2 and not stdout
         if quiet:
             os.popen(command).read()   # Output is read and discarded
         else:
             self.progLog('#RCMD', 'Knitting %s...' % (rmdfile))
             os.system(command)
         ### ~ [2] Check for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not rje.exists(knitted):
             self.printLog('#SYS', 'If pandoc error, try setting global variable: export RSTUDIO_PANDOC=/Applications/RStudio.app/Contents/MacOS/pandoc')
             self.printLog('#SYS', 'If no pandoc error, check that required libraries in %s are installed' % rmdfile)
             raise IOError('%s not created' % knitted)
         self.printLog('#RCMD', '%s generated from %s' % (knitted, rmdfile))
         return True
     except:
         self.errorLog('%s.rmdKnit error: check R installation' % self.prog())
         return False
Пример #22
0
 def exonerate(self,qryfas, genome, model,exonerate='exonerate',bestn=0):
     '''
     Runs exonerate and parses output into lists for processing.
     { query: {'gff':[outputlines], 'cigar':[outputlines], 'alignment':[outputlines], 'vulgar':[[headerlist], {header:value}, {header:value}, ...] }
     >> qryfas, genome = passed as the first two arguments to the exonerate command
     >> model = exonerate model (added as --model if set). Also used as the extension of the memsaver output file.
     >> exonerate:str ['exonerate'] = exonerate command to call
     >> bestn:int [0] = added as --bestn if non-zero
     << query_dic = dictionary of parsed output per query (format given above). Errors are logged and re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         EXFILE = None
         exfile = '%s.%s' % (self.baseFile(),model)  # Used in memsaver mode
         query_dic = {}
         header_list = ['query_id', 'query_start', 'query_end', 'query_strand', 'target_id', 'target_start', 'target_end', 'target_strand', 'score', '<label, query_length, target_length> triplets']
         excmd = [exonerate, qryfas, genome, '--showtargetgff', '--showcigar']
         if model: excmd += ['--model', model]
         if bestn: excmd += ['--bestn', '%d' % bestn]
         if self.getStrLC('ExOpt'): excmd += self.getStr('ExOpt').split()
         self.printLog('#RUN',' '.join(excmd))
         extext = []
         if self.getBool('MemSaver'):
             gzfile = '%s.gz' % exfile
             if rje.exists(gzfile): self.gUnzip(gzfile)
             if rje.exists(exfile) and not self.force():
                 self.printLog('#EXFILE','Found %s (force=F). Assuming complete.' % exfile)
             else:
                 rje.backup(self,exfile)
                 self.printLog('#SAVER','memsaver=T: Exonerate output directed to %s.' % exfile)
                 EXFILE = open(exfile,'w')
                 # Close the output handle even if the call fails (non-zero exit status raises IOError)
                 try:
                     if subprocess.call(excmd, stdout=EXFILE): raise IOError('Exonerate call did not complete!')
                 finally: EXFILE.close()
                 self.printLog('#EXFILE','%s generated.' % exfile)
             EXFILE = open(exfile,'r')
         else:
             extext = Popen(excmd, stdout=PIPE).stdout.readlines()
         ### ~ [2] Parse output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         output_format = ''
         try:
             # Lines come from the memsaver file if open, else from the captured output. Iterating directly avoids
             # the repeated pop(0) on a list, which is quadratic for large outputs.
             for rawline in (EXFILE or extext):
                 line = rje.chomp(rawline)
                 if line:
                     if line.startswith('         Query:'):
                         query = line.split(':', 1)[1].split(' ')[1]
                     # NB. A Query line is not consumed above: it continues through the chain below and is stored as
                     # a line of the current output format, which is where a new query entry gets created.
                     if line == 'C4 Alignment:':
                         output_format = 'alignment'
                     elif line == '# --- START OF GFF DUMP ---':
                         output_format = 'gff'
                     elif line.startswith('vulgar:'):
                         output_format = 'vulgar'
                         fields = line.split(' ', 10)[1:]
                         # First vulgar element is the header list; each hit is then a {header:value} dictionary
                         if output_format in query_dic[query]:
                             query_dic[query][output_format].append({})
                         else:
                             query_dic[query][output_format] = [header_list, {}]
                         for header, field in zip(header_list, fields):
                             query_dic[query][output_format][-1][header] = field
                     elif line.startswith('cigar:'):
                         output_format = 'cigar'
                         if output_format in query_dic[query]:
                             query_dic[query][output_format].append(line.replace('cigar: ', ''))
                         else:
                             query_dic[query][output_format] = [line.replace('cigar: ', '')]
                     elif line == '------------' or line.startswith('Command line:') or line.startswith('Hostname:') or line == '# --- END OF GFF DUMP ---' or line == '#' or line.startswith('-- completed exonerate analysis'):
                         pass    # Separator and run information lines are not stored
                     elif output_format:
                         if query in query_dic:
                             if output_format in query_dic[query]:
                                 query_dic[query][output_format].append(line)
                             else:
                                 query_dic[query][output_format] = [line]
                         else:
                             query_dic[query] = {output_format:[line]}
                 elif output_format == 'alignment':
                     # Blank lines are kept within alignments; skipped if no alignment list exists yet for the query
                     try: query_dic[query][output_format].append(line)
                     except (KeyError, NameError): pass
                 self.vPrint(line,v=1)
         finally:
             if EXFILE: EXFILE.close()   # Input handle is closed even if parsing raises
         ### ~ [3] Tidy memsaver file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if EXFILE:
             if self.getBool('Cleanup'):
                 os.unlink(exfile)
                 self.printLog('#CLEAN','%s deleted.' % exfile)
             elif self.getBool('GZip'): self.gZip(exfile)
         return query_dic
     except: self.errorLog('%s.exonerate error' % self.prog()); raise
Пример #23
0
 def multiHAQ(self,secondrun=False):     ### Executes main HAQESAC runs
     '''
     Executes main HAQESAC runs.
     >> secondrun:bool [False] = whether this is the second pass. The first pass calls this method again with
        secondrun=True unless it is itself the final run (i.e. unless the MultiHAQ option is off).
     << None. Errors are logged with a quit choice.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         finalrun = secondrun == self.opt['MultiHAQ']    # Whether this is the manual HAQESAC phase
         qryacc = self.obj['SeqList'].accList()          # Full list of Query accession numbers
         processed = []                                  # List of processed sequence accession numbers
         ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in self.seqs():
             ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             acc = seq.info['AccNum']
             if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >=0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                 self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue
             ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
             logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True)
             infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
             pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
             pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
             if not os.path.exists(infile): self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue
             if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile:
                 self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
             if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile:
                 self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
             inseqx = rje_seq.SeqCount(self,infile)
             if inseqx < 2: self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue
             ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
             tm = 0      # Minutes to sleep: increases by one with each failed check
             while secondrun and self.opt['Chaser'] and not pickled:
                 self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                 time.sleep(60*tm); tm += 1
                 pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                 if not pickled:
                     try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                     except:     # Bare except is deliberate: <CTRL+C> (KeyboardInterrupt) must be caught here
                         # Message has one placeholder: formatting with (acc,tm) raised TypeError
                         self.printLog('#PICKLE','No %s pickle.' % acc)
                         self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return
             ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             runhaqesac = True
             pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
             if not self.force() and rje.exists(pngfile):
                 self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                 runhaqesac = False
             ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
             if not self.force() and rje.exists(ancfile):
                 self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                 runhaqesac = False
             #if not finalrun or self.opt['Force'] or rje.isYounger(logfile,nsfile) != logfile:
             if runhaqesac:
                 haqcmd = ['ini=haqesac.ini','seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc, 'newlog=F']
                 self.printLog('#HAQ','Running HAQESAC for %s - will have own log etc.' % seq.shortName(),log=False)
                 os.chdir(self.info['HaqDir'])       # HAQESAC is run from within the HAQESAC directory
                 info = haqesac.makeInfo()
                 haqcmd = rje.getCmdList(haqcmd,info=info)
                 out = rje.Out(cmd_list=haqcmd)    # Sets up Out object for controlling output to screen
                 out.printIntro(info)                                # Prints intro text using details from Info object
                 haqlog = rje.setLog(info,out,haqcmd)                 # Sets up Log object for controlling log file output
                 try: haqesac.HAQESAC(log=haqlog, cmd_list=haqcmd).run(setobjects=True)
                 except:
                     # Return to the run directory before asking whether to abort
                     os.chdir(self.info['RunPath'])
                     if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()): raise KeyboardInterrupt
                 os.chdir(self.info['RunPath'])
                 if finalrun: self.printLog('#HAQ','HAQESAC final round run for %s' % seq.shortName())
                 else: self.printLog('#HAQ','HAQESAC first round run for %s' % seq.shortName())
             ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if not self.opt['ScreenQry'] or not finalrun: continue
             qacclist = []   # Other query accession numbers found in this input file
             for qacc in rje_seq.SeqList(self.log,['seqin=%s' % infile,'autoload=T','autofilter=F']).accList():
                 if qacc in qryacc and qacc != acc: qacclist.append(qacc)
                 if qacc in qryacc and qacc not in processed: processed.append(qacc)
             self.printLog('#QRY','%d other queries found in %s: [%s]' % (len(qacclist),infile,'; '.join(qacclist)))
             self.printLog('#QRY','%d of %d queries processed' % (len(processed),self.seqNum()))
         ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not finalrun: self.printLog('#MULTI','Executing second round of multiHAQ'); self.multiHAQ(True)
     except: self.errorLog('Major problem with MultiHAQ.multiHAQ',quitchoice=True)
Пример #24
0
    def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
        '''
        Uses SLiMFarmer to farm out the HAQESAC runs.

        The haqesac.bat batch file in the HAQESAC directory is given to SLiMFarmer as its subjobs. The same farming
        command is run once for a complete run, or twice (first and second rounds) if the MultiHAQ option is set.
        << True if farming completes. None is returned implicitly if an error is logged (with a quit choice).
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            batfile = os.path.abspath(rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True))
            self.printLog('#FARM',batfile)
            if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
            # Absolute paths are used because the working directory is changed before farming
            farmcmd = ['subjobs=%s' % batfile,'farm=batch','qsub=F','i=-1','runpath=%s' % os.path.abspath(self.info['HaqDir'])]
            if self.opt['MultiHAQ']:
                haqfarm = ['First round','Second round']
            else: haqfarm = ['Complete run']

            ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for farmrun in haqfarm:
                self.printLog('#CHDIR','Changing directory for %s farming: %s' % (farmrun,self.info['HaqDir']))
                os.chdir(self.info['HaqDir'])
                farmer = slimfarmer.SLiMFarmer(self.log,self.cmd_list+farmcmd)
                farmer.slimFarm()
                os.chdir(self.info['RunPath'])
                self.printLog('#CHDIR','Changed directory post-farming: %s' % self.info['RunPath'])
                self.printLog('#FARM','HAQESAC %s farming complete.' % farmrun)
            #!# Add identifying and skipping of partial runs.
            return True
        except:
            # Make sure the run directory is restored if farming fails part-way through
            os.chdir(self.info['RunPath'])
            self.errorLog('Major problem with MultiHAQ.farmHAQ',quitchoice=True)
Пример #25
0
    def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
        '''Uses SLiMFarmer to farm out the HAQESAC runs.'''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            batfile = os.path.abspath(
                rje.makePath('%shaqesac.bat' % self.info['HaqDir'],
                             wholepath=True))
            self.printLog('#FARM', batfile)
            if not rje.exists(batfile):
                raise IOError('Cannot find %s' % batfile)
            farmcmd = [
                'subjobs=%s' % batfile, 'farm=batch', 'qsub=F', 'i=-1',
                'runpath=%s' % os.path.abspath(self.info['HaqDir'])
            ]
            if self.opt['MultiHAQ']:
                haqfarm = ['First round', 'Second round']
            else:
                haqfarm = ['Complete run']

            ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for farmrun in haqfarm:
                self.printLog(
                    '#CHDIR', 'Changing directory for %s farming: %s' %
                    (farmrun, self.info['HaqDir']))
                os.chdir(self.info['HaqDir'])
                farmer = slimfarmer.SLiMFarmer(self.log,
                                               self.cmd_list + farmcmd)
                farmer.slimFarm()
                os.chdir(self.info['RunPath'])
                self.printLog(
                    '#CHDIR', 'Changed directory post-farming: %s' %
                    self.info['RunPath'])
                self.printLog('#FARM',
                              'HAQESAC %s farming complete.' % farmrun)
            return True

            #!# Add identifying and skipping of partial runs.

            for seq in self.seqs():
                ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                acc = seq.info['AccNum']
                if finalrun and acc in processed and (
                        self.opt['AutoSkip'] or (self.i() >= 0 and rje.yesNo(
                            '%s already covered by previous HAQESAC. Skip?' %
                            seq.shortName()))):
                    self.printLog(
                        '#SKIP',
                        '%s already covered by previous HAQESAC: Skipped' %
                        seq.shortName())
                    continue
                ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
                logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'], acc),
                                       wholepath=True)
                infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], acc),
                                      wholepath=True)
                pkfile = rje.makePath('%s%s.pickle' %
                                      (self.info['HaqDir'], acc),
                                      wholepath=True)
                pkzfile = rje.makePath('%s%s.pickle.gz' %
                                       (self.info['HaqDir'], acc),
                                       wholepath=True)
                if not os.path.exists(infile):
                    self.printLog(
                        '#SKIP', '%s input file %s not found: Skipped' %
                        (seq.shortName(), infile))
                    continue
                if not finalrun and not self.opt['Force'] and rje.isYounger(
                        pkzfile, infile) == pkzfile:
                    self.printLog('#SKIP',
                                  '%s run detected: Skipped' % seq.shortName())
                    continue
                if not finalrun and not self.opt['Force'] and rje.isYounger(
                        pkfile, infile) == pkfile:
                    self.printLog('#SKIP',
                                  '%s run detected: Skipped' % seq.shortName())
                    continue
                inseqx = rje_seq.SeqCount(self, infile)
                if inseqx < 2:
                    self.printLog(
                        '#SKIP',
                        'Only one sequence found in %s: Skipped' % (infile))
                    continue
                ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                pickled = os.path.exists(pkfile) or os.path.exists(
                    '%s.gz' % pkfile)
                tm = 0
                while secondrun and self.opt['Chaser'] and not pickled:
                    self.progLog(
                        '#WAIT',
                        'No %s pickle. Sleeping for %d min.' % (acc, tm))
                    time.sleep(60 * tm)
                    tm += 1
                    pickled = os.path.exists(pkfile) or os.path.exists(
                        '%s.gz' % pkfile)
                    if not pickled:
                        try:
                            rje.choice(
                                'Press <ENTER> to try again, or <CTRL+C> to Quit'
                            )
                        except:
                            self.printLog('#PICKLE',
                                          'No %s pickle.' % (acc, tm))
                            self.printLog('\r#MULTI',
                                          'Exiting multiHAQ "Chaser" run.')
                            return
                ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                runhaqesac = True
                pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'], acc),
                                       wholepath=True)
                if not self.force() and rje.exists(pngfile):
                    self.printLog(
                        '#SKIP',
                        'Found evidence of completed run: %s (force=F). Skipping.'
                        % pngfile)
                    runhaqesac = False
                ancfile = rje.makePath('%s%s.anc.fas' %
                                       (self.info['HaqDir'], acc),
                                       wholepath=True)
                if not self.force() and rje.exists(ancfile):
                    self.printLog(
                        '#SKIP',
                        'Found evidence of completed run: %s (force=F). Skipping.'
                        % ancfile)
                    runhaqesac = False

        except:
            os.chdir(self.info['RunPath'])
            self.errorLog('Major problem with MultiHAQ.farmHAQ',
                          quitchoice=True)
Пример #26
0
 def run(self):  ### Main run method
     '''
     Main run method: standardises the taxon names of an SPF table and saves it compressed to each taxonomic level.

     The table is loaded from the 'InFile' setting (the user is prompted for another name while the file does not
     exist). Level_1 to Level_7 of every entry are rewritten in place, the modified table is saved, and the table
     is then repeatedly compressed on a shrinking list of level fields, saving one *.SPF.<level>.<letter>.spf file
     per level.

     Returns None, or the result of the '#QUIT' printLog call if a blank file name is entered at the prompt.
     Any exception is logged with errorLog and then re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         infile = self.getStr('InFile')
         while not rje.exists(infile):
             infile = rje.choice('File "%s" not found. Input file name? (Blank to quit):' % infile)
             if not infile: return self.printLog('#QUIT','Execution terminated!')
         db = rje_db.Database(self.log,self.cmd_list)
         db.basefile(rje.baseFile(infile))
         sdb = db.addTable(infile,mainkeys='#',delimit='\t',name='SPF.Mod')
         # Level fields in rank order (defined once) and the one-letter prefix used for each, e.g. Level_2 -> 'p__...'
         lvlfields = ['Level_%d' % i for i in range(1,8)]
         levels = {'Level_1':'k','Level_2':'p','Level_3':'c','Level_4':'o','Level_5':'f','Level_6':'g','Level_7':'s'}
         # Example input rows:
         # k__Bacteria	p__Proteobacteria	c__Alphaproteobacteria	o__Rhodospirillales	f__Rhodospirillaceae	g__	s__	denovo44
         # Unassigned	unclassified	unclassified	unclassified	unclassified	unclassified	unclassified	denovo49
         ### ~ [1] Modify Text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         dupnames = []   # (name, parent name) pairs that are identical after stripping the 3-character prefix
         parents = {}    # {taxon name: {parent name: name to use for the taxon under that parent}}
         renamed = []    # Names that needed a numeric suffix; the count of a name gives its next suffix
         ex = 0.0; etot = sdb.entryNum()
         for entry in sdb.entries():
             self.progLog('\r#SPF','Modifying SPF content: %.1f%%' % (ex/etot)); ex += 100.0
             taxon = ''      # Final name given to the previous (higher) level of this entry
             parent = ''     # Same value, tracked separately for the parents lookup
             for lvl in lvlfields:
                 # str methods replace the deprecated string-module functions (which do not exist in Python 3)
                 entry[lvl] = entry[lvl].replace('unidentified','unclassified')
                 null = '%s__' % levels[lvl]
                 unassigned = [null,'Unassigned','unclassified','%sunclassified' % null,'%sunidentified' % null,
                               '%sunculturedfungus' % null,'%sIncertae_sedis' % null,'%sunclassified_sp.' % null]
                 if entry[lvl] in unassigned:
                     # Placeholder names inherit the name of the level above so that they stay distinguishable
                     if not taxon or taxon.endswith('unclassified'): entry[lvl] = '%sunclassified' % null
                     elif taxon.endswith('unassigned)'): entry[lvl] = '%s%s;%s-unassigned)' % (null,taxon[3:][:-1],levels[lvl])
                     else: entry[lvl] = '%s%s(%s-unassigned)' % (null,taxon[3:],levels[lvl])
                 if entry[lvl] in parents:
                     if parent in parents[entry[lvl]]: entry[lvl] = parents[entry[lvl]][parent]
                     else:
                         # Same name seen under a different parent: give it a numbered name and remember the mapping
                         self.bugPrint(entry[lvl])
                         self.bugPrint(parents[entry[lvl]])
                         renamed.append(entry[lvl])
                         newtax = '%s%d' % (entry[lvl],renamed.count(entry[lvl]))
                         self.warnLog('%s had multiple parents (%s & %s) -> %s' % (entry[lvl],'|'.join(parents[entry[lvl]]),parent,newtax))
                         parents[newtax] = {parent:newtax}
                         parents[entry[lvl]][parent] = newtax
                         entry[lvl] = newtax
                         self.deBug(parents[entry[lvl]])
                 elif parent: parents[entry[lvl]] = {parent:entry[lvl]}
                 parent = entry[lvl]
                 if entry[lvl][3:] == taxon[3:]:
                     if (entry[lvl],taxon) not in dupnames: dupnames.append((entry[lvl],taxon))
                 taxon = entry[lvl]
         self.printLog('\r#SPF','Modifying SPF content complete.')
         dupnames.sort()
         for (dupA,dupB) in dupnames: self.warnLog('Duplicate taxa names: %s & %s' % (dupA,dupB))
         ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb.saveToFile(savefields=sdb.list['Fields'][1:])
         ### ~ [3] Compress to different taxonomic levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         compress = lvlfields + ['#']
         dump = compress.pop(-1)     # Field dropped after each compression: '#' first, then the lowest remaining level
         rules = {'Observation Ids':'list',dump:'str'}
         sdb.dropField('Observation Ids')
         while compress:
             sdb.compress(compress,rules=rules,default='sum',best=[],joinchar='|')
             sdb.dropField(dump)
             sdb.saveToFile('%s.SPF.%s.%s.spf' % (rje.baseFile(infile),compress[-1],levels[compress[-1]]))
             dump = compress.pop(-1); rules[dump] = 'list'
         return
     except:
         self.errorLog(self.zen())
         raise   # Delete this if method error not terrible
Пример #27
0
 def multiHAQ(self, secondrun=False):  ### Executes main HAQESAC runs
     '''
     Executes main HAQESAC runs.

     Loops over self.seqs() and, for each sequence whose <HaqDir><AccNum>.fas input exists and has at least two
     sequences, runs HAQESAC inside HaqDir unless a previous run is detected. When this pass is not the final run
     the method calls itself once more with secondrun=True.

     secondrun : bool [False] = Whether this is the second pass. The pass counts as the final run when
         secondrun equals self.opt['MultiHAQ'].

     Returns None. Errors are reported through self.errorLog(..., quitchoice=True).
     '''
     try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         finalrun = secondrun == self.opt['MultiHAQ']  # Whether this is the manual HAQESAC phase
         qryacc = self.obj['SeqList'].accList()  # Full list of Query accession numbers
         processed = []  # List of processed sequence accession numbers
         ### ~ [1] Peform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in self.seqs():
             ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             acc = seq.info['AccNum']
             if finalrun and acc in processed and (
                     self.opt['AutoSkip'] or (self.i() >= 0 and rje.yesNo(
                         '%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                 self.printLog('#SKIP', '%s already covered by previous HAQESAC: Skipped' % seq.shortName())
                 continue
             ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
             haqdir = self.info['HaqDir']
             infile = rje.makePath('%s%s.fas' % (haqdir, acc), wholepath=True)
             pkfile = rje.makePath('%s%s.pickle' % (haqdir, acc), wholepath=True)
             pkzfile = rje.makePath('%s%s.pickle.gz' % (haqdir, acc), wholepath=True)
             if not os.path.exists(infile):
                 self.printLog('#SKIP', '%s input file %s not found: Skipped' % (seq.shortName(), infile))
                 continue
             if not finalrun and not self.opt['Force']:
                 # A pickle reported by isYounger in preference to the input is taken as evidence of an earlier
                 # run (presumably isYounger returns the newer file - confirm against rje); gzipped one first.
                 if rje.isYounger(pkzfile, infile) == pkzfile or rje.isYounger(pkfile, infile) == pkfile:
                     self.printLog('#SKIP', '%s run detected: Skipped' % seq.shortName())
                     continue
             inseqx = rje_seq.SeqCount(self, infile)
             if inseqx < 2:
                 self.printLog('#SKIP', 'Only one sequence found in %s: Skipped' % (infile))
                 continue
             ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
             tm = 0  # Minutes to sleep; grows by one each time round (the first check does not sleep)
             while secondrun and self.opt['Chaser'] and not pickled:
                 self.progLog('#WAIT', 'No %s pickle. Sleeping for %d min.' % (acc, tm))
                 time.sleep(60 * tm)
                 tm += 1
                 pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                 if not pickled:
                     try:
                         rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                     except:  # Deliberately broad: CTRL+C (KeyboardInterrupt) is the advertised way to quit
                         # Fixed: the message has a single placeholder, so it must be given only acc.
                         # The original '% (acc, tm)' raised TypeError here instead of exiting cleanly.
                         self.printLog('#PICKLE', 'No %s pickle.' % acc)
                         self.printLog('\r#MULTI', 'Exiting multiHAQ "Chaser" run.')
                         return
             ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             runhaqesac = True
             # Either output file existing is treated as evidence of a completed run unless force is set
             for ext in ['png', 'anc.fas']:
                 donefile = rje.makePath('%s%s.%s' % (self.info['HaqDir'], acc, ext), wholepath=True)
                 if not self.force() and rje.exists(donefile):
                     self.printLog('#SKIP', 'Found evidence of completed run: %s (force=F). Skipping.' % donefile)
                     runhaqesac = False
             if runhaqesac:
                 haqcmd = [
                     'ini=haqesac.ini',
                     'seqin=%s.fas' % acc,
                     'query=%s' % acc,
                     'basefile=%s' % acc, 'newlog=F'
                 ]
                 self.printLog('#HAQ', 'Running HAQESAC for %s - will have own log etc.' % seq.shortName(),
                               log=False)
                 os.chdir(self.info['HaqDir'])
                 try:
                     info = haqesac.makeInfo()
                     haqcmd = rje.getCmdList(haqcmd, info=info)
                     out = rje.Out(cmd_list=haqcmd)  # Sets up Out object for controlling output to screen
                     out.printIntro(info)  # Prints intro text using details from Info object
                     haqlog = rje.setLog(info, out, haqcmd)  # Sets up Log object for controlling log file output
                     try:
                         haqesac.HAQESAC(log=haqlog, cmd_list=haqcmd).run(setobjects=True)
                     except:  # Best effort: a failed run only stops the loop if the user chooses to abort
                         os.chdir(self.info['RunPath'])
                         if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()):
                             raise KeyboardInterrupt
                 finally:
                     # Always return to the run directory, including when the HAQESAC setup itself fails
                     os.chdir(self.info['RunPath'])
                 if finalrun:
                     self.printLog('#HAQ', 'HAQESAC final round run for %s' % seq.shortName())
                 else:
                     self.printLog('#HAQ', 'HAQESAC first round run for %s' % seq.shortName())
             ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if not self.opt['ScreenQry'] or not finalrun: continue
             qacclist = []   # Other query accession numbers found in this sequence's input file
             for qacc in rje_seq.SeqList(
                     self.log, ['seqin=%s' % infile, 'autoload=T', 'autofilter=F']).accList():
                 if qacc in qryacc and qacc != acc: qacclist.append(qacc)
                 if qacc in qryacc and qacc not in processed:
                     processed.append(qacc)
             self.printLog('#QRY', '%d other queries found in %s: [%s]' %
                           (len(qacclist), infile, '; '.join(qacclist)))
             self.printLog('#QRY', '%d of %d queries processed' % (len(processed), self.seqNum()))
         ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not finalrun:
             self.printLog('#MULTI', 'Executing second round of multiHAQ')
             self.multiHAQ(True)
     except:
         self.errorLog('Major problem with MultiHAQ.multiHAQ',
                       quitchoice=True)
Пример #28
0
    def qsub(self):  ### Creates job and calls with qsub
        '''
        Creates job and calls with qsub. Returns qsub job ID or 0 if jobwait=True and job completed.

        Builds the PBS job script from the object's settings, writes it to <job>.job, assembles the qsub command
        (with afterany dependencies if self.list['Depend'] is set), submits it and logs the output of
        `qstat -T` (or `showstart` on katana). If 'JobWait' is set, qstat is polled until the <job>.o<id> output
        file appears or the job disappears from the queue.

        Returns:
            - the result of self.report() if the 'Report' option is set;
            - False in test mode (job file written but not submitted) or if an error is caught and logged;
            - 0 if 'JobWait' is set and the last line of the output file contains 'Job complete';
            - otherwise the job ID (text before the first '.' of the qsub output).
        '''
        try:  ### Basics ###
            hr = int(self.stat['Walltime'])
            mins = int((0.5 + (self.stat['Walltime'] - hr) * 60.0))
            if mins >= 60:  # Rounding up (e.g. 1.995 hours) must carry into the hour rather than give H:60:00
                hr += 1
                mins -= 60
            if self.opt['Report']: return self.report()
            jobstr = ('%s.job' % self.info['Job']).replace('.job', '')
            jlist = [
                '#!/bin/bash',
                '#PBS -N %s' % jobstr,  #,'#PBS -q batch',
                '#PBS -l nodes=%d:ppn=%d' % (self.stat['Nodes'], self.stat['PPN']),
                '#PBS -l walltime=%d:%s:00' % (hr, rje.preZero(mins, 60)),
                '#PBS -l vmem=%dgb' % self.getInt('VMem'),
                '#PBS -l mem=%dgb' % self.getInt('VMem'),
                ''
            ]
            if self.getBool('Monitor'):
                if self.getBool('JobWait'):
                    self.warnLog('Cannot run with wait=T and monitor=T: switched monitor=F')
                    self.setBool({'Monitor': False})
                else:
                    jlist += ['#PBS -k oed']
            if self.getStr('Email'):
                jlist += ['#PBS -M %s' % self.getStr('Email'), '#PBS -m ae']
                if self.getBool('MailStart'): jlist[-1] = '#PBS -m bae'
            jlist += [
                '### Define number of processors',
                'NPROCS=`wc -l < $PBS_NODEFILE`',
                'echo Running on host `hostname`',
                'echo Time is `date`',
                'echo Directory is `pwd`',
                'echo This jobs runs on the following processors:',
                'echo `cat $PBS_NODEFILE`',
                '',
                'echo This job has allocated $NPROCS cpus',
                ''
            ]
            self.printLog('#PPN', '%d Node(s) requested: %d PPN.' % (self.getInt('Nodes'), self.getInt('PPN')))
            self.printLog('#VMEM', '%s GB VMem requested.' % (self.getStat('VMem')))
            if self.getBool('ModPurge'):
                jlist.append('module purge')
                self.printLog('#MOD', 'Modules purged (modpurge=T)')
            for mod in self.list['Modules']:
                if mod.lower() not in ['', 'none']:
                    jlist.append('module add %s' % mod)
            if self.list['Modules']:
                self.printLog('#MOD', 'Modules added: %s' % '; '.join(self.list['Modules']))
            for pcall in self.list['PreCall']:
                self.printLog('#PCALL', pcall)
                jlist.append(pcall)
            ### Directory & Program ###
            jlist.append('cd %s' % self.info['QPath'])
            pcall = self.info['Program']
            if self.opt['RjePy']:
                pcall = 'python ' + self.info['PyPath'] + pcall
            jlist.append(pcall)
            ### Completion message
            jlist += ['', 'echo ---', 'qstat -f $PBS_JOBID', 'echo ---']
            # NOTE: 'Job complete' must remain the final line: the JobWait check reads the last line of the output
            jlist += ['', 'echo', 'echo Time is `date`', 'echo Job complete']
            ### Output and call ###
            job = '{0}.job'.format(jobstr)
            # Close (and therefore flush) the job file explicitly before qsub is asked to read it
            with open(job, 'w') as jobfile:
                jobfile.write('\n'.join(jlist))
            self.printLog('#DIR', self.info['QPath'])
            self.printLog('#RUN', pcall)
            qcmd = 'qsub'
            if self.getBool('StartBash'): qcmd += ' -S /bin/bash'
            if self.list['Depend']:
                qcmd += ' -W depend=afterany'
                myhost = self.getStr('DependHPC')
                if not self.getStrLC('DependHPC'):
                    myhost = os.popen('hostname').read().split()[0]
                for depid in self.list['Depend']:
                    qcmd += ':%s.%s' % (depid, myhost)
            qcmd += ' %s' % (job)
            self.printLog('#JOB', qcmd)
            if self.test():
                self.printLog('#TEST', 'Test mode: will not place job in queue.')
                self.verbose(0, 1, '\n'.join(['>>>>>'] + jlist + ['<<<<<', '']))
                return False
            qrun = os.popen(qcmd).read()
            self.printLog('#QSUB', qrun)
            # Job ID = text before the first '.'; stripped so that output without a '.' does not keep its newline
            qid = qrun.split('.')[0].strip()
            showstart = 'qstat -T'
            if os.popen('hostname').read().startswith('katana.science.unsw.edu.au'):
                showstart = 'showstart'
            self.printLog('#SHOW',
                          'Attempt %s %s in %s sec' % (showstart, qrun, self.stat['Pause']),
                          log=False)
            time.sleep(self.stat['Pause'])
            for qline in os.popen('%s %s' % (showstart, qrun)):
                if rje.chomp(qline):
                    self.printLog('#INFO', qline, timeout=False)

            ### Wait for job to be completed
            if self.getBool('JobWait'):
                if self.getBool('Monitor'):
                    raise ValueError('Cannot run with wait=T and monitor=T')
                self.printLog('#WAIT', 'Waiting for job {0} to finish'.format(qid))
                ofile = '{0}.o{1}'.format(jobstr, qid)
                running = False     # Set once the job has been seen in state R, so it is only logged once
                while not rje.exists(ofile):
                    jobx = int(os.popen("qstat | grep '^{0}' -c".format(qid)).read().split()[0])
                    if not jobx:
                        self.printLog('#QSTAT', 'Job {0} disappeared from qstat'.format(qid))
                        break
                    elif not running:
                        try:
                            # Fixed: compare the 5th qstat column itself with 'R'. The original wrapped it in
                            # string.split(), giving a list that never equalled 'R', so running was never set.
                            status = os.popen("qstat | grep '^{0}'".format(qid)).read().split()[4]
                            if status == 'R':
                                running = True
                                self.printLog('#QSTAT', 'Job {0} running...'.format(qid))
                        except (IndexError, IOError, OSError):
                            pass    # Best effort only: the job may have left the queue between the two qstat calls
                    time.sleep(max(1, self.getInt('Pause')))
                # Allow up to 300 further seconds for the output file to appear
                owait = 300
                while owait and not rje.exists(ofile):
                    owait -= 1
                    time.sleep(1)
                if rje.exists(ofile):
                    if 'Job complete' in os.popen('tail -n 1 {0}'.format(ofile)).read():
                        self.printLog('#DONE', '{0} job ({1}) complete.'.format(jobstr, qid))
                        return 0
                    else:
                        self.printLog('#FAIL', '{0} job ({1}) failed to finish.'.format(jobstr, qid))
                        return qid
                else:
                    self.printLog('#FAIL',
                                  '{0} job ({1}) failed to generate {2}.'.format(jobstr, qid, ofile))

            return qid
        except:
            self.errorLog('Error in qsub()')
            return False
Пример #29
0
 def _positiveAndNegativePeptides(self): ### Populates PosPep and NegPep Lists
     '''
     Populates PosPep and NegPep Lists.

     Digests the sequences of the 'Positives' file with the 'PepCut' protease and splits the resulting unique
     peptides into those listed in the 'Peptides' file (self.list['PosPep']) and the rest (self.list['NegPep']).
     Peptides produced more than once when digesting self.obj['SeqList'] are recorded in self.list['Redundant']
     and flagged NR='N'. A 'Peptides' table is built and saved to <basefile>.peptides.tdt, and the two peptide
     lists are written to <basefile>.positives.fas and <basefile>.negatives.fas.

     Results are always regenerated: an existing <basefile>.peptides.tdt is not reloaded.

     Returns the Peptides table, False if either the 'Peptides' or 'Positives' file is missing, or None if an
     error was caught and logged.
     '''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pfile = '%s.peptides.tdt' % self.basefile()
         if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
         self.list['Peptides'] = peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
         seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
         pdb = self.db().addEmptyTable('Peptides',['Peptide','NR','Pos','Len','MWt','C','HPW','DENQ','M','Hyd'],['Peptide'])
         protease = self.getStr('PepCut')

         def digest(sequence):   ### Returns list of fragments (may include '') from cutting sequence with protease
             # Each cut pattern marks its cleavage point with ':'; the matching residues are replaced by the
             # pattern itself so that splitting on ':' afterwards yields the fragments.
             for cut in proteases[protease]: sequence = cut.join(sequence.split(cut.replace(':','')))
             return sequence.split(':')

         ### ~ [1] ~ Digest Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['PosPep'] = poslist = []; self.list['NegPep'] = neglist = []
         posseen = set()     # Set mirror of poslist: constant-time duplicate checks while keeping first-seen order
         sx = 0.0; stot = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
             frag = [pep for pep in digest(seq.getSequence()) if pep]
             if not self.getBool('NTerm'): frag = frag[1:]   # Drop the first fragment unless NTerm is set
             for pep in frag:
                 if pep not in posseen: posseen.add(pep); poslist.append(pep)
         self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
         ## ~ [1b] ~ Peptide Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.list['Redundant'] = redundant = []     # One item per repeat occurrence of a fragment
         allpep = set()                              # Every fragment seen so far
         sx = 0.0; stot = self.obj['SeqList'].seqNum()
         for seq in self.obj['SeqList'].seqs():
             self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (protease,sx/stot)); sx += 100.0
             for frag in digest(seq.getSequence()):
                 if frag in allpep: redundant.append(frag)
                 else: allpep.add(frag)
         self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (protease,rje.iStr(stot)))
         ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         pepset = set(peplist)       # assumes loadFromFile returns a list of strings (hashable) - TODO confirm
         redset = set(redundant)
         positives = []              # Peptides that stay in poslist; the others move to neglist
         px = 0.0; ptot = len(poslist)
         for pep in poslist:
             self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
             entry = {'Peptide':pep,'MWt':rje_sequence.MWt(pep),'Hyd':rje_sequence.eisenbergHydropathy(pep,returnlist=False),
                      'Len':len(pep),'NR':'Y','Pos':'Y'}
             if pep in pepset: positives.append(pep)
             else: neglist.append(pep); entry['Pos'] = 'N'
             if pep in redset: entry['NR'] = 'N'
             for aacomb in ['C','HPW','DENQ','M']:   # Combined count of each listed group of amino acids
                 entry[aacomb] = sum(pep.count(a) for a in aacomb)
             pdb.addEntry(entry)
         poslist[:] = positives      # In place, so that self.list['PosPep'] (the same list object) is updated too
         self.printLog('\r#PEP','Processing positive peptides (%s) complete: %s Pos; %s Neg.' % (protease,rje.iLen(poslist),rje.iLen(neglist)))
         ### ~ [2] ~ Save Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pdb.saveToFile(pfile)
         # Context managers close the FASTA files even if a write fails
         with open('%s.positives.fas' % self.basefile(),'w') as POS:
             for pep in poslist: POS.write('>%s\n%s\n' % (pep,pep))
         self.printLog('#FAS','%s peptides output to %s.positives.fas' % (rje.iLen(poslist),self.basefile()))
         with open('%s.negatives.fas' % self.basefile(),'w') as NEG:
             for pep in neglist: NEG.write('>%s\n%s\n' % (pep,pep))
         self.printLog('#FAS','%s peptides output to %s.negatives.fas' % (rje.iLen(neglist),self.basefile()))
         return pdb
     except: self.errorLog('Problem during %s._positiveAndNegativePeptides().' % self); return None  # Setup failed
Пример #30
0
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
         seqcmd = self.cmd_list + ['autoload=T','seqmode=file','seqindex=T']
         dfile = '%s.data.tdt' % self.basefile()
         ### ~ [2] Load Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.dict['SeqList']['full'] = rje_seqlist.SeqList(self.log,seqcmd + ['seqmode=list'])
         self.debug(self.dict['SeqList']['full'].seqNum())
         if self.dict['SeqList']['full'].seqNum(): return
         self.dict['SeqList']['full'] = rje_seqlist.SeqList(self.log,seqcmd + ['seqin=%s.full.fas' % (self.basefile()),'seqmode=list'])
         for stype in ['CDS','gene','prot']:
             seq = self.dict['SeqList'][stype] = rje_seqlist.SeqList(self.log,seqcmd + ['seqin=%s.%s.fas' % (self.basefile(),stype)])
             seq.dict['SeqDict'] = {}
             for s in seq.list['Seq']:
                 (name,sequence) = seq.getSeq(s)
                 seq.dict['SeqDict'][string.split(string.split(name)[0],'_')[-1]] = s
         ### ~ [3] Database Compilation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if rje.exists(dfile) and not self.getBool('Force'): db.addTable(dfile,name='data',mainkeys=['tag'])
         else:
             ## ~ [3a] ~ Load part tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             fdb = db.addTable('%s.function.tdt' % self.basefile(),name='function',mainkeys=['tag'])
             fdb.dropField('description')
             edb = db.addTable('%s.expression.tdt' % self.basefile(),name='expression',mainkeys=['key'])
             nx = 0
             edb.fillBlanks(blank='0',fillempty=True)
             for ekey in rje.sortKeys(edb.data()):
                 entry = edb.data(ekey)
                 for field in edb.fields():
                     if entry[field] == 'na': entry[field] = '0.0'; nx += 1
             self.printLog('#TDT','Updated %s entries for expression table' % rje.iStr(nx))
             kdb = db.addTable('%s.proteinkey.tdt' % self.basefile(),name='proteinkey',mainkeys=['key'])
             xdb = db.addTable('%s.dbxref.tdt' % self.basefile(),name='dbxref',mainkeys=['tag'])
             xdb.dropField('gene')   # Pull from genbank instead
             #pdb = db.addTable('%s.cysweight.tdt' % self.basefile(),name='cysweight',mainkeys=['AccNum'])
             pdb = db.addTable('%s.protein.tdt' % self.basefile(),name='prodigis',mainkeys=['AccNum'])
             pdb.addField('NRPep5','NRPep',0); pdb.addField('NRPep7','NRPep5',0)
             for x in range(5,51):
                 xfield = '%d' % x
                 if xfield not in pdb.fields(): continue
                 for entry in pdb.entries():
                     entry['NRPep5'] += int(entry[xfield])
                     if x >= 7: entry['NRPep7'] += int(entry[xfield])
             for field in pdb.fields()[0:]:
                 if field not in ['AccNum','File','ProtMWt','PepCount','LenExp','Len3','Len5','Len7Exp','Len37','NRPep','NRPep5','NRPep7','Cys0']: pdb.dropField(field)
             #pdb.renameField('AccNum','uniprot')
             #pdb.newKey(['uniprot'])
             pdb.renameField('AccNum','tag')
             pdb.newKey(['tag'])
             mdb = db.addTable('%s.PNASmaintable.tdt' % self.basefile(),name='main',mainkeys=['tag'])
             tdb = db.addTable('%s.tmhmm.tdt' % self.basefile(),name='TMHMM',mainkeys=['acc_num'])
             ## ~ [3b] ~ Load and process features table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             gdb = db.addTable('%s.Feature.tdt' % self.basefile(),name='feature',mainkeys=['locus','feature','position'])
             gdb.dropEntriesDirect('feature',['CDS'],inverse=True)
             gdb.list['Fields'] += ['tag','start','end','gene','product']
             for entry in gdb.entries():
                 pos = rje.matchExp('(\d+)\.\.(\d+)',entry['position'])
                 if entry['position'][:4] == 'comp': entry['start'] = pos[1]; entry['end'] = pos[0]
                 else: entry['start'] = pos[0]; entry['end'] = pos[1]
                 try: entry['tag'] = rje.matchExp('locus_tag="(\S+)"',entry['details'])[0]
                 except: entry['tag'] = '-'
                 try: entry['gene'] = rje.matchExp('gene="(\S+)"',entry['details'])[0]
                 except: entry['gene'] = ''
                 try: entry['product'] = string.split(string.split(entry['details'],'/product="')[1],'"')[0]
                 except: entry['product'] = ''
             gdb.dropEntriesDirect('tag',['-'])
             gdb.newKey(['tag'])
             for field in ['locus','feature','position','details']: gdb.dropField(field)
             ## ~ [3c] ~ Codon Bias Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             cfile = '%s.CDS.Bias.tdt' % self.basefile()
             if not rje.exists(cfile) or self.getBool('Force'):
                 rje_codons.Codons(self.log,self.cmd_list+['seqin=%s.CDS.fas' % self.basefile(),'backups=F']).run()
             bdb = db.addTable(cfile,name='Bias',mainkeys=['Seq'])
             bdb.renameField('Len','AALen')
             ndb = db.addTable('%s.CDS.NT.tdt' % self.basefile(),name='NT',mainkeys=['Seq'])
             ndb.renameField('Len','NTLen')
             for field in ndb.fields():
                 if field != string.replace(field,'U','T'): ndb.renameField(field,string.replace(field,'U','T'))
             ## ~ [3d] ~ Join tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             temp = db.joinTables(name='temp',join=[(edb,'key'),(kdb,'key')],newkey=['key'],cleanup=True,keeptable=True)
             #pfields = pdb.fields()[0:]
             #pfields.remove('uniprot')
             #temp2 = db.joinTables(name='temp2',join=[(xdb,'uniprot'),(pdb,'uniprot',pfields)],newkey=['tag'],cleanup=True,keeptable=True)
             #data = db.joinTables(name='data',join=[(temp2,'tag'),(fdb,'tag'),(gdb,'tag'),(bdb,'Seq'),(ndb,'Seq'),(temp,'tag'),(mdb,'tag')],newkey=['tag'],cleanup=True,keeptable=True)
             data = db.joinTables(name='data',join=[(pdb,'tag'),(xdb,'tag'),(fdb,'tag'),(tdb,'acc_num'),(gdb,'tag'),(bdb,'Seq'),(ndb,'Seq'),(temp,'tag'),(mdb,'tag')],newkey=['tag'],cleanup=True,keeptable=True)
             data.dropField('Seq')
             ## ~ [3e] ~ Fill out data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             data.fillBlanks(blank='0.0',fields=['eb','rb'],fillempty=True)
             #for entry in data.entries():
             #    if entry['tag'] not in self.dict['SeqList']['CDS'].dict['SeqDict']: entry['function'] = 'Non-CDS'
             data.fillBlanks(blank='Unassigned',fields=['function'],fillempty=True)
             data.fillBlanks()
             data.fillBlanks(blank='no mapping',fields=['description'],fillempty=True)
             data.saveToFile(dfile)
             allfields = data.list['Fields'][0:]
             data.list['Fields'] = ["tag","File","PepCount","LenExp","Len3","Len5","Len7Exp","Len37","NRPep",'NRPep5','NRPep7',"Cys0",
                                    "pi","mass","function","new_function","tm","start","end","AALen","Bias",
                                    "WtBias","AbsBias",'NTLen','C','A','G','T','C|3','A|3','G|3','T|3',
                                    'eb_1.1','eb_1.2','eb_2.1','eb_2.2','rb_1.1','rb_1.2','rb_2.1','rb_2.2','eb','rb']
             data.saveToFile('%s.cutdata.tdt' % self.basefile())
             data.list['Fields'] = allfields
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Пример #31
0
 def sgd2sp(self):  ### Reformats yeast sequence names and outputs new data for GOPHER
     '''
     Reformats yeast sequence names and outputs new data for GOPHER.

     Reads (or extracts and saves) the UniProt entries indexed in the XRef table, renames every YEAST sequence
     held in self.obj['SeqList'] using its cross-reference entries, then saves *.ygob.fas (all sequences) and
     *.yeast.fas (YEAST sequences only) unless those files already exist.
     Sets self.dict['Rename'] (sequence AccNum -> matched UniProt accession) and self.list['YeastSeq'].
     Any error is logged and then re-raised.
     '''
     try:
         ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         inseq = self.obj['SeqList']
         uniprot = rje_uniprot.UniProt(self.log, self.cmd_list + ['datout=None'])
         xref = self.db('XRef')
         self.dict['Rename'] = {}
         ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         datfile = '%s.dat' % self.info['Basefile']
         if not os.path.exists(datfile) or self.opt['Force']:
             # No usable *.dat file (or forced): extract by accession and keep a copy for next time
             uniprot.readUniProt(clear=True,
                                 acclist=rje.sortKeys(xref.index('UniProt')),
                                 cleardata=False)
             uniprot.saveUniProt(datfile)
         else:
             uniprot.readUniProt(datfile, clear=True, cleardata=False)
         ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniseq = {}     # UniProt AccNum -> UniProt sequence object
         for uentry in uniprot.entries():
             useq = uentry.obj['Sequence']
             uniseq[useq.info['AccNum']] = useq
         extracted = rje.iStr(len(uniseq))
         indexed = rje.iStr(len(xref.index('UniProt')))
         self.printLog('\r#USEQ', '%s UniProt Sequences extracted (%s Ensembl AccNum)' % (extracted, indexed))
         ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         yseq = []  # List of YEAST sequence objects
         stot = inseq.seqNum()
         for sx, seq in enumerate(inseq.seqs()):
             self.progLog('\r#SEQ', 'Reformatting sequence names: %.2f%%' % (100.0 * sx / stot))
             if seq.info['SpecCode'] != 'YEAST':
                 continue
             yseq.append(seq)
             sgd = seq.info['AccNum']
             newname = seq.info['Name']
             try:
                 for x in xref.indexEntries('EnsG', sgd):
                     acc = x['UniProt']
                     if not acc:
                         # No UniProt accession for this cross-reference: annotate and try the next one
                         newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (
                             seq.info['Name'], x['Gene'], x['EnsG'], x['SGD'])
                         continue
                     newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (
                         seq.info['Name'], x['Gene'], x['EnsG'], x['SGD'], acc)
                     if acc not in uniseq:
                         self.printLog('\r#UNIERR', 'Unable to find UniProt sequence %s (%s)' % (acc, sgd))
                         continue
                     if uniseq[acc].info['Sequence'] != seq.info['Sequence']:
                         self.printLog('\r#SEQERR', '%s sequence <> %s sequence' % (sgd, acc))
                         continue
                     # Identical sequence found: swap the first word of the name for UniprotID__AccNum
                     words = newname.split()
                     words[0] = '%s__%s' % (x['UniprotID'], acc)
                     newname = ' '.join(words)
                     self.dict['Rename'][sgd] = acc
                     break
             except:
                 self.errorLog('%s problem' % sgd)
             seq.info['Name'] = newname
             seq.extractDetails(gnspacc=True)
         self.printLog('\r#SEQ', 'Reformatting sequence names complete.')
         ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         ygobfile = '%s.ygob.fas' % self.info['Basefile']
         if not rje.exists(ygobfile):
             inseq.saveFasta(seqfile='%s.ygob.fas' % self.info['Basefile'])
         yeastfile = '%s.yeast.fas' % self.info['Basefile']
         if not rje.exists(yeastfile):
             inseq.saveFasta(seqs=yseq, seqfile='%s.yeast.fas' % self.info['Basefile'])
         self.list['YeastSeq'] = inseq.accList(yseq)
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Пример #32
0
 def _peptideProbabilities(self):    ### Read in peptides and positives and calculate probability of return
     '''
     Read in peptides and positives and calculate probability of return.

     In CysWeight mode this hands over to self._cysteinePeptideProbabilities(). Otherwise the positive proteins
     are digested with the PepCut protease and each unique peptide is tallied as Positive (listed in the Peptides
     file) or Negative, binned by peptide length and, when PepMWt is set, by mass rounded up to the next 100.
     >> Returns the PepProb table (reloaded from *.pep_prob.tdt if present and not Force), False if the Peptides
        or Positives file is missing, or None if an error was logged.
     '''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getBool('CysWeight'): return self._cysteinePeptideProbabilities()
         self._positiveAndNegativePeptides()
         #return self.printLog('#NOPROB','Probability calculation temporarily suspended')
         pfile = '%s.pep_prob.tdt' % self.basefile()
         if rje.exists(pfile) and not self.getBool('Force'):
             try:
                 pdb = self.db().addTable(pfile,['PepSize'],name='PepProb')
                 pdb.dataFormat(reformat={'PepSize':'num','Positive':'int','Negative':'int','Prob':'num'})
                 for entry in pdb.entries():
                     # Sizes below 100 are peptide lengths (int); larger values are the 100.0-scaled mass bins
                     if entry['PepSize'] < 100: entry['PepSize'] = int(entry['PepSize'])
                 return pdb
             except: pass    # Deliberate best-effort reload: on any problem fall through and regenerate
         pdb = self.db().addEmptyTable('PepProb',['PepSize','Positive','Negative','Prob'],['PepSize'])
         if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
         ## ~ [0a] ~ Load Peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
         pepset = set(peplist)   # Constant-time membership tests instead of scanning the list per peptide
         ## ~ [0b] ~ Load Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
         ### ~ [1] ~ Digest Positives and Update PepProb Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         protease = self.getStr('PepCut')
         maxlen = self.getInt('MaxPepLen')
         pepmwt = self.getBool('PepMWt')
         nterm = self.getBool('NTerm')
         nrpep = self.getBool('NRPep')
         ## ~ [1a] ~ Create new database entry to fill with data ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # NOTE: length bins use int keys 1..MaxPepLen and mass bins use float keys i*100.0 in the same dictionary;
         # 100 == 100.0 as a dictionary key, so the two sets of bins only stay distinct while MaxPepLen < 100.
         edict = {}
         for i in range(1,maxlen+1):
             edict[i] = pdb.addEntry({'PepSize':i,'Positive':0,'Negative':0,'Prob':1.0})
             if pepmwt: edict[i*100.0] = pdb.addEntry({'PepSize':i*100.0,'Positive':0,'Negative':0,'Prob':1.0})
         ## ~ [1b] ~ For each recognition site of each protease, mark cuts with ":" ~~~~~~~~ ##
         poslist = []    # Unique peptides in order of first appearance
         seen = set()    # Same peptides, for fast duplicate checks
         sx = 0.0; stot = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
             sequence = seq.getSequence()
             # Each cut pattern carries a ":" at the cleavage point: replace the bare motif with the marked one
             for cut in proteases[protease]: sequence = cut.join(sequence.split(cut.replace(':','')))
             frag = [pep for pep in sequence.split(':') if pep]  # Drop empty fragments
             if not nterm: frag = frag[1:]   # Skip the first fragment unless NTerm is set
             for pep in frag:
                 if nrpep and pep in self.list['Redundant']: continue
                 if pep not in seen: seen.add(pep); poslist.append(pep)
         self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
         ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         px = 0.0; ptot = len(poslist)
         for pep in poslist:
             self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
             if pep in pepset: tally = 'Positive'
             else: tally = 'Negative'
             plen = min(len(pep),maxlen)     # Peptides longer than MaxPepLen share the final length bin
             edict[plen][tally] += 1
             if pepmwt:
                 pwt = 100.0 * min(int((rje_sequence.MWt(pep)+99)/100.0),maxlen)
                 edict[pwt][tally] += 1
         self.printLog('\r#PEP','Processing positive peptides (%s) complete.' % protease)
         ## ~ [1d] # Calculate peptide probabilities for protease combo ~~~~~~~~~~~~~~~~~~~~ ##
         for entry in edict.values():
             total = entry['Positive'] + entry['Negative']
             if total: entry['Prob'] = float(entry['Positive']) / float(total)
             else: entry['Prob'] = 0.0   # Empty bin: no peptides observed for this size
         ### ~ [2] ~ Save File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pdb.saveToFile(pfile)
         return pdb
     except: self.errorLog('Problem during %s._peptideProbabilities().' % self); return None  # Setup failed
Пример #33
0
 def run(self):  ### Main run method
     '''
     Main run method.

     Loads the SPF input table (prompting for a file name while InFile is missing), standardises
     unclassified/unassigned taxon labels at each of the seven levels, gives a numeric suffix to any taxon name
     found under more than one parent, warns about names repeated between adjacent levels, saves the modified
     table and finally saves one compressed *.spf table per taxonomic level.
     Any error is logged and then re-raised.
     '''
     try:
         ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         infile = self.getStr('InFile')
         while not rje.exists(infile):
             prompt = 'File "%s" not found. Input file name? (Blank to quit):' % infile
             infile = rje.choice(prompt)
             if not infile:
                 return self.printLog('#QUIT', 'Execution terminated!')
         db = rje_db.Database(self.log, self.cmd_list)
         db.basefile(rje.baseFile(infile))
         sdb = db.addTable(infile, mainkeys='#', delimit='\t', name='SPF.Mod')
         ranks = ['Level_1', 'Level_2', 'Level_3', 'Level_4', 'Level_5', 'Level_6', 'Level_7']
         levels = dict(zip(ranks, 'kpcofgs'))    # Field name -> single-letter taxon prefix
         # k__Bacteria	p__Proteobacteria	c__Alphaproteobacteria	o__Rhodospirillales	f__Rhodospirillaceae	g__	s__	denovo44
         # Unassigned	unclassified	unclassified	unclassified	unclassified	unclassified	unclassified	denovo49
         ### ~ [2] ~ Modify Text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         dupnames = []   # (name, name one level up) pairs that match after the 3-character prefix
         parents = {}    # Parent for each term: {name: {parent: name used under that parent}}
         renamed = []    # One element per renaming; the count per name provides the numeric suffix
         etot = sdb.entryNum()
         for ex, entry in enumerate(sdb.entries()):
             self.progLog('\r#SPF', 'Modifying SPF content: %.1f%%' % (100.0 * ex / etot))
             taxon = ''
             parent = ''
             for lvl in ranks:
                 name = entry[lvl] = entry[lvl].replace('unidentified', 'unclassified')
                 null = '%s__' % levels[lvl]
                 blanks = [null, 'Unassigned', 'unclassified']
                 for suffix in ['unclassified', 'unidentified', 'unculturedfungus', 'Incertae_sedis', 'unclassified_sp.']:
                     blanks.append('%s%s' % (null, suffix))
                 ## ~ [2a] ~ Standardise empty/unclassified labels using the level above ~~~~~~~ ##
                 if name in blanks:
                     if not taxon or taxon.endswith('unclassified'):
                         name = '%sunclassified' % null
                     elif taxon.endswith('unassigned)'):
                         # Level above is already "X(..-unassigned)": extend its bracketed list
                         name = '%s%s;%s-unassigned)' % (null, taxon[3:-1], levels[lvl])
                     else:
                         name = '%s%s(%s-unassigned)' % (null, taxon[3:], levels[lvl])
                     entry[lvl] = name
                 ## ~ [2b] ~ Keep each name unique to a single parent ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if name in parents:
                     known = parents[name]
                     if parent in known:
                         name = entry[lvl] = known[parent]
                     else:
                         self.bugPrint(name)
                         self.bugPrint(known)
                         renamed.append(name)
                         newtax = '%s%d' % (name, renamed.count(name))
                         self.warnLog('%s had multiple parents (%s & %s) -> %s' % (name, '|'.join(known), parent, newtax))
                         parents[newtax] = {parent: newtax}
                         known[parent] = newtax
                         name = entry[lvl] = newtax
                         self.deBug(parents[name])
                 elif parent:
                     parents[name] = {parent: name}
                 parent = name
                 ## ~ [2c] ~ Note names repeated from the level above ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if name[3:] == taxon[3:] and (name, taxon) not in dupnames:
                     dupnames.append((name, taxon))
                 taxon = name
         self.printLog('\r#SPF', 'Modifying SPF content complete.')
         for duppair in sorted(dupnames):
             self.warnLog('Duplicate taxa names: %s & %s' % duppair)
         ### ~ [3] ~ Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb.saveToFile(savefields=sdb.list['Fields'][1:])
         ### ~ [4] ~ Compress to different taxonomic levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         compress = ranks + ['#']
         dump = compress.pop()   # Field to drop after the next compression ('#' first, then the lowest level)
         rules = {'Observation Ids': 'list', dump: 'str'}
         sdb.dropField('Observation Ids')
         while compress:
             sdb.compress(compress, rules=rules, default='sum', best=[], joinchar='|')
             sdb.dropField(dump)
             lowest = compress[-1]
             sdb.saveToFile('%s.SPF.%s.%s.spf' % (rje.baseFile(infile), lowest, levels[lowest]))
             # Move up one level: the level just saved stops being a key for the next round
             dump = compress.pop()
             rules[dump] = 'list'
         return
     except:
         self.errorLog(self.zen())
         raise  # Delete this if method error not terrible