def endFork(self,fdict):    ### Ends fork, tidies and sets new one running
    '''
    Ends fork, tidies and sets new one running.
    >> fdict:dict = fork dictionary; may contain ResFile, Log, PID, FID and cmd keys.
    Transfers per-fork results files onto the main basefile outputs, appends the fork
    log onto the main log, then starts the next queued fork regardless of errors.
    '''
    try:### ~ [1] ~ End and tidy current job ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if 'ResFile' in fdict:
            for resfile in fdict['ResFile']:
                fromfile = '%s.%s' % (fdict['FID'],resfile)
                if not rje.exists(fromfile): continue #self.warnLog('Results file %s missing!' % fromfile); continue
                tofile = '%s.%s' % (self.baseFile(),resfile)
                if rje.exists(tofile):
                    # Append fork results, skipping the header line already present in tofile
                    open(tofile,'a').writelines(open(fromfile,'r').readlines()[1:])
                    os.unlink(fromfile)     # Fix: remove transferred results file (was left behind, risking re-append)
                else: rje.fileTransfer(fromfile,tofile)
        if 'Log' in fdict:
            if 'cmd' in fdict:
                # Trim the standard header/footer lines from the fork's log before appending
                open(self.log.info['LogFile'],'a').writelines(open(fdict['Log'],'r').readlines()[5:-1])
                os.unlink(fdict['Log'])
            else: rje.fileTransfer(fdict['Log'],self.log.info['LogFile'])
            if self.getBool('LogFork'):
                self.printLog('#END','Fork %s ended: log content transferred' % fdict['PID'])
                self.printLog('#~~#','#~~#',timeout=False)
            #if self.dev(): self.deBug(fdict['Log'])
            #if self.dev(): self.deBug(rje.exists(fdict['Log']))
        elif 'PID' in fdict and string.split('%s' % fdict['PID'])[0] == 'WAIT': pass    # Placeholder fork: nothing to tidy
        else: self.printLog('#END','Fork %s ended.' % fdict['PID'])
    except IOError:
        # Tolerate a limited number of IOErrors before giving up
        if self.getInt('IOError') == 1: self.errorLog('Forker.endFork IOError limit reached'); raise
        else: self.int['IOError'] -= 1; self.errorLog('Forker.endFork')
    except: self.errorLog('Forker.endFork error')
    self.nextFork()      # Carry on regardless
def restFullOutput(self,maxparsesize=0): ### Returns full REST output from file
    '''
    Returns full REST output from file.
    >> maxparsesize:int [0] = max file size (bytes) to read into the output; 0 = no limit.
    << str = concatenated REST output, one delimited section per key in self.list['RestKeys'].
    '''
    # If the REST input file already exists and force=F, return its content directly
    if rje.exists(self.getStr('RestIn')) and not self.force(): return open(self.getStr('RestIn'),'r').read()
    try: jobid = self.dict['Output']['jobid']
    except: jobid = None
    rtxt = '%s\n' % self.dict['Output']['intro']
    for rkey in self.list['RestKeys']:
        # Section delimiter: must match the delimiter used by parse()
        rtxt += '###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'
        rtxt += '# %s: %s\n' % (rkey,self.dict['Outfile'][rkey])
        #?# Q. Why only if jobid? Is it not always good to replace with content?
        # First line of the stored output may be a file name rather than literal content
        if jobid and rje.exists(string.split(self.dict['Output'][rkey],'\n')[0]):   ### File given instead of content
            rfile = string.split(self.dict['Output'][rkey],'\n')[0]
            fext = string.split(rfile,'.')[-1]  # NOTE(review): only used by the commented-out html branch below
            nbytes = os.path.getsize(rfile)
            if nbytes > maxparsesize > 0:   # Too large to parse: point user at the retrieve URL instead
                otext = '%s is too large to return (%s > %s)' % (os.path.basename(rfile),rje.humanByteSize(nbytes),rje.humanByteSize(maxparsesize))
                resturl = '%sretrieve&jobid=%s&rest=%s[&password=X]' % (self.getStr('RestURL'),jobid,rkey)
                rtxt += '%s in full output. Try %s.' % (otext,resturl)
            elif rfile.endswith('.png'):
                # Graphics: include the file path only, not the binary content
                rtxt += '%s\n' % rfile #rtxt += 'Cannot return graphic in full output\n'
            #elif fext in ['htm','html']:
            #    rtxt += 'Cannot return HTML in full output\n'
            else:
                outtxt = open(rfile,'r').read()
                if not outtxt.endswith('\n'): outtxt += '\n'
                rtxt += outtxt
        else: rtxt += '%s\n' % self.dict['Output'][rkey]
    return rtxt
def sgd2sp(self):   ### Reformats yeast sequence names and outputs new data for GOPHER
    '''
    Reformats yeast sequence names and outputs new data for GOPHER.
    Maps SGD/EnsG identifiers onto UniProt accessions via the XRef table, renames
    matching YEAST sequences to UniprotID__AccNum format, and saves full and
    yeast-only fasta files. Populates self.dict['Rename'] (sgd -> acc) and
    self.list['YeastSeq'].
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        inseq = self.obj['SeqList']
        uni = rje_uniprot.UniProt(self.log,self.cmd_list+['datout=None'])
        xref = self.db('XRef')
        self.dict['Rename'] = {}
        ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        ufile = '%s.dat' % self.info['Basefile']
        if os.path.exists(ufile) and not self.opt['Force']: uni.readUniProt(ufile,clear=True,cleardata=False)
        else:
            # Extract entries for all UniProt accessions found in the XRef table and cache them
            uni.readUniProt(clear=True,acclist=rje.sortKeys(xref.index('UniProt')),cleardata=False)
            uni.saveUniProt(ufile)
        ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniseq = {}     # AccNum -> Sequence object
        for entry in uni.entries():
            seq = entry.obj['Sequence']
            uniseq[seq.info['AccNum']] = seq
        self.printLog('\r#USEQ','%s UniProt Sequences extracted (%s Ensembl AccNum)' % (rje.iStr(len(uniseq)), rje.iStr(len(xref.index('UniProt')))))
        ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        yseq = []   # List of YEAST sequence objects
        (sx,stot) = (0.0,inseq.seqNum())
        for seq in inseq.seqs():
            self.progLog('\r#SEQ','Reformatting sequence names: %.2f%%' % (sx/stot)); sx += 100.0
            if seq.info['SpecCode'] != 'YEAST': continue    # Only rename yeast sequences
            yseq.append(seq)
            sgd = seq.info['AccNum']; newname = seq.info['Name']
            try:
                for x in xref.indexEntries('EnsG',sgd):
                    acc = x['UniProt']
                    if acc: newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (seq.info['Name'],x['Gene'],x['EnsG'],x['SGD'],acc)
                    else: newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (seq.info['Name'],x['Gene'],x['EnsG'],x['SGD']); continue
                    if acc not in uniseq: self.printLog('\r#UNIERR','Unable to find UniProt sequence %s (%s)' % (acc,sgd)); continue
                    useq = uniseq[acc]
                    # Only rename when the UniProt sequence matches the input sequence exactly
                    if useq.info['Sequence'] != seq.info['Sequence']:
                        self.printLog('\r#SEQERR','%s sequence <> %s sequence' % (sgd,acc)); continue
                    nsplit = string.split(newname)
                    nsplit[0] = '%s__%s' % (x['UniprotID'],acc)     # Replace leading ID with UniprotID__AccNum
                    newname = string.join(nsplit)
                    self.dict['Rename'][sgd] = acc
                    break   # First verified UniProt mapping wins
            except: self.errorLog('%s problem' % sgd)
            seq.info['Name'] = newname
            seq.extractDetails(gnspacc=True)
        self.printLog('\r#SEQ','Reformatting sequence names complete.')
        ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not rje.exists('%s.ygob.fas' % self.info['Basefile']): inseq.saveFasta(seqfile='%s.ygob.fas' % self.info['Basefile'])
        if not rje.exists('%s.yeast.fas' % self.info['Basefile']): inseq.saveFasta(seqs=yseq,seqfile='%s.yeast.fas' % self.info['Basefile'])
        self.list['YeastSeq'] = inseq.accList(yseq)
    except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
def enrichment(self):   ### Performs final enrichment analysis on SLiMDIP and Random datasets.
    '''
    Performs final enrichment analysis on SLiMDIP and Random datasets.

    Needs the "real" predicted DMI from slimDIP() plus the randomised PPI datasets
    from randomisePPI(); the latter provide the background distribution used for
    enrichment "p-values" and the summary output for slimdip.R histograms.

    Requires:
    - slimdip table (or *.slimdip.tdt output file to load).
    - randbase.XX.tdt files.
    << True on success; False on failure.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] Check for required data else run preceding step(s) ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Real predicted DMI: only run slimDIP() if the table is not already loaded.
        if not self.db('slimdip'):
            if not self.slimDIP(): return False
        # Randomised PPI datasets (randbase.XXX.tdt): regenerate all on first miss.
        randppi = self.getInt('RandPPI')
        for rnum in range(randppi):
            randfile = '%s.%s.tdt' % (self.getStr('RandBase'),rje.preZero(rnum,randppi-1))
            if rje.exists(randfile): continue
            if not self.randomisePPI(): return False
            break   # randomisePPI() makes the full set; no need to check the rest
        ### ~ [1] Perform Enrichment Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Perform SLiMDIP analysis of each random PPI dataset ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        return True
    except:
        self.errorLog('%s.enrichment() error' % self.prog())
        return False
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.
    Normalises the RestIn setting: URLs get any cmd=file:FILE arguments replaced
    with the file content; plain values are converted to file paths.
    << True on success; False on failure.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Check and modify URL if required ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.getStr('RestIn').startswith('http:'):
            #!# Check for rest URL and add if missing
            #!# Split on &
            restcmd = string.split(self.getStr('RestIn'),'&')
            for i in range(len(restcmd)):
                if '=' not in restcmd[i]: continue  # Not an option=value pair
                (opt,value) = string.split(restcmd[i],'=',1)
                if value.startswith('file:'):   # Conversion of cmd=file:FILE into cmd=CONTENT
                    rfile = string.split(value,':',1)[1]
                    #!# Consider adding max size constraint. Probably a URL size limit.
                    if rje.exists(rfile):
                        # Embed file content, with newlines escaped as literal \n
                        restcmd[i] = '%s=%s' % (opt,rje.chomp(string.join(open(rfile,'r').readlines(),'\\n')))
                        if '&' in restcmd[i]:
                            # '&' inside content would break URL splitting: convert to '+'
                            self.warnLog('%s "&" => "+" conversions for %s.' % (rje.iStr(restcmd[i].count('&')),rfile))
                            restcmd[i] = string.replace(restcmd[i],'&','+')
                    else: self.warnLog('File "%s" not found.' % rfile,quitchoice=True)
            self.setStr({'RestIn':string.join(restcmd,'&')})
        ## ~ [1b] Direct Parsing of output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:   # Convert to file
            self.setStr({'RestIn':rje.makePath(self.getStr('RestIn'),True)})
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def restOutput(self, outfmt=None, maxparsesize=0):  ### Returns rest output for outfmt
    '''
    Returns rest output for outfmt.
    >> outfmt:str [None] = REST output key; defaults to the rest=X setting.
    >> maxparsesize:int [0] = max file size (bytes) to read back; 0 = no limit.
    << str = output text, file content, or an error/pointer message.
    '''
    if not outfmt: outfmt = self.getStrLC('Rest')
    if not outfmt: return 'No REST output'
    if outfmt in self.dict['Output']:
        # First line of the stored output may be a file name rather than literal content
        rfile = string.split(self.dict['Output'][outfmt], '\n')[0]
        if rje.exists(rfile):
            nbytes = os.path.getsize(rfile)
            if nbytes > maxparsesize > 0:  # Too large to parse: point user at retrieve URL
                otext = '%s is too large to return (%s)' % (
                    os.path.basename(rfile), rje.humanByteSize(nbytes))
                try: jobid = self.dict['Output']['jobid']
                except: jobid = None
                resturl = '%sretrieve&jobid=%s&rest=%s[&password=X]' % (
                    self.getStr('RestURL'), jobid, outfmt)
                # Without a jobid (or for the primary output) there is no retrieval fallback
                if not jobid or outfmt == self.getStrLC('Rest'):
                    return 'ERROR: %s' % (otext)
                else:
                    return '%s in full output. Try %s.' % (otext, resturl)
            else:
                return open(rfile, 'r').read()
        return self.dict['Output'][outfmt]
    elif outfmt in ['parse', 'format']:
        intro = '<pre>%s</pre>\n\n' % self.restOutput('intro')
        return intro
    elif outfmt in ['default', 'full']:
        return self.restFullOutput(maxparsesize)
    return 'No %s output generated.' % outfmt
def makePPI(self):  ### Generates files for Human-HIV PPI analysis
    '''
    Generates files for Human-HIV PPI analysis.
    For each HIV protein, makes a directory named after the HIV gene and writes
    one fasta file per human interactor: the HIV sequence followed by the human
    PPI sequences copied from PPIDir.
    << False if the HIV sequence file cannot be loaded; None otherwise.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % self.getStr('HIVSeq'),'autoload=T'])
        if not seqlist.seqs(): return False
        seqmap = seqlist.seqNameDic('Max')
        mdb = self.db('HHPIDMap')
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for hivacc in mdb.index('AccHIV'):
            # map HIV accession numbers on to sequences seqNameDic
            accnum = string.split(hivacc,'.')[0]    # Drop accession version suffix
            hivseq = seqmap[accnum]
            # extract short HIV name from sequence ID
            hivgene = string.split(hivseq.shortName(),'_')[0].upper()
            # create directory named after HIV gene
            #self.progLog('\r#PPI','Generating human-HIV PPI fasta files for %s' % (hivgene))
            rje.mkDir(self,'%s/' % hivgene,log=True)
            # copy human PPI files into directories, adding HIV gene
            ex = 0.0; etot = len(mdb.index('AccHIV')[hivacc])   # NOTE(review): ex is never incremented/used
            for entry in mdb.indexEntries('AccHIV',hivacc):
                self.progLog('\r#PPI','Generating human-HIV PPI fasta files for %s %s PPI' % (rje.iStr(etot),hivgene))
                pfile = self.getStr('PPIDir') + entry['Symbol'] + '.ppi.fas'
                if rje.exists(pfile):
                    # HIV sequence first, then the human PPI sequences
                    FAS = open('%s/%s.%s.ppi.fas' % (hivgene,hivgene.lower(),entry['Symbol']),'w')
                    FAS.write('>%s\n%s\n' % (hivseq.info['Name'],hivseq.getSequence()))
                    FAS.write(open(pfile,'r').read())
                    FAS.close()
                else: self.errorLog('Cannot find human PPI file for %s interactor "%s"' % (entry['HIV'],entry['Symbol']),printerror=False)
            self.printLog('\r#PPI','Generated human-HIV PPI fasta files for %s %s (%s) PPI.' % (rje.iStr(etot),entry['HIV'],hivgene))
    except: self.errorLog('%s.makePPI error' % self); return False
def loadXRef(self):  ### Load Identifier XRef Data
    '''
    Load Identifier XRef Data into the 'XRef' database table.
    Reuses a previously saved *.xref.tdt file unless force=T; otherwise loads the
    raw XRef input, renames verbose source headers to short field codes, and saves
    the processed table.
    << xref:Table, or False if the XRef input file is missing.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        xfile = '%s.xref.tdt' % self.info['Basefile']
        # Reuse previously processed table unless force=T
        if rje.exists(xfile) and not self.opt['Force']:
            return self.db().addTable(xfile,mainkeys=['#'],datakeys='All',name='XRef')
        if not rje.checkForFile(self.info['XRef']): return False
        # Verbose source headers -> short field codes
        changehead = {'Ensembl Gene ID':'EnsG','Ensembl Protein ID':'EnsP','Associated Gene Name':'Gene',
                      'Associated Gene DB':'GeneDB','UniProt/SwissProt ID':'UniprotID',
                      'UniProt/SwissProt Accession':'UniProt','SGD Gene':'SGD'}
        ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        xref = self.db().addTable(self.info['XRef'],mainkeys='All',datakeys='All',name='XRef')
        for (oldfield,newfield) in changehead.items():
            if oldfield in xref.fields(): xref.renameField(oldfield,newfield)
        xref.saveToFile(xfile)
        return xref
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def seqinObj(self, summarise=True, gapstats=True ):  ### Returns a SeqList object for the SeqIn file
    '''
    Returns a SeqList object for the SeqIn file, creating and caching it on first call.
    >> summarise:bool [True] = whether to add summarise/dna/raw commands to the SeqList.
    >> gapstats:bool [True] = whether to request gap statistics (skipped if *.gaps.tdt exists and force=F).
    :return: self.obj['SeqIn']
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): seqbase strips the path, so the gaps.tdt check assumes the file is in the CWD — confirm
        seqbase = rje.baseFile(self.getStr('SeqIn'), strip_path=True)
        if not self.obj['SeqIn']:
            seqcmd = self.cmd_list
            if summarise: seqcmd += ['summarise=T', 'dna=T', 'raw=F']
            # Only compute gap stats when forced or not previously generated
            gapstats = gapstats and (
                self.force() or not rje.exists('%s.gaps.tdt' % seqbase))
            if gapstats: seqcmd += ['gapstats']
            self.obj['SeqIn'] = rje_seqlist.SeqList(
                self.log, seqcmd + ['autoload=T', 'seqmode=file', 'autofilter=F'])
            # sx = 0.0; stot = self.obj['SeqIn'].seqNum()
            # for seq in self.obj['SeqIn'].seqs():
            #     self.progLog('\r#CHECK','Checking sequences names: %.1f%%' % (sx/stot)); sx += 100.0
            #     if '|' in self.obj['SeqIn'].shortName(seq):
            #         raise ValueError('Pipe "|" characters found in seqin=FILE names: will break program. Please rename and try again.')
            # self.printLog('\r#CHECK','Checking sequences names complete.')
    except ValueError:
        self.printLog('\r#CHECK', 'Checking sequences names aborted.')
        self.errorLog('DepthCharge input sequence error')
        raise
    except:
        self.errorLog('DepthCharge.seqinObj() error')
    return self.obj['SeqIn']
def enrichment( self ):  ### Performs final enrichment analysis on SLiMDIP and Random datasets.
    '''
    Performs final enrichment analysis on SLiMDIP and Random datasets.

    This requires the "real" predicted DMI from the slimDIP() method plus the randomised PPI datasets (from
    randomisePPI()). The latter are also run through the slimDIP() method to generate a background distribution.
    This is used directly to calculate enrichment "p-values" but also to generate a summary output file that can
    be used for generating histograms etc. with slimdip.R.

    This method needs:
    - slimdip table (or *.slimdip.tdt output file to load).
    - randbase.XX.tdt files.
    << True on success; False on failure.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] Check for required data else run preceding step(s) ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Check for SLiMDIP table of real predicted DMI. Load in slimDIP() method if present and not in Database Table.
        if not self.db('slimdip') and not self.slimDIP(): return False
        # Check for randomised PPI datasets. These should be named randbase.XXX.tdt.
        for r in range(self.getInt('RandPPI')):
            randfile = '%s.%s.tdt' % (self.getStr('RandBase'), rje.preZero(
                r, self.getInt('RandPPI') - 1))
            if not rje.exists(randfile):
                # One regeneration run makes the full set, so stop checking after it
                if not self.randomisePPI(): return False
                break
        ### ~ [1] Perform Enrichment Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Perform SLiMDIP analysis of each random PPI dataset ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        return True
    except:
        self.errorLog('%s.enrichment() error' % self.prog())
        return False
def endFork(self, fdict):  ### Ends fork, tidies and sets new one running
    '''
    Ends fork, tidies and sets new one running.
    >> fdict:dict = fork dictionary; may contain ResFile, Log, PID, FID and cmd keys.
    Transfers per-fork results files onto the main basefile outputs, appends the
    fork log onto the main log, then starts the next queued fork regardless of errors.
    '''
    try:### ~ [1] ~ End and tidy current job ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if 'ResFile' in fdict:
            for resfile in fdict['ResFile']:
                fromfile = '%s.%s' % (fdict['FID'], resfile)
                if not rje.exists(fromfile): continue #self.warnLog('Results file %s missing!' % fromfile); continue
                tofile = '%s.%s' % (self.baseFile(), resfile)
                if rje.exists(tofile):
                    # Append fork results, skipping the header line already present in tofile
                    open(tofile, 'a').writelines(
                        open(fromfile, 'r').readlines()[1:])
                    os.unlink(fromfile)
                else:
                    rje.fileTransfer(fromfile, tofile)
        if 'Log' in fdict:
            if 'cmd' in fdict:
                # Trim the standard header/footer lines from the fork's log before appending
                open(self.log.info['LogFile'], 'a').writelines(
                    open(fdict['Log'], 'r').readlines()[5:-1])
                os.unlink(fdict['Log'])
            else:
                rje.fileTransfer(fdict['Log'], self.log.info['LogFile'])
            if self.getBool('LogFork'):
                self.printLog(
                    '#END',
                    'Fork %s ended: log content transferred' % fdict['PID'])
                self.printLog('#~~#', '#~~#', timeout=False)
            #if self.dev(): self.deBug(fdict['Log'])
            #if self.dev(): self.deBug(rje.exists(fdict['Log']))
        elif 'PID' in fdict and string.split(
                '%s' % fdict['PID'])[0] == 'WAIT':
            pass    # Placeholder fork: nothing to tidy
        else:
            self.printLog('#END',
                          'Fork %s ended.' % fdict['PID'],
                          log=self.getBool('LogFork'),
                          screen=self.getBool('LogFork') or self.v() > 1)
    except IOError:
        # Tolerate a limited number of IOErrors before giving up
        if self.getInt('IOError') == 1:
            self.errorLog('Forker.endFork IOError limit reached')
            raise
        else:
            self.int['IOError'] -= 1
            self.errorLog('Forker.endFork')
    except:
        self.errorLog('Forker.endFork error')
    self.nextFork()  # Carry on regardless
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.
    Builds the ProDigIS and Source database tables, loads the input sequence
    files, and sets up peptide probability fields.
    << False on failure; None otherwise (early return before section [4]).
    '''
    try:### ~ [1] Setup Database ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
        db = self.db().addEmptyTable('ProDigIS',['AccNum','Protease','PepCount'],['AccNum','Protease'])
        if self.getInt('MinPepLen') > 0: db.addField('MinPepLen')
        if self.getBool('NRPep'): db.addField('NRPep')
        if rje.exists(self.getStr('Source')):
            fdb = self.db().addTable(self.getStr('Source'),mainkeys=['AccNum'],name='Source')
            fdb.addField('File')
            fdb.addField('ProtMWt')
        else: fdb = self.db().addEmptyTable('Source',['AccNum','File','ProtMWt'],['AccNum'])
        # One count field per peptide length (and per peptide MWt bin if pepmwt=T)
        for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i)
        if self.getBool('PepMWt'):
            for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i*100.0)
        ### ~ [2] Load Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autoload=F'])
        self.obj['SeqList'].seq = fullseq = []
        for seqfile in self.list['SeqFiles']:
            file = rje.baseFile(seqfile,True)
            seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % seqfile,'autoload=T'])
            fullseq += seqlist.seqs()
            for seq in seqlist.seqs():
                accnum = seq.getStr('AccNum')
                try:
                    # EAFP: existing Source entry -> update; KeyError -> add new entry
                    entry = fdb.data()[accnum]
                    if 'File' in entry and entry['File']:
                        self.errorLog('%s found in %s AND %s!' % (accnum,entry['File'],file),printerror=False)
                    entry['File'] = file
                    entry['ProtMWt'] = seq.MWt()
                except:
                    entry = {'AccNum':accnum,'File':file,'ProtMWt':seq.MWt()}
                    fdb.addEntry(entry)
                self.deBug(fdb.dict['Data'][seq.getStr('AccNum')])
        self.printLog('#SEQ','%s sequences to analyse in total' % rje.iLen(fullseq))
        fdb.fillBlanks()
        ### ~ [3] Setup Peptide Probabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self._peptideProbabilities():
            db.addField('LenExp','PepCount');
            if self.getBool('PepMWt'):
                db.addField('MWtExp','LenExp'); db.addField('Len7Exp','MWtExp')
            else: db.addField('Len7Exp','LenExp')
            db.addField('Len37','Len7Exp')
            if self.getBool('PepMWt'):
                db.addField('Len5','MWtExp'); db.addField('MWt5','Len5')
                db.addField('Len3','MWtExp'); db.addField('MWt3','Len3')
            else: db.addField('Len5','LenExp'); db.addField('Len3','LenExp')
        return
        # NOTE(review): unconditional return above leaves section [4] unreachable — presumably temporary scaffolding; confirm
        ### ~ [4] Temp GABLAM Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdb = self.db().addTable('Chlam_Pos.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GABLAM')
        ndb = self.db().addTable('Chlam_Neg.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GNeg')
        self.db().mergeTables(gdb,ndb,overwrite=True,matchfields=True)
        gdb.renameField('Qry','AccNum')
        tmp = self.db().joinTables(name='blast',join=[('Source','AccNum'),('GABLAM','AccNum')],newkey=['AccNum','File'],keeptable=False)
        tmp.saveToFile()
        tmp.compress(['File'],default='mean')
        tmp.dropFields(['AccNum'])
        tmp.info['Name'] = 'blastsum'
        tmp.saveToFile()
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def haqBatch(self,force=False):     ### Generates Batch and INI files for HAQESAC runs
    '''
    Generates Batch and INI files for HAQESAC runs.
    >> force:bool [False] = regenerate the files even if they already exist.
    Writes haqesac.ini (shared commands) and haqesac.bat (one haqesac.py call per
    sequence) into the HaqDir directory.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        haqdir = self.info['HaqDir']
        batfile = rje.makePath('%shaqesac.bat' % haqdir,wholepath=True)
        inifile = rje.makePath('%shaqesac.ini' % haqdir,wholepath=True)
        # Keep existing files unless regeneration is requested or either file is missing
        regenerate = force or self.force() or not (rje.exists(batfile) and rje.exists(inifile))
        if not regenerate: return self.printLog('#HAQBAT','HAQESAC Batch files found.')
        rje.backup(self,batfile)
        rje.backup(self,inifile)
        ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        inicmd = [cmd for cmd in self.cmd_list if cmd[:4].lower() != 'ini=']    # Drop ini= to avoid recursion
        if self.opt['MultiHAQ']: inicmd += ['multihaq=T','force=F']
        open(inifile,'w').write(string.join(inicmd,'\n'))
        ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            acc = seq.info['AccNum']
            seqcmd = ['seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc]
            open(batfile,'a').write('python %shaqesac.py %s\n' % (self.info['Path'],string.join(seqcmd)))
        self.printLog('#HAQBAT','HAQESAC Batch file output to %s' % batfile)
    except: self.errorLog('Major problem with MultiHAQ.haqBatch',quitchoice=True)
def setup(self):  ### Main class setup method.
    '''
    Main class setup method.

    This will load sequences into a SeqList object, gaps into a 'gaps' database
    table, and check or generate a PAF file from the mapped long reads.
    << True on success; False on failure.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DB'] = rje_db.Database(self.log, self.cmd_list)
        if not self.getStrLC('SeqIn'):
            raise ValueError('seqin=FILE must be set')
        if not rje.exists(self.getStr('SeqIn')):
            raise IOError('Unable to read seqin=FILE: "{0}"'.format(
                self.getStr('SeqIn')))
        seqbase = rje.baseFile(self.getStr('SeqIn'), strip_path=True)
        if not self.getStrLC('Basefile'): self.baseFile(seqbase)
        # Only compute gap stats when the gaps table is missing or force=T
        if rje.checkForFiles(filelist=['.gaps.tdt'],
                             basename=seqbase,
                             log=self.log) and not self.force():
            self.cmd_list.append('gapstats=F')
        else:
            self.cmd_list.append('gapstats=T')
        seqin = self.seqinObj()     # Loads/caches the SeqList (and generates gaps.tdt if needed)
        gapdb = self.db().addTable('%s.gaps.tdt' % seqbase,
                                   mainkeys=['seqname', 'start', 'end'],
                                   name='gaps',
                                   ignore=[],
                                   expect=True)
        gapdb.dataFormat({'start': 'int', 'end': 'int'})
        ### ~ [2] PAF File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.getStrLC('PAF'):
            self.setStr({'PAF': self.baseFile() + '.paf'})
        pfile = self.getStr('PAF')
        # Generate the PAF mapping file if missing or force=T
        if self.force() or not rje.exists(pfile):
            paf = rje_paf.PAF(self.log, self.cmd_list)
            paf.longreadMinimapPAF(pfile)
        if not rje.exists(self.getStr('PAF')):
            raise IOError(
                'Unable to read or create PAF file: {0}'.format(pfile))
        return True
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False  # Setup failed
def haqBatch( self, force=False):  ### Generates Batch and INI files for HAQESAC runs
    '''
    Generates Batch and INI files for HAQESAC runs.
    >> force:bool [False] = regenerate the files even if they already exist.
    Writes haqesac.ini (shared commands) and haqesac.bat (one haqesac.py call
    per sequence) into the HaqDir directory.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = rje.makePath('%shaqesac.bat' % self.info['HaqDir'],
                               wholepath=True)
        inifile = rje.makePath('%shaqesac.ini' % self.info['HaqDir'],
                               wholepath=True)
        # Regenerate when forced or either file is missing; otherwise keep existing files
        if force or self.force(
        ) or not rje.exists(batfile) or not rje.exists(inifile):
            rje.backup(self, batfile)
            rje.backup(self, inifile)
        else:
            return self.printLog('#HAQBAT', 'HAQESAC Batch files found.')
        ### ~ [1] Make INI File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        haqcmd = []
        for cmd in self.cmd_list:
            # Drop any ini= command to avoid recursive INI loading
            if cmd[:4].lower() != 'ini=': haqcmd.append(cmd)
        if self.opt['MultiHAQ']: haqcmd += ['multihaq=T', 'force=F']
        open(inifile, 'w').write(string.join(haqcmd, '\n'))
        ### ~ [2] Make Batch file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            acc = seq.info['AccNum']
            haqcmd = [
                'seqin=%s.fas' % acc,
                'query=%s' % acc,
                'basefile=%s' % acc
            ]
            open(batfile, 'a').write(
                'python %shaqesac.py %s\n' %
                (self.info['Path'], string.join(haqcmd)))
        self.printLog('#HAQBAT',
                      'HAQESAC Batch file output to %s' % batfile)
    except:
        self.errorLog('Major problem with MultiHAQ.haqBatch',
                      quitchoice=True)
def makePPI(self):  ### Generates files for Human-HIV PPI analysis
    '''
    Generates files for Human-HIV PPI analysis.
    For each HIV protein, makes a directory named after the HIV gene and writes
    one fasta file per human interactor: the HIV sequence followed by the human
    PPI sequences copied from PPIDir.
    << False if the HIV sequence file cannot be loaded; None otherwise.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = rje_seq.SeqList(
            self.log,
            self.cmd_list +
            ['seqin=%s' % self.getStr('HIVSeq'), 'autoload=T'])
        if not seqlist.seqs(): return False
        seqmap = seqlist.seqNameDic('Max')
        mdb = self.db('HHPIDMap')
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for hivacc in mdb.index('AccHIV'):
            # map HIV accession numbers on to sequences seqNameDic
            accnum = string.split(hivacc, '.')[0]   # Drop accession version suffix
            hivseq = seqmap[accnum]
            # extract short HIV name from sequence ID
            hivgene = string.split(hivseq.shortName(), '_')[0].upper()
            # create directory named after HIV gene
            #self.progLog('\r#PPI','Generating human-HIV PPI fasta files for %s' % (hivgene))
            rje.mkDir(self, '%s/' % hivgene, log=True)
            # copy human PPI files into directories, adding HIV gene
            ex = 0.0    # NOTE(review): ex is never incremented/used
            etot = len(mdb.index('AccHIV')[hivacc])
            for entry in mdb.indexEntries('AccHIV', hivacc):
                self.progLog(
                    '\r#PPI',
                    'Generating human-HIV PPI fasta files for %s %s PPI' %
                    (rje.iStr(etot), hivgene))
                pfile = self.getStr(
                    'PPIDir') + entry['Symbol'] + '.ppi.fas'
                if rje.exists(pfile):
                    # HIV sequence first, then the human PPI sequences
                    FAS = open(
                        '%s/%s.%s.ppi.fas' %
                        (hivgene, hivgene.lower(), entry['Symbol']),
                        'w')
                    FAS.write('>%s\n%s\n' %
                              (hivseq.info['Name'],
                               hivseq.getSequence()))
                    FAS.write(open(pfile, 'r').read())
                    FAS.close()
                else:
                    self.errorLog(
                        'Cannot find human PPI file for %s interactor "%s"'
                        % (entry['HIV'], entry['Symbol']),
                        printerror=False)
            self.printLog(
                '\r#PPI',
                'Generated human-HIV PPI fasta files for %s %s (%s) PPI.'
                % (rje.iStr(etot), entry['HIV'], hivgene))
    except:
        self.errorLog('%s.makePPI error' % self)
        return False
def loadXRef(self):     ### Load Identifier XRef Data
    '''
    Load Identifier XRef Data into the 'XRef' database table.
    Reuses a previously saved *.xref.tdt file unless force=T; otherwise loads the
    raw XRef input, renames verbose source headers to short field codes, and
    saves the processed table.
    << xref:Table, or False if the XRef input file is missing.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Reuse previously processed table unless force=T
        if rje.exists('%s.xref.tdt' % self.info['Basefile']) and not self.opt['Force']:
            return self.db().addTable('%s.xref.tdt' % self.info['Basefile'],mainkeys=['#'],datakeys='All',name='XRef')
        if not rje.checkForFile(self.info['XRef']): return False
        # Verbose source headers -> short field codes
        changehead = {'Ensembl Gene ID':'EnsG','Ensembl Protein ID':'EnsP','Associated Gene Name':'Gene',
                      'Associated Gene DB':'GeneDB','UniProt/SwissProt ID':'UniprotID',
                      'UniProt/SwissProt Accession':'UniProt','SGD Gene':'SGD'}
        ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        xref = self.db().addTable(self.info['XRef'],mainkeys='All',datakeys='All',name='XRef')
        for field in changehead:
            if field in xref.fields(): xref.renameField(field,changehead[field])
        xref.saveToFile('%s.xref.tdt' % self.info['Basefile']); return xref
    except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
def depthChargeForker(self):  ### Main DepthCharge forking method
    '''
    Work through each sequence and fork it out for DepthCharge analysis.
    Previously processed sequences (type='all' entries in *.depthcharge.tdt) are
    skipped unless force=T.
    << ddb:Table = depthcharge results table, or None on error.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqin = self.seqinObj()
        self.list['ToFork'] = seqin.list['Seq'][0:]     # Work on a copy of the input sequence list
        resfile = '{0}.depthcharge.tdt'.format(self.baseFile())
        if self.force():
            # Fix: rje.backup() takes the calling object as its first argument (cf. haqBatch's rje.backup(self,...))
            rje.backup(self, resfile, appendable=False)
        elif rje.exists(resfile):
            ddb = self.db().addTable(resfile,
                                     ['seqname', 'start', 'end', 'type'])
            ddb.dataFormat({'start': 'int', 'end': 'int'})
            # Sequences with a type='all' entry have already been fully processed
            complete = ddb.indexDataList('type', 'all', 'seqname')
            if complete:
                cx = 0
                for seq in self.list['ToFork'][0:]:     # Iterate a copy: the list is modified in the loop
                    if seqin.shortName(seq) in complete:
                        self.list['ToFork'].remove(seq)
                        cx += 1
                if cx:
                    self.printLog(
                        '#SKIP',
                        'Skipping {0} previously processed sequences (force=F)'
                        .format(rje.iStr(cx)))
            if not self.list['ToFork']:
                self.printLog(
                    '#CHARGE',
                    'All sequences previously processed (force=F)')
                return ddb
        # Fill the initial fork slots before entering the main forking loop
        while len(self.list['Forked']) < self.getNum(
                'Forks') and self.list['ToFork']:
            self.nextFork()
        ### ~ [2] ~ Work through each sequence and fork out ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.forking()
        self.printLog('#FORK',
                      'Forking of %s jobs completed.' %
                      (rje.iStr(seqin.seqNum())),
                      log=self.getBool('LogFork'))
        # Reload the (now complete) results table
        ddb = self.db().addTable(resfile,
                                 ['seqname', 'start', 'end', 'type'],
                                 replace=True)
        ddb.dataFormat({'start': 'int', 'end': 'int'})
        return ddb
    except:
        self.errorLog('%s.depthChargeForker error' % self.prog())
def parse(self):    ### Parse REST file into dictionaries
    '''
    Parse REST file into dictionaries.
    Reads REST output from RestIn (a file, or a job ID fetched over HTTP), splits
    it on the section delimiter, and populates self.dict['Output'] (key -> content),
    self.dict['Outfile'] (key -> local file name) and self.list['RestKeys'].
    << True on success; False on failure. (PureAPI HTTP fetches return the raw text.)
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['RestKeys'] = []
        rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
        if rje.exists(self.getStr('RestIn')): restin = open(self.getStr('RestIn'),'r').read()
        elif rje.matchExp('^(\d+)$',self.getStr('RestIn')):
            # RestIn is a job ID: fetch the output from the REST server
            url = '%sretrieve&jobid=%s&password=%s' % (self.getStr('RestURL'),self.getStr('RestIn'),self.getStr('Password'))
            if self.getBool('PureAPI') and self.getStrLC('Rest'): url += '&rest=%s' % (self.getStr('Rest'))
            else: url += '&rest=full'
            restin = urllib2.urlopen(url).read()
            if self.getBool('PureAPI'): return restin   # PureAPI: return raw server response unparsed
        else: raise IOError('%s not found!' % self.getStr('RestIn'))
        jobid = None    # Set from the first (intro) section; flags that the header has been parsed
        ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Delimiter must match the one written by restFullOutput()
        for restdata in string.split(restin,'###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'):
            if not jobid:
                # First section: intro text containing prog name and JobID
                self.dict['Output']['intro'] = restdata
                prog = rje.matchExp('Output for (\S+)',restdata)[0]
                self.dict['Output']['prog'] = prog
                jobid = rje.matchExp('JobID: (\d+)',restdata)[0]
                self.dict['Output']['jobid'] = jobid
                if not self.getStrLC('RestBase'): rbase = '%s%s' % (self.getStr('RestOutDir'),jobid)
                self.dict['Outfile']['jobid'] = '%s.jobid' % (rbase)
                continue
            # Each subsequent section starts with a '# key: [file]' header line
            restlines = string.split(restdata,'\n')
            rparse = string.split(restlines.pop(0))
            if rparse[0] != '#': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
            if rparse[1][-1] != ':': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
            rkey = rparse[1][:-1]
            try: rfile = '%s.%s' % (rbase,rje.baseFile(rparse[2],strip_path=True,keepext=True))
            except: rfile = ''  # No file name given in the header
            if not rfile: rfile = '%s.%s' % (rbase,rkey)
            # Avoid doubled jobid prefix when rbase is already jobid-based
            rfile = string.replace(rfile,'%s.%s.' % (jobid,jobid),'%s.' % jobid)
            self.dict['Output'][rkey] = string.join(restlines,'\n')
            self.dict['Outfile'][rkey] = rfile
            self.list['RestKeys'].append(rkey)
        self.printLog('#PARSE','Parsed %s: %d REST outputs.' % (self.getStr('RestIn'),len(self.dict['Output'])))
        return True
    except: self.errorLog('%s.parse error' % self); return False
def restOutput(self,outfmt=None,maxparsesize=0,asjson=False):   ### Returns rest output for outfmt
    '''
    Returns rest output for outfmt.
    >> outfmt:str [None] = REST output key; defaults to the rest=X setting.
    >> maxparsesize:int [0] = max file size (bytes) to read back; 0 = no limit.
    >> asjson:bool [False] = whether to return output as JSON via self.jsonText().
    << str = output text/JSON, file content, or an error/pointer message.
    '''
    if not outfmt: outfmt = self.getStrLC('Rest')
    if not outfmt: return self.jsonText('No REST output',asjson)    # Fix: return was missing; result was discarded
    if outfmt in self.dict['Output']:
        # First line of the stored output may be a file name rather than literal content
        rfile = string.split(self.dict['Output'][outfmt],'\n')[0]
        if rje.exists(rfile):
            fext = string.split(rfile,'.')[-1]
            if fext in ['png']:
                self.debug(rfile)
                # Fix: return was missing; graphics return the file path, not binary content
                return self.jsonText(rfile,asjson)
            nbytes = os.path.getsize(rfile)
            if nbytes > maxparsesize > 0:   # Too large to parse: point user at retrieve URL
                otext = '%s is too large to return (%s > %s)' % (os.path.basename(rfile),rje.humanByteSize(nbytes),rje.humanByteSize(maxparsesize))
                try: jobid = self.dict['Output']['jobid']
                except: jobid = None
                resturl = '%sretrieve&jobid=%s&rest=%s[&password=X]' % (self.getStr('RestURL'),jobid,outfmt)
                if not jobid or outfmt == self.getStrLC('Rest'): return self.jsonText('ERROR: %s' % (otext),asjson)
                else: return self.jsonText('%s in full output. Try %s.' % (otext,resturl),asjson)
            else:
                delimit = rje.delimitFromExt(filename=rfile,write=False)
                if asjson and delimit in [',','\t']:
                    # Delimited files: return each row as a JSON list
                    jtext = []
                    for rline in open(rfile,'r').readlines():
                        jtext.append(json.dumps(rje.readDelimit(rline,delimit)))
                    return '[%s]' % string.join(jtext,',\n        ')
                #!# Add json parsing of fasta files?
                else:
                    outtxt = open(rfile,'r').read()
                    if not outtxt.endswith('\n'): outtxt += '\n'
                    return self.jsonText(outtxt,asjson)
        elif asjson and outfmt in self.dict['Outfile']: pass #!# Sort out json formatting here based on file extension!
        return self.dict['Output'][outfmt]
    elif outfmt in ['parse','format']:
        intro = '<pre>%s</pre>\n\n' % self.restOutput('intro')
        return self.jsonText(intro,asjson)
    elif outfmt in ['default','full']: return self.jsonText(self.restFullOutput(maxparsesize),asjson)
    elif outfmt in ['restkeys','outputs']: return string.join(self.list['RestKeys']+[''],'\n')
    return self.jsonText('No %s output generated.' % outfmt,asjson)
def rmdKnit(self, rmdfile, document='html', stdout=False):  ### Knit Rmd to HTML/PDF file
    '''
    Knit an R markdown file into an output document via Rscript.
    >> rmdfile:str = R markdown file to knit
    >> document:str ['html'] = type of document to knit into
    >> stdout:bool [False] = whether to run via os.system (live stdout) rather than capturing
    << success:bool = whether output is generated
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outfile = '%s.%s' % (rje.baseFile(rmdfile), document)
        rcmd = 'Rscript -e \'library(rmarkdown); rmarkdown::render("%s", "%s_document")\'' % (rmdfile, document)
        self.printLog('#RCMD', rcmd)
        rcmd += ' 2>&1'  # Capture stderr alongside stdout either way
        quiet = self.v() < 2 and not stdout
        if quiet:
            os.popen(rcmd).read()
        else:
            self.progLog('#RCMD', 'Knitting %s...' % (rmdfile))
            os.system(rcmd)
        ### ~ [2] Check output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not rje.exists(outfile):
            # Give pandoc/library hints before failing: these are the two common causes.
            self.printLog(
                '#SYS',
                'If pandoc error, try setting global variable: export RSTUDIO_PANDOC=/Applications/RStudio.app/Contents/MacOS/pandoc'
            )
            self.printLog(
                '#SYS',
                'If no pandoc error, check that required libraries in %s are installed' % rmdfile)
            raise IOError('%s not created' % outfile)
        self.printLog('#RCMD', '%s generated from %s' % (outfile, rmdfile))
        return True
    except:
        self.errorLog('%s.rmdKnit error: check R installation' % self.prog())
        return False
def exonerate(self,qryfas, genome, model,exonerate='exonerate',bestn=0):
    '''
    Runs exonerate and parses output into lists for processing.
    >> qryfas:str = query fasta file
    >> genome:str = target genome file
    >> model:str = exonerate model (passed as --model; also used in output filename)
    >> exonerate:str ['exonerate'] = exonerate executable
    >> bestn:int [0] = --bestn setting (omitted when 0)
    << query_dic:dict =
    { query: {'gff':[outputlines], 'cigar':[outputlines], 'alignment':[outputlines],
              'vulgar':[[headerlist], {header:value}, {header:value}, ...] } }
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        EXFILE = None
        exfile = '%s.%s' % (self.baseFile(),model)  # Output file, used in memsaver mode
        query_dic = {}
        # Field names for parsed vulgar lines (the trailing triplets are left as one field).
        header_list = ['query_id', 'query_start', 'query_end', 'query_strand', 'target_id', 'target_start', 'target_end', 'target_strand', 'score', '<label, query_length, target_length> triplets']
        excmd = [exonerate, qryfas, genome, '--showtargetgff', '--showcigar']
        if model: excmd += ['--model', model]
        if bestn: excmd += ['--bestn', '%d' % bestn]
        if self.getStrLC('ExOpt'): excmd += string.split(self.getStr('ExOpt'))
        self.printLog('#RUN',string.join(excmd))
        extext = []
        if self.getBool('MemSaver'):
            # Memsaver: run exonerate to file and stream it back, rather than holding output in memory.
            gzfile = '%s.gz' % exfile
            if rje.exists(gzfile): self.gUnzip(gzfile)
            if rje.exists(exfile) and not self.force(): self.printLog('#EXFILE','Found %s (force=F). Assuming complete.' % exfile)
            else:
                rje.backup(self,exfile)
                self.printLog('#SAVER','memsaver=T: Exonerate output directed to %s.' % exfile)
                EXFILE = open(exfile,'w')
                if subprocess.call(excmd, stdout=EXFILE): raise IOError('Exonerate call did not complete!')
                EXFILE.close()
                self.printLog('#EXFILE','%s generated.' % exfile)
            EXFILE = open(exfile,'r')
        else:
            extext = Popen(excmd, stdout=PIPE).stdout.readlines()
        ### ~ [2] Parse output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        output_format = ''      # Current section being parsed: '', 'alignment', 'gff', 'vulgar' or 'cigar'
        while extext or EXFILE:
            #line = process.stdout.readline().rstrip()
            if EXFILE:
                line = EXFILE.readline()
                if not line: break
                line = rje.chomp(line)
            else: line = rje.chomp(extext.pop(0))
            if line:
                if line.startswith(' Query:'):
                    # Current query: everything below is filed under this key until the next Query line.
                    query = line.split(':', 1)[1].split(' ')[1]
                    #for q in rje.sortKeys(query_dic):
                    #    self.bugPrint('%s: %s' % (q,rje.sortKeys(query_dic[q])))
                    #self.debug(query)
                if line == 'C4 Alignment:':
                    output_format = 'alignment'
                elif line == '# --- START OF GFF DUMP ---':
                    output_format = 'gff'
                elif line.startswith('vulgar:'):
                    output_format = 'vulgar'
                    fields = line.split(' ', 10)[1:]
                    # NOTE(review): assumes query is already a key of query_dic (set up by earlier
                    # alignment lines); a vulgar line for an unseen query would raise KeyError - confirm.
                    if output_format in query_dic[query]:
                        query_dic[query][output_format].append({})
                    else:
                        query_dic[query][output_format] = [header_list, {}]
                    for header, field in zip(header_list, fields):
                        query_dic[query][output_format][-1][header] = field
                    #self.debug(query_dic[query][output_format])
                elif line.startswith('cigar:'):
                    output_format = 'cigar'
                    if output_format in query_dic[query]:
                        query_dic[query][output_format].append(line.replace('cigar: ', ''))
                    else:
                        query_dic[query][output_format] = [line.replace('cigar: ', '')]
                elif line == '------------' or line.startswith('Command line:') or line.startswith('Hostname:') or line == '# --- END OF GFF DUMP ---' or line == '#' or line.startswith('-- completed exonerate analysis'):
                    pass    # Structural/separator lines: ignored
                elif output_format:
                    # Any other content line is appended to the current section for the current query.
                    if query in query_dic:
                        if output_format in query_dic[query]:
                            query_dic[query][output_format].append(line)
                        else:
                            query_dic[query][output_format] = [line]
                    else:
                        query_dic[query] = {output_format:[line]}
                #elif process.poll() is not None:
                #    break
            elif output_format == 'alignment':
                # Blank lines are kept inside alignment blocks only.
                try: query_dic[query][output_format].append(line)
                except: pass
            self.vPrint(line,v=1)
        ### ~ [3] Tidy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if EXFILE:
            EXFILE.close()
            if self.getBool('Cleanup'):
                os.unlink(exfile)
                self.printLog('#CLEAN','%s deleted.' % exfile)
            elif self.getBool('GZip'): self.gZip(exfile)
        return query_dic
    except: self.errorLog('%s.exonerate error' % self.prog()); raise
def multiHAQ(self,secondrun=False):     ### Executes main HAQESAC runs
    '''
    Executes main HAQESAC runs.
    >> secondrun:bool [False] = whether this is the second (manual/chaser) pass.
    Runs HAQESAC per query sequence in HaqDir, skipping completed/pickled runs, then
    (on the first pass with MultiHAQ) recurses once with secondrun=True.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        finalrun = secondrun == self.opt['MultiHAQ']    # Whether this is the manual HAQESAC phase
        qryacc = self.obj['SeqList'].accList()          # Full list of Query accession numbers
        processed = []                                  # List of processed sequence accession numbers
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >=0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
            logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True)   # Only used by commented-out check in [1d]
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
            if not os.path.exists(infile): self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile: self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile: self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            inseqx = rje_seq.SeqCount(self,infile)
            if inseqx < 2: self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile); tm = 0
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                time.sleep(60*tm); tm += 1      # Sleep time grows each cycle (first cycle is instant)
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                if not pickled:
                    try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                    except:
                        # Fixed: format string has one %s placeholder, so only acc is supplied.
                        # (Previously '% (acc,tm)' raised TypeError on this path.)
                        self.printLog('#PICKLE','No %s pickle.' % acc)
                        self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                runhaqesac = False
            #if not finalrun or self.opt['Force'] or rje.isYounger(logfile,nsfile) != logfile:
            if runhaqesac:
                haqcmd = ['ini=haqesac.ini','seqin=%s.fas' % acc, 'query=%s' % acc, 'basefile=%s' % acc, 'newlog=F']
                self.printLog('#HAQ','Running HAQESAC for %s - will have own log etc.' % seq.shortName(),log=False)
                os.chdir(self.info['HaqDir'])
                info = haqesac.makeInfo()
                haqcmd = rje.getCmdList(haqcmd,info=info)
                out = rje.Out(cmd_list=haqcmd)  # Sets up Out object for controlling output to screen
                out.printIntro(info)            # Prints intro text using details from Info object
                haqlog = rje.setLog(info,out,haqcmd)    # Sets up Log object for controlling log file output
                try: haqesac.HAQESAC(log=haqlog, cmd_list=haqcmd).run(setobjects=True)
                except:
                    os.chdir(self.info['RunPath'])
                    if self.i() >= 0 and rje.yesNo('Problem with %s HAQESAC run. Abort?' % seq.shortName()): raise KeyboardInterrupt
                os.chdir(self.info['RunPath'])
            if finalrun: self.printLog('#HAQ','HAQESAC final round run for %s' % seq.shortName())
            else: self.printLog('#HAQ','HAQESAC first round run for %s' % seq.shortName())
            ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not self.opt['ScreenQry'] or not finalrun: continue
            qacclist = []
            for qacc in rje_seq.SeqList(self.log,['seqin=%s' % infile,'autoload=T','autofilter=F']).accList():
                if qacc in qryacc and qacc != acc: qacclist.append(qacc)
                if qacc in qryacc and qacc not in processed: processed.append(qacc)
            self.printLog('#QRY','%d other queries found in %s: [%s]' % (len(qacclist),infile,string.join(qacclist,'; ')))
            self.printLog('#QRY','%d of %d queries processed' % (len(processed),self.seqNum()))
        ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not finalrun: self.printLog('#MULTI','Executing second round of multiHAQ'); self.multiHAQ(True)
    except: self.errorLog('Major problem with MultiHAQ.multiHAQ',quitchoice=True)
def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
    '''
    Uses SLiMFarmer to farm out the HAQESAC runs.
    Farms the haqesac.bat batch file from HaqDir (once per round) and returns True.
    << True on completion; None when an exception was handled by errorLog.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = os.path.abspath(rje.makePath('%shaqesac.bat' % self.info['HaqDir'],wholepath=True))
        self.printLog('#FARM',batfile)
        if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
        farmcmd = ['subjobs=%s' % batfile,'farm=batch','qsub=F','i=-1','runpath=%s' % os.path.abspath(self.info['HaqDir'])]
        if self.opt['MultiHAQ']: haqfarm = ['First round','Second round']
        else: haqfarm = ['Complete run']
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for farmrun in haqfarm:
            self.printLog('#CHDIR','Changing directory for %s farming: %s' % (farmrun,self.info['HaqDir']))
            os.chdir(self.info['HaqDir'])
            farmer = slimfarmer.SLiMFarmer(self.log,self.cmd_list+farmcmd)
            farmer.slimFarm()
            os.chdir(self.info['RunPath'])
            self.printLog('#CHDIR','Changed directory post-farming: %s' % self.info['RunPath'])
            self.printLog('#FARM','HAQESAC %s farming complete.' % farmrun)
        return True
        #!# Add identifying and skipping of partial runs.
        # NOTE(review): everything below is unreachable (after return True) and references
        # undefined names (finalrun, processed, secondrun) - looks like draft code copied from
        # multiHAQ. Confirm intent before enabling.
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (self.opt['AutoSkip'] or (self.i() >=0 and rje.yesNo('%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                self.printLog('#SKIP','%s already covered by previous HAQESAC: Skipped' % seq.shortName()); continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
            logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'],acc),wholepath=True)
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],acc),wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'],acc),wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'],acc),wholepath=True)
            if not os.path.exists(infile): self.printLog('#SKIP','%s input file %s not found: Skipped' % (seq.shortName(),infile)); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkzfile,infile) == pkzfile: self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(pkfile,infile) == pkfile: self.printLog('#SKIP','%s run detected: Skipped' % seq.shortName()); continue
            inseqx = rje_seq.SeqCount(self,infile)
            if inseqx < 2: self.printLog('#SKIP','Only one sequence found in %s: Skipped' % (infile)); continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile); tm = 0
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog('#WAIT','No %s pickle. Sleeping for %d min.' % (acc,tm))
                time.sleep(60*tm); tm += 1
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                if not pickled:
                    try: rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                    except:
                        # NOTE(review): one %s placeholder with a two-element tuple would raise
                        # TypeError if this (currently dead) path were ever reached.
                        self.printLog('#PICKLE','No %s pickle.' % (acc,tm))
                        self.printLog('\r#MULTI','Exiting multiHAQ "Chaser" run.'); return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'],acc),wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog('#SKIP','Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                runhaqesac = False
    except:
        os.chdir(self.info['RunPath'])
        self.errorLog('Major problem with MultiHAQ.farmHAQ',quitchoice=True)
def farmHAQ(self):  ### Uses SLiMFarmer to farm out the HAQESAC runs
    '''
    Uses SLiMFarmer to farm out the HAQESAC runs.
    Farms the haqesac.bat batch file from HaqDir (once per round) and returns True.
    << True on completion; None when an exception was handled by errorLog.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batfile = os.path.abspath(
            rje.makePath('%shaqesac.bat' % self.info['HaqDir'], wholepath=True))
        self.printLog('#FARM', batfile)
        if not rje.exists(batfile): raise IOError('Cannot find %s' % batfile)
        farmcmd = [
            'subjobs=%s' % batfile, 'farm=batch', 'qsub=F', 'i=-1',
            'runpath=%s' % os.path.abspath(self.info['HaqDir'])
        ]
        if self.opt['MultiHAQ']:
            haqfarm = ['First round', 'Second round']
        else:
            haqfarm = ['Complete run']
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for farmrun in haqfarm:
            self.printLog(
                '#CHDIR',
                'Changing directory for %s farming: %s' % (farmrun, self.info['HaqDir']))
            os.chdir(self.info['HaqDir'])
            farmer = slimfarmer.SLiMFarmer(self.log, self.cmd_list + farmcmd)
            farmer.slimFarm()
            os.chdir(self.info['RunPath'])
            self.printLog(
                '#CHDIR',
                'Changed directory post-farming: %s' % self.info['RunPath'])
            self.printLog('#FARM', 'HAQESAC %s farming complete.' % farmrun)
        return True
        #!# Add identifying and skipping of partial runs.
        # NOTE(review): everything below is unreachable (after return True) and references
        # undefined names (finalrun, processed, secondrun) - draft code copied from multiHAQ.
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (
                    self.opt['AutoSkip'] or
                    (self.i() >= 0 and rje.yesNo(
                        '%s already covered by previous HAQESAC. Skip?' % seq.shortName()))):
                self.printLog(
                    '#SKIP',
                    '%s already covered by previous HAQESAC: Skipped' % seq.shortName())
                continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
            logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'], acc), wholepath=True)
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], acc), wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'], acc), wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'], acc), wholepath=True)
            if not os.path.exists(infile):
                self.printLog(
                    '#SKIP',
                    '%s input file %s not found: Skipped' % (seq.shortName(), infile))
                continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(
                    pkzfile, infile) == pkzfile:
                self.printLog('#SKIP', '%s run detected: Skipped' % seq.shortName())
                continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(
                    pkfile, infile) == pkfile:
                self.printLog('#SKIP', '%s run detected: Skipped' % seq.shortName())
                continue
            inseqx = rje_seq.SeqCount(self, infile)
            if inseqx < 2:
                self.printLog('#SKIP', 'Only one sequence found in %s: Skipped' % (infile))
                continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
            tm = 0
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog('#WAIT', 'No %s pickle. Sleeping for %d min.' % (acc, tm))
                time.sleep(60 * tm)
                tm += 1
                pickled = os.path.exists(pkfile) or os.path.exists('%s.gz' % pkfile)
                if not pickled:
                    try:
                        rje.choice('Press <ENTER> to try again, or <CTRL+C> to Quit')
                    except:
                        # NOTE(review): one %s placeholder with a two-element tuple would raise
                        # TypeError if this (currently dead) path were ever reached.
                        self.printLog('#PICKLE', 'No %s pickle.' % (acc, tm))
                        self.printLog('\r#MULTI', 'Exiting multiHAQ "Chaser" run.')
                        return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'], acc), wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog(
                    '#SKIP',
                    'Found evidence of completed run: %s (force=F). Skipping.' % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'], acc), wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog(
                    '#SKIP',
                    'Found evidence of completed run: %s (force=F). Skipping.' % ancfile)
                runhaqesac = False
    except:
        os.chdir(self.info['RunPath'])
        self.errorLog('Major problem with MultiHAQ.farmHAQ', quitchoice=True)
def run(self):  ### Main run method
    '''
    Main run method.
    Loads an SPF-style delimited table, normalises the seven taxonomic Level fields
    (unclassified/unassigned propagation, disambiguation of terms with multiple parents),
    saves the modified table, then compresses and saves one file per taxonomic level.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        infile = self.getStr('InFile')
        while not rje.exists(infile):
            infile = rje.choice('File "%s" not found. Input file name? (Blank to quit):' % infile)
            if not infile: return self.printLog('#QUIT','Execution terminated!')
        db = rje_db.Database(self.log,self.cmd_list)
        db.basefile(rje.baseFile(infile))
        sdb = db.addTable(infile,mainkeys='#',delimit='\t',name='SPF.Mod')
        # Single-letter prefix for each taxonomic level (kingdom..species).
        levels = {'Level_1':'k','Level_2':'p','Level_3':'c','Level_4':'o','Level_5':'f','Level_6':'g','Level_7':'s'}
        # Example rows:
        # k__Bacteria	p__Proteobacteria	c__Alphaproteobacteria	o__Rhodospirillales	f__Rhodospirillaceae	g__	s__	denovo44
        # Unassigned	unclassified	unclassified	unclassified	unclassified	unclassified	unclassified	denovo49
        ### ~ [1] Modify Text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        dupnames = []       # (taxon,taxon) pairs sharing the same name at consecutive levels
        parents = {}        # Parent for each term: {taxon:{parent:taxon-as-renamed}}
        renamed = []        # Taxa renamed due to multiple parents (used for numbering)
        ex = 0.0; etot = sdb.entryNum()
        # NOTE(review): ex/etot raises ZeroDivisionError if the table is empty - confirm inputs always have rows.
        for entry in sdb.entries():
            self.progLog('\r#SPF','Modifying SPF content: %.1f%%' % (ex/etot)); ex += 100.0
            taxon = ''      # Previous level's (possibly rewritten) value
            parent = ''     # Parent term for the current level
            #self.debug(entry)
            for lvl in ['Level_1','Level_2','Level_3','Level_4','Level_5','Level_6','Level_7']:
                entry[lvl] = string.replace(entry[lvl],'unidentified','unclassified')
                #entry[lvl] = string.replace(entry[lvl],'Incertae_sedis','Incertae_sedis-%s' % levels[lvl])
                null = '%s__' % levels[lvl]     # Empty value for this level, e.g. 'g__'
                #self.bugPrint(null)
                #self.bugPrint(entry[lvl])
                # Rewrite missing/unclassified values based on the parent level's status.
                if entry[lvl] in [null,'Unassigned','unclassified','%sunclassified' % null,'%sunidentified' % null,'%sunculturedfungus' % null,'%sIncertae_sedis' % null,'%sunclassified_sp.' % null]:
                    if not taxon or taxon.endswith('unclassified'): entry[lvl] = '%sunclassified' % null
                    #elif taxon.endswith('unassigned)'): entry[lvl] = '%s%s' % (null,taxon[3:])
                    #elif taxon.endswith('unassigned)'): entry[lvl] = '%s(%s;%s-unassigned)' % (null,string.split(taxon,'(')[1][:-1],levels[lvl])
                    elif taxon.endswith('unassigned)'): entry[lvl] = '%s%s;%s-unassigned)' % (null,taxon[3:][:-1],levels[lvl])
                    else: entry[lvl] = '%s%s(%s-unassigned)' % (null,taxon[3:],levels[lvl])
                # Disambiguate taxa seen with more than one parent by appending a counter.
                if entry[lvl] in parents:
                    #self.debug(parents[entry[lvl]])
                    if parent in parents[entry[lvl]]: entry[lvl] = parents[entry[lvl]][parent]
                    else:
                        self.bugPrint(entry[lvl])
                        self.bugPrint(parents[entry[lvl]])
                        renamed.append(entry[lvl])
                        newtax = '%s%d' % (entry[lvl],renamed.count(entry[lvl]))
                        self.warnLog('%s had multiple parents (%s & %s) -> %s' % (entry[lvl],string.join(parents[entry[lvl]],'|'),parent,newtax))
                        parents[newtax] = {parent:newtax}
                        parents[entry[lvl]][parent] = newtax
                        entry[lvl] = newtax
                        self.deBug(parents[entry[lvl]])
                elif parent: parents[entry[lvl]] = {parent:entry[lvl]}
                parent = entry[lvl]
                # Record name clashes between consecutive levels (same text after the 'x__' prefix).
                if entry[lvl][3:] == taxon[3:]:
                    if (entry[lvl],taxon) not in dupnames: dupnames.append((entry[lvl],taxon))
                #self.bugPrint(entry[lvl])
                taxon = entry[lvl]
            #self.debug(entry)
        #self.debug(parents)
        self.printLog('\r#SPF','Modifying SPF content complete.')
        dupnames.sort()
        for (dupA,dupB) in dupnames: self.warnLog('Duplicate taxa names: %s & %s' % (dupA,dupB))
        ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sdb.saveToFile(savefields=sdb.list['Fields'][1:])
        ### ~ [3] Compress to different taxonomic levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        compress = ['Level_1','Level_2','Level_3','Level_4','Level_5','Level_6','Level_7','#']
        dump = compress.pop(-1)
        rules = {'Observation Ids':'list',dump:'str'}
        sdb.dropField('Observation Ids')
        # Drop one level per cycle, compressing on the remaining levels each time.
        while compress:
            sdb.compress(compress,rules=rules,default='sum',best=[],joinchar='|')
            #if dump == '#':
            sdb.dropField(dump)
            sdb.saveToFile('%s.SPF.%s.%s.spf' % (rje.baseFile(infile),compress[-1],levels[compress[-1]]))
            dump = compress.pop(-1); rules[dump] = 'list'
        return
    except:
        self.errorLog(self.zen())
        raise   # Delete this if method error not terrible
def multiHAQ(self, secondrun=False):  ### Executes main HAQESAC runs
    '''
    Executes main HAQESAC runs.
    >> secondrun:bool [False] = whether this is the second (manual/chaser) pass.
    Runs HAQESAC per query sequence in HaqDir, skipping completed/pickled runs, then
    (on the first pass with MultiHAQ) recurses once with secondrun=True.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        finalrun = secondrun == self.opt[
            'MultiHAQ']  # Whether this is the manual HAQESAC phase
        qryacc = self.obj['SeqList'].accList(
        )  # Full list of Query accession numbers
        processed = []  # List of processed sequence accession numbers
        ### ~ [1] Perform HAQESAC runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in self.seqs():
            ## ~ [1a] Check AutoSkip ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = seq.info['AccNum']
            if finalrun and acc in processed and (
                    self.opt['AutoSkip'] or (self.i() >= 0 and rje.yesNo(
                        '%s already covered by previous HAQESAC. Skip?' %
                        seq.shortName()))):
                self.printLog(
                    '#SKIP', '%s already covered by previous HAQESAC: Skipped'
                    % seq.shortName())
                continue
            ## ~ [1b] Check Whether to run (re-runs and low sequence number) ~~~~~~~~~~~~~~~~~~ ##
            # logfile is only used by the commented-out check in [1d] below.
            logfile = rje.makePath('%s%s.log' % (self.info['HaqDir'], acc),
                                   wholepath=True)
            infile = rje.makePath('%s%s.fas' % (self.info['HaqDir'], acc),
                                  wholepath=True)
            pkfile = rje.makePath('%s%s.pickle' % (self.info['HaqDir'], acc),
                                  wholepath=True)
            pkzfile = rje.makePath('%s%s.pickle.gz' % (self.info['HaqDir'], acc),
                                   wholepath=True)
            if not os.path.exists(infile):
                self.printLog(
                    '#SKIP', '%s input file %s not found: Skipped' %
                    (seq.shortName(), infile))
                continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(
                    pkzfile, infile) == pkzfile:
                self.printLog('#SKIP',
                              '%s run detected: Skipped' % seq.shortName())
                continue
            if not finalrun and not self.opt['Force'] and rje.isYounger(
                    pkfile, infile) == pkfile:
                self.printLog('#SKIP',
                              '%s run detected: Skipped' % seq.shortName())
                continue
            inseqx = rje_seq.SeqCount(self, infile)
            if inseqx < 2:
                self.printLog(
                    '#SKIP', 'Only one sequence found in %s: Skipped' % (infile))
                continue
            ## ~ [1c] Pause if running in Chaser Mode and no Pickle ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pickled = os.path.exists(pkfile) or os.path.exists(
                '%s.gz' % pkfile)
            tm = 0
            while secondrun and self.opt['Chaser'] and not pickled:
                self.progLog(
                    '#WAIT', 'No %s pickle. Sleeping for %d min.' % (acc, tm))
                time.sleep(60 * tm)     # Sleep time grows each cycle (first cycle is instant)
                tm += 1
                pickled = os.path.exists(pkfile) or os.path.exists(
                    '%s.gz' % pkfile)
                if not pickled:
                    try:
                        rje.choice(
                            'Press <ENTER> to try again, or <CTRL+C> to Quit'
                        )
                    except:
                        # NOTE(review): one %s placeholder with a two-element tuple raises
                        # TypeError when this path is reached - should presumably be '% acc'.
                        self.printLog('#PICKLE', 'No %s pickle.' % (acc, tm))
                        self.printLog('\r#MULTI',
                                      'Exiting multiHAQ "Chaser" run.')
                        return
            ## ~ [1d] Run HAQESAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            runhaqesac = True
            pngfile = rje.makePath('%s%s.png' % (self.info['HaqDir'], acc),
                                   wholepath=True)
            if not self.force() and rje.exists(pngfile):
                self.printLog(
                    '#SKIP',
                    'Found evidence of completed run: %s (force=F). Skipping.'
                    % pngfile)
                runhaqesac = False
            ancfile = rje.makePath('%s%s.anc.fas' % (self.info['HaqDir'], acc),
                                   wholepath=True)
            if not self.force() and rje.exists(ancfile):
                self.printLog(
                    '#SKIP',
                    'Found evidence of completed run: %s (force=F). Skipping.'
                    % ancfile)
                runhaqesac = False
            #if not finalrun or self.opt['Force'] or rje.isYounger(logfile,nsfile) != logfile:
            if runhaqesac:
                haqcmd = [
                    'ini=haqesac.ini',
                    'seqin=%s.fas' % acc,
                    'query=%s' % acc,
                    'basefile=%s' % acc, 'newlog=F'
                ]
                self.printLog(
                    '#HAQ',
                    'Running HAQESAC for %s - will have own log etc.'
                    % seq.shortName(), log=False)
                os.chdir(self.info['HaqDir'])
                info = haqesac.makeInfo()
                haqcmd = rje.getCmdList(haqcmd, info=info)
                out = rje.Out(
                    cmd_list=haqcmd
                )  # Sets up Out object for controlling output to screen
                out.printIntro(
                    info
                )  # Prints intro text using details from Info object
                haqlog = rje.setLog(
                    info, out, haqcmd
                )  # Sets up Log object for controlling log file output
                try:
                    haqesac.HAQESAC(log=haqlog,
                                    cmd_list=haqcmd).run(setobjects=True)
                except:
                    os.chdir(self.info['RunPath'])
                    if self.i() >= 0 and rje.yesNo(
                            'Problem with %s HAQESAC run. Abort?'
                            % seq.shortName()):
                        raise KeyboardInterrupt
                os.chdir(self.info['RunPath'])
            if finalrun:
                self.printLog(
                    '#HAQ',
                    'HAQESAC final round run for %s' % seq.shortName())
            else:
                self.printLog(
                    '#HAQ',
                    'HAQESAC first round run for %s' % seq.shortName())
            ## ~ [1e] Update ScreenQry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not self.opt['ScreenQry'] or not finalrun: continue
            qacclist = []
            # Mark other queries found in this run's input as already processed.
            for qacc in rje_seq.SeqList(
                    self.log,
                    ['seqin=%s' % infile, 'autoload=T', 'autofilter=F'
                     ]).accList():
                if qacc in qryacc and qacc != acc: qacclist.append(qacc)
                if qacc in qryacc and qacc not in processed:
                    processed.append(qacc)
            self.printLog(
                '#QRY', '%d other queries found in %s: [%s]' %
                (len(qacclist), infile, string.join(qacclist, '; ')))
            self.printLog(
                '#QRY', '%d of %d queries processed' %
                (len(processed), self.seqNum()))
        ### ~ [2] MultiHAQ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not finalrun:
            self.printLog('#MULTI', 'Executing second round of multiHAQ')
            self.multiHAQ(True)
    except:
        self.errorLog('Major problem with MultiHAQ.multiHAQ', quitchoice=True)
def qsub(self):  ### Creates job and calls with qsub
    '''
    Creates a PBS job script and submits it with qsub.
    Builds the #PBS directive header and shell body from object settings, writes it to
    <Job>.job, submits (unless test mode), optionally waits for completion.
    << qsub job ID (str) on submit/fail; 0 if jobwait=True and job completed; False on test mode or error.
    '''
    try:
        ### Basics ###
        hr = int(self.stat['Walltime'])
        # NOTE(review): 'min' shadows the builtin min() for the rest of this method.
        min = int((0.5 + (self.stat['Walltime'] - hr) * 60.0))
        if self.opt['Report']: return self.report()
        jobstr = string.replace('%s.job' % self.info['Job'], '.job', '')
        jlist = [
            '#!/bin/bash',
            '#PBS -N %s' % jobstr,  #,'#PBS -q batch',
            '#PBS -l nodes=%d:ppn=%d' % (self.stat['Nodes'], self.stat['PPN']),
            '#PBS -l walltime=%d:%s:00' % (hr, rje.preZero(min, 60)),
            '#PBS -l vmem=%dgb' % self.getInt('VMem'),
            '#PBS -l mem=%dgb' % self.getInt('VMem'),
            ''
        ]  #10
        #if not os.popen('hostname').read().startswith('katana.science.unsw.edu.au'):
        #    jlist[-2] = '#PBS -l mem=%dgb' % self.getInt('VMem')
        if self.getBool('Monitor'):
            if self.getBool('JobWait'):
                # monitor and wait are mutually exclusive: wait wins.
                self.warnLog('Cannot run with wait=T and monitor=T: switched monitor=F')
                self.setBool({'Monitor': False})
            else:
                jlist += ['#PBS -k oed']
        if self.getStr('Email'):
            jlist += ['#PBS -M %s' % self.getStr('Email'), '#PBS -m ae']
            if self.getBool('MailStart'): jlist[-1] = '#PBS -m bae'
        jlist += [
            '### Define number of processors',
            'NPROCS=`wc -l < $PBS_NODEFILE`',
            'echo Running on host `hostname`',
            'echo Time is `date`',
            'echo Directory is `pwd`',  #2
            'echo This jobs runs on the following processors:',
            'echo `cat $PBS_NODEFILE`',
            '',  #5
            'echo This job has allocated $NPROCS cpus',
            ''
        ]
        self.printLog(
            '#PPN', '%d Node(s) requested: %d PPN.' %
            (self.getInt('Nodes'), self.getInt('PPN')))
        self.printLog('#VMEM', '%s GB VMem requested.' % (self.getStat('VMem')))
        if self.getBool('ModPurge'):
            jlist.append('module purge')
            self.printLog('#MOD', 'Modules purged (modpurge=T)')
        for mod in self.list['Modules']:
            if mod.lower() not in ['', 'none']: jlist.append('module add %s' % mod)
        if self.list['Modules']:
            self.printLog(
                '#MOD',
                'Modules added: %s' % string.join(self.list['Modules'], '; '))
        for pcall in self.list['PreCall']:
            self.printLog('#PCALL', pcall)
            jlist.append(pcall)
        #x#jlist = ['#!/bin/sh']  # New Iridis shell script method!
        ### Directory & Program ###
        jlist.append('cd %s' % self.info['QPath'])
        pcall = self.info['Program']
        if self.opt['RjePy']: pcall = 'python ' + self.info['PyPath'] + pcall
        jlist.append(pcall)
        ### Completion message
        jlist += ['', 'echo ---', 'qstat -f $PBS_JOBID', 'echo ---']
        jlist += ['', 'echo', 'echo Time is `date`', 'echo Job complete']
        ### Output and call ###
        job = '{0}.job'.format(jobstr)  #string.replace('%s.job' % self.info['Job'],'.job.job','.job')
        open(job, 'w').write(string.join(jlist, '\n'))
        self.printLog('#DIR', self.info['QPath'])
        self.printLog('#RUN', pcall)
        #qsub = 'qsub %s -S /bin/sh -l walltime=%d:%d:00,nodes=%d:ppn=2' % (job,hr,min,self.stat['Nodes'])
        qsub = 'qsub'
        if self.getBool('StartBash'): qsub += ' -S /bin/bash'
        if self.list['Depend']:
            # Chain the new job after the listed job IDs on the (possibly configured) host.
            qsub += ' -W depend=afterany'
            #for id in self.list['Depend']: qsub += ':%s.bio-server' % id
            myhost = self.getStr('DependHPC')
            if not self.getStrLC('DependHPC'):
                myhost = string.split(os.popen('hostname').read())[0]
            for id in self.list['Depend']:
                qsub += ':%s.%s' % (id, myhost)
        qsub += ' %s' % (job)
        self.printLog('#JOB', qsub)
        if self.test():
            self.printLog('#TEST', 'Test mode: will not place job in queue.')
            self.verbose(0, 1, string.join(['>>>>>'] + jlist + ['<<<<<', ''], '\n'))
            return False
        qrun = os.popen(qsub).read()
        self.printLog('#QSUB', qrun)
        qid = string.split(qrun, '.')[0]
        showstart = 'qstat -T'
        if os.popen('hostname').read().startswith('katana.science.unsw.edu.au'):
            showstart = 'showstart'
        self.printLog(
            '#SHOW',
            'Attempt %s %s in %s sec' % (showstart, qrun, self.stat['Pause']),
            log=False)
        time.sleep(self.stat['Pause'])
        for qline in os.popen('%s %s' % (showstart, qrun)):  #qid):
            if rje.chomp(qline): self.printLog('#INFO', qline, timeout=False)
        ### Wait for job to be completed
        if self.getBool('JobWait'):
            if self.getBool('Monitor'):
                raise ValueError('Cannot run with wait=T and monitor=T')
            self.printLog('#WAIT', 'Waiting for job {0} to finish'.format(qid))
            ofile = '{0}.o{1}'.format(
                string.replace('%s.job' % self.info['Job'], '.job', ''), qid)
            running = False
            while not rje.exists(ofile):
                # Count matching qstat lines; 0 => job gone from the queue.
                qstat = string.atoi(
                    os.popen("qstat | grep '^{0}' -c".format(qid)).read().split()[0])
                if not qstat:
                    self.printLog('#QSTAT',
                                  'Job {0} disappeared from qstat'.format(qid))
                    break
                elif not running:
                    try:
                        # NOTE(review): string.split() returns a list, so the comparison
                        # qstat == 'R' below can never be True - 'running' is never set; confirm intent.
                        qstat = string.split(
                            os.popen("qstat | grep '^{0}'".format(qid)).read().split()[4])
                        if qstat == 'R':
                            running = True
                            self.printLog('#QSTAT', 'Job {0} running...'.format(qid))
                    except:
                        pass
                time.sleep(max(1, self.getInt('Pause')))
            # Give the output file up to 300s to appear after the job leaves the queue.
            owait = 300
            while owait and not rje.exists(ofile):
                owait -= 1
                time.sleep(1)
            if rje.exists(ofile):
                if 'Job complete' in os.popen('tail -n 1 {0}'.format(ofile)).read():
                    self.printLog('#DONE', '{0} job ({1}) complete.'.format(jobstr, qid))
                    return 0
                else:
                    self.printLog('#FAIL',
                                  '{0} job ({1}) failed to finish.'.format(jobstr, qid))
                    return qid
            else:
                self.printLog(
                    '#FAIL', '{0} job ({1}) failed to generate {2}.'.format(
                        jobstr, qid, ofile))
                return qid
    except:
        self.errorLog('Error in qsub()')
        return False
def _positiveAndNegativePeptides(self):   ### Populates PosPep and NegPep Lists
    '''
    Populates PosPep and NegPep Lists by in-silico digestion of the Positives sequences.

    Digests each sequence in the `Positives` file with the protease given by `PepCut`
    (cut sites taken from the module-level `proteases` dict), then classifies each
    unique fragment as Positive (seen in the loaded `Peptides` list) or Negative.
    Also flags fragments that occur more than once across the full search database
    (self.obj['SeqList']) as redundant.

    Side effects:
      - Sets self.list['Peptides'], self.list['PosPep'], self.list['NegPep'],
        self.list['Redundant'].
      - Saves a 'Peptides' table to <basefile>.peptides.tdt and FASTA files
        <basefile>.positives.fas / <basefile>.negatives.fas.

    Returns the 'Peptides' database table on success, False if input files are
    missing, or None on error.
    '''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pfile = '%s.peptides.tdt' % self.basefile()
        # NOTE(review): previous-results reload is disabled; table is always rebuilt.
        #if rje.exists(pfile) and not self.getBool('Force'):
        #    try:
        #        pdb = self.db().addTable(pfile,['Peptide'],name='Peptides')
        #        pdb.dataFormat(reformat={'Len':'int','MWt':'num','Cys':'int','Ser':'int','Hyd':'num'})
        #        self.list['Peptides'] = self.list['PosPep'] = pdb.index('Pos')['Y']
        #        self.list['NegPep'] = pdb.index('Positive')['Neg']
        #        return pdb
        #    except: pass
        # Both input files are required; bail out quietly if either is absent.
        if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
        self.list['Peptides'] = peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
        seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
        pdb = self.db().addEmptyTable('Peptides',['Peptide','NR','Pos','Len','MWt','C','HPW','DENQ','M','Hyd'],['Peptide'])
        ### ~ [1] ~ Digest Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        protease = self.getStr('PepCut')
        self.list['PosPep'] = poslist = []; self.list['NegPep'] = neglist = []; sx = 0.0; stot = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
            sequence = seq.getSequence()
            # Mark every protease recognition site with ':' then split into fragments.
            # Each `cut` pattern presumably embeds ':' at the cleavage point — TODO confirm
            # against the `proteases` definition elsewhere in this module.
            for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
            frag = string.split(sequence,':')
            while '' in frag: frag.remove('')
            # Optionally drop the N-terminal fragment (nterm=F).
            if not self.getBool('NTerm'): frag = frag[1:]
            for pep in frag[0:]:
                if pep not in poslist: poslist.append(pep)
        self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
        ## ~ [1b] ~ Peptide Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Digest the full sequence set (self.obj['SeqList']) and record any fragment
        # seen more than once as redundant (non-unique across the database).
        allpep = []; self.list['Redundant'] = redundant = []
        sx = 0.0; stot = self.obj['SeqList'].seqNum()
        for seq in self.obj['SeqList'].seqs():
            self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (protease,sx/stot)); sx += 100.0
            sequence = seq.getSequence()
            for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
            for frag in string.split(sequence,':'):
                if frag in allpep: redundant.append(frag)
                else: allpep.append(frag)
        self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (protease,rje.iStr(stot)))
        ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        px = 0.0; ptot = len(poslist)
        # Iterate a copy (poslist[0:]) because fragments absent from peplist are
        # moved from poslist to neglist during the loop.
        for pep in poslist[0:]:
            self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
            entry = {'Peptide':pep,'MWt':rje_sequence.MWt(pep),'Hyd':rje_sequence.eisenbergHydropathy(pep,returnlist=False),
                     'Len':len(pep),'NR':'Y','Pos':'Y'}
            if pep not in peplist: poslist.remove(pep); neglist.append(pep); entry['Pos'] = 'N'
            if pep in redundant: entry['NR'] = 'N'
            # Count amino acid occurrences for each composition group (e.g. 'HPW'
            # sums His+Pro+Trp counts) and store under the group name.
            for aacomb in ['C','HPW','DENQ','M']:
                x = 0
                for a in aacomb: x += pep.count(a)
                entry[aacomb] = x
            pdb.addEntry(entry)
        self.printLog('\r#PEP','Processing positive peptides (%s) complete: %s Pos; %s Neg.' % (protease,rje.iLen(poslist),rje.iLen(neglist)))
        ### ~ [2] ~ Save Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pdb.saveToFile(pfile)
        # Write one-peptide-per-record FASTA files (name == sequence).
        POS = open('%s.positives.fas' % self.basefile(),'w'); NEG = open('%s.negatives.fas' % self.basefile(),'w')
        for pep in poslist: POS.write('>%s\n%s\n' % (pep,pep))
        for pep in neglist: NEG.write('>%s\n%s\n' % (pep,pep))
        POS.close(); self.printLog('#FAS','%s peptides output to %s.positives.fas' % (rje.iLen(poslist),self.basefile()))
        NEG.close(); self.printLog('#FAS','%s peptides output to %s.negatives.fas' % (rje.iLen(neglist),self.basefile()))
        return pdb
    except: self.errorLog('Problem during %s._positiveAndNegativePeptides().' % self); return None  # Setup failed
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.

    Loads the full/CDS/gene/prot sequence files into self.dict['SeqList'], then
    either reloads the compiled data table (<basefile>.data.tdt) or rebuilds it by
    loading and joining the component tables (function, expression, proteinkey,
    dbxref, protein, PNASmaintable, tmhmm, Feature, codon bias/NT tables).

    Returns True on success, False on error. May also return early (None) if the
    'full' SeqList already contains sequences — see NOTE below.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
        seqcmd = self.cmd_list + ['autoload=T','seqmode=file','seqindex=T']
        dfile = '%s.data.tdt' % self.basefile()
        ### ~ [2] Load Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['SeqList']['full'] = rje_seqlist.SeqList(self.log,seqcmd + ['seqmode=list'])
        self.debug(self.dict['SeqList']['full'].seqNum())
        # NOTE(review): if sequences were loaded from the command line (seqin=...),
        # setup returns here WITHOUT building the database tables below — confirm
        # this early exit is intended.
        if self.dict['SeqList']['full'].seqNum(): return
        self.dict['SeqList']['full'] = rje_seqlist.SeqList(self.log,seqcmd + ['seqin=%s.full.fas' % (self.basefile()),'seqmode=list'])
        for stype in ['CDS','gene','prot']:
            seq = self.dict['SeqList'][stype] = rje_seqlist.SeqList(self.log,seqcmd + ['seqin=%s.%s.fas' % (self.basefile(),stype)])
            seq.dict['SeqDict'] = {}
            # Index each sequence by the trailing '_'-separated token of its short
            # name (presumably the locus tag — TODO confirm naming convention).
            for s in seq.list['Seq']:
                (name,sequence) = seq.getSeq(s)
                seq.dict['SeqDict'][string.split(string.split(name)[0],'_')[-1]] = s
        ### ~ [3] Database Compilation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Reload the compiled table unless force=T; otherwise rebuild from parts.
        if rje.exists(dfile) and not self.getBool('Force'): db.addTable(dfile,name='data',mainkeys=['tag'])
        else:
            ## ~ [3a] ~ Load part tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            fdb = db.addTable('%s.function.tdt' % self.basefile(),name='function',mainkeys=['tag'])
            fdb.dropField('description')
            edb = db.addTable('%s.expression.tdt' % self.basefile(),name='expression',mainkeys=['key'])
            nx = 0
            edb.fillBlanks(blank='0',fillempty=True)
            # Convert 'na' expression values to '0.0' so later numeric use works.
            for ekey in rje.sortKeys(edb.data()):
                entry = edb.data(ekey)
                for field in edb.fields():
                    if entry[field] == 'na': entry[field] = '0.0'; nx += 1
            self.printLog('#TDT','Updated %s entries for expression table' % rje.iStr(nx))
            kdb = db.addTable('%s.proteinkey.tdt' % self.basefile(),name='proteinkey',mainkeys=['key'])
            xdb = db.addTable('%s.dbxref.tdt' % self.basefile(),name='dbxref',mainkeys=['tag'])
            xdb.dropField('gene')   # Pull from genbank instead
            #pdb = db.addTable('%s.cysweight.tdt' % self.basefile(),name='cysweight',mainkeys=['AccNum'])
            pdb = db.addTable('%s.protein.tdt' % self.basefile(),name='prodigis',mainkeys=['AccNum'])
            # Sum per-length peptide counts (fields '5'..'50') into cumulative
            # NRPep5 (len>=5) and NRPep7 (len>=7) totals.
            pdb.addField('NRPep5','NRPep',0); pdb.addField('NRPep7','NRPep5',0)
            for x in range(5,51):
                xfield = '%d' % x
                if xfield not in pdb.fields(): continue
                for entry in pdb.entries():
                    entry['NRPep5'] += int(entry[xfield])
                    if x >= 7: entry['NRPep7'] += int(entry[xfield])
            # Keep only the summary fields; drop everything else.
            for field in pdb.fields()[0:]:
                if field not in ['AccNum','File','ProtMWt','PepCount','LenExp','Len3','Len5','Len7Exp','Len37','NRPep','NRPep5','NRPep7','Cys0']: pdb.dropField(field)
            #pdb.renameField('AccNum','uniprot')
            #pdb.newKey(['uniprot'])
            pdb.renameField('AccNum','tag')
            pdb.newKey(['tag'])
            mdb = db.addTable('%s.PNASmaintable.tdt' % self.basefile(),name='main',mainkeys=['tag'])
            tdb = db.addTable('%s.tmhmm.tdt' % self.basefile(),name='TMHMM',mainkeys=['acc_num'])
            ## ~ [3b] ~ Load and process features table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gdb = db.addTable('%s.Feature.tdt' % self.basefile(),name='feature',mainkeys=['locus','feature','position'])
            gdb.dropEntriesDirect('feature',['CDS'],inverse=True)
            gdb.list['Fields'] += ['tag','start','end','gene','product']
            for entry in gdb.entries():
                # Parse start/end from the position string, swapping them for
                # 'complement(...)' (reverse-strand) features.
                pos = rje.matchExp('(\d+)\.\.(\d+)',entry['position'])
                if entry['position'][:4] == 'comp': entry['start'] = pos[1]; entry['end'] = pos[0]
                else: entry['start'] = pos[0]; entry['end'] = pos[1]
                # Extract locus_tag/gene/product from the details text; missing
                # values fall back to '-' (tag) or '' (gene/product).
                try: entry['tag'] = rje.matchExp('locus_tag="(\S+)"',entry['details'])[0]
                except: entry['tag'] = '-'
                try: entry['gene'] = rje.matchExp('gene="(\S+)"',entry['details'])[0]
                except: entry['gene'] = ''
                try: entry['product'] = string.split(string.split(entry['details'],'/product="')[1],'"')[0]
                except: entry['product'] = ''
            gdb.dropEntriesDirect('tag',['-'])
            gdb.newKey(['tag'])
            for field in ['locus','feature','position','details']: gdb.dropField(field)
            ## ~ [3c] ~ Codon Bias Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cfile = '%s.CDS.Bias.tdt' % self.basefile()
            # Generate the codon bias table with rje_codons if absent (or force=T).
            if not rje.exists(cfile) or self.getBool('Force'): rje_codons.Codons(self.log,self.cmd_list+['seqin=%s.CDS.fas' % self.basefile(),'backups=F']).run()
            bdb = db.addTable(cfile,name='Bias',mainkeys=['Seq'])
            bdb.renameField('Len','AALen')
            ndb = db.addTable('%s.CDS.NT.tdt' % self.basefile(),name='NT',mainkeys=['Seq'])
            ndb.renameField('Len','NTLen')
            # Rename RNA-style fields (U) to DNA-style (T).
            for field in ndb.fields():
                if field != string.replace(field,'U','T'): ndb.renameField(field,string.replace(field,'U','T'))
            ## ~ [3d] ~ Join tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            temp = db.joinTables(name='temp',join=[(edb,'key'),(kdb,'key')],newkey=['key'],cleanup=True,keeptable=True)
            #pfields = pdb.fields()[0:]
            #pfields.remove('uniprot')
            #temp2 = db.joinTables(name='temp2',join=[(xdb,'uniprot'),(pdb,'uniprot',pfields)],newkey=['tag'],cleanup=True,keeptable=True)
            #data = db.joinTables(name='data',join=[(temp2,'tag'),(fdb,'tag'),(gdb,'tag'),(bdb,'Seq'),(ndb,'Seq'),(temp,'tag'),(mdb,'tag')],newkey=['tag'],cleanup=True,keeptable=True)
            data = db.joinTables(name='data',join=[(pdb,'tag'),(xdb,'tag'),(fdb,'tag'),(tdb,'acc_num'),(gdb,'tag'),(bdb,'Seq'),(ndb,'Seq'),(temp,'tag'),(mdb,'tag')],newkey=['tag'],cleanup=True,keeptable=True)
            data.dropField('Seq')
            ## ~ [3e] ~ Fill out data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            data.fillBlanks(blank='0.0',fields=['eb','rb'],fillempty=True)
            #for entry in data.entries():
            #    if entry['tag'] not in self.dict['SeqList']['CDS'].dict['SeqDict']: entry['function'] = 'Non-CDS'
            data.fillBlanks(blank='Unassigned',fields=['function'],fillempty=True)
            data.fillBlanks()
            data.fillBlanks(blank='no mapping',fields=['description'],fillempty=True)
            data.saveToFile(dfile)
            # Save a reduced-field copy (cutdata) then restore the full field list.
            allfields = data.list['Fields'][0:]
            data.list['Fields'] = ["tag","File","PepCount","LenExp","Len3","Len5","Len7Exp","Len37","NRPep",'NRPep5','NRPep7',"Cys0",
                                   "pi","mass","function","new_function","tm","start","end","AALen","Bias",
                                   "WtBias","AbsBias",'NTLen','C','A','G','T','C|3','A|3','G|3','T|3',
                                   'eb_1.1','eb_1.2','eb_2.1','eb_2.2','rb_1.1','rb_1.2','rb_2.1','rb_2.2','eb','rb']
            data.saveToFile('%s.cutdata.tdt' % self.basefile())
            data.list['Fields'] = allfields
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def sgd2sp(self):   ### Reformats yeast sequence names and outputs new data for GOPHER
    '''
    Reformats yeast sequence names and outputs new data for GOPHER.

    Cross-references YEAST sequences against the 'XRef' table to map SGD
    accessions to UniProt entries, verifies the UniProt sequence matches, and
    rewrites each sequence name as 'UniprotID__AccNum [...]'. Successful
    SGD->UniProt mappings are recorded in self.dict['Rename'].

    Side effects: writes <basefile>.dat (UniProt extraction) when absent/forced,
    plus <basefile>.ygob.fas and <basefile>.yeast.fas when absent; sets
    self.list['YeastSeq'] to the accession list of renamed YEAST sequences.

    Raises on error (after logging) — deliberate while the method is unproven.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        inseq = self.obj['SeqList']
        uni = rje_uniprot.UniProt(self.log, self.cmd_list + ['datout=None'])
        xref = self.db('XRef')
        self.dict['Rename'] = {}
        ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        ufile = '%s.dat' % self.info['Basefile']
        # Reuse a previous extraction unless force=T; otherwise pull the entries
        # for every UniProt accession in the XRef index and save them.
        if os.path.exists(ufile) and not self.opt['Force']:
            uni.readUniProt(ufile, clear=True, cleardata=False)
        else:
            uni.readUniProt(clear=True, acclist=rje.sortKeys(xref.index('UniProt')), cleardata=False)
            uni.saveUniProt(ufile)
        ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniseq = {}     # AccNum -> UniProt sequence object
        for entry in uni.entries():
            seq = entry.obj['Sequence']
            uniseq[seq.info['AccNum']] = seq
        self.printLog('\r#USEQ', '%s UniProt Sequences extracted (%s Ensembl AccNum)' % (rje.iStr(len(uniseq)), rje.iStr(len(xref.index('UniProt')))))
        ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        yseq = []   # List of YEAST sequence objects
        (sx, stot) = (0.0, inseq.seqNum())
        for seq in inseq.seqs():
            self.progLog('\r#SEQ', 'Reformatting sequence names: %.2f%%' % (sx / stot))
            sx += 100.0
            if seq.info['SpecCode'] != 'YEAST': continue    # Only yeast sequences are renamed
            yseq.append(seq)
            sgd = seq.info['AccNum']
            newname = seq.info['Name']
            try:
                # Try each XRef entry for this SGD accession until one maps to a
                # UniProt entry whose sequence matches exactly (then break).
                for x in xref.indexEntries('EnsG', sgd):
                    acc = x['UniProt']
                    if acc:
                        newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (seq.info['Name'], x['Gene'], x['EnsG'], x['SGD'], acc)
                    else:
                        # No UniProt mapping for this xref entry: annotate and try the next.
                        newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (seq.info['Name'], x['Gene'], x['EnsG'], x['SGD'])
                        continue
                    if acc not in uniseq:
                        self.printLog('\r#UNIERR', 'Unable to find UniProt sequence %s (%s)' % (acc, sgd))
                        continue
                    useq = uniseq[acc]
                    if useq.info['Sequence'] != seq.info['Sequence']:
                        self.printLog('\r#SEQERR', '%s sequence <> %s sequence' % (sgd, acc))
                        continue
                    # Verified match: swap the leading name token for UniprotID__AccNum.
                    nsplit = string.split(newname)
                    nsplit[0] = '%s__%s' % (x['UniprotID'], acc)
                    newname = string.join(nsplit)
                    self.dict['Rename'][sgd] = acc
                    break
            except:
                self.errorLog('%s problem' % sgd)
            seq.info['Name'] = newname
            seq.extractDetails(gnspacc=True)
        self.printLog('\r#SEQ', 'Reformatting sequence names complete.')
        ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Only written when absent: these outputs are not refreshed by force=T.
        if not rje.exists('%s.ygob.fas' % self.info['Basefile']):
            inseq.saveFasta(seqfile='%s.ygob.fas' % self.info['Basefile'])
        if not rje.exists('%s.yeast.fas' % self.info['Basefile']):
            inseq.saveFasta(seqs=yseq, seqfile='%s.yeast.fas' % self.info['Basefile'])
        self.list['YeastSeq'] = inseq.accList(yseq)
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def _peptideProbabilities(self):    ### Read in peptides and positives and calculate probability of return
    '''
    Read in peptides and positives and calculate probability of return.

    Builds a 'PepProb' table giving, for each peptide size class, the count of
    digest fragments observed (Positive) versus not observed (Negative) in the
    loaded peptide list, and the resulting probability Positive/(Positive+Negative).
    Size classes are peptide length 1..MaxPepLen; with pepmwt=T, parallel classes
    keyed on molecular weight rounded up to the next 100 Da (stored as 100.0*bin)
    are also populated.

    Delegates to self._cysteinePeptideProbabilities() when cysweight=T.

    Returns the 'PepProb' table on success, False if input files are missing,
    or None on error. Saves the table to <basefile>.pep_prob.tdt.
    '''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getBool('CysWeight'): return self._cysteinePeptideProbabilities()
        # Ensure PosPep/NegPep/Redundant lists are populated first.
        self._positiveAndNegativePeptides()
        #return self.printLog('#NOPROB','Probability calculation temporarily suspended')
        pfile = '%s.pep_prob.tdt' % self.basefile()
        # Reload previous results unless force=T; fall through to rebuild on any error.
        if rje.exists(pfile) and not self.getBool('Force'):
            try:
                pdb = self.db().addTable(pfile,['PepSize'],name='PepProb')
                pdb.dataFormat(reformat={'PepSize':'num','Positive':'int','Negative':'int','Prob':'num'})
                # Length classes (< 100) are ints; MWt classes (>= 100) stay floats.
                for entry in pdb.entries():
                    if entry['PepSize'] < 100: entry['PepSize'] = int(entry['PepSize'])
                return pdb
            except: pass
        pdb = self.db().addEmptyTable('PepProb',['PepSize','Positive','Negative','Prob'],['PepSize'])
        if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
        ## ~ [0a] ~ Load Peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
        ## ~ [0b] ~ Load Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
        ### ~ [1] ~ Digest Positives and Update PepProb Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        protease = self.getStr('PepCut')
        ## ~ [1a] ~ Create new database entry to fill with data ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        edict = {}      # PepSize -> table entry (length classes and, optionally, MWt classes)
        for i in range(1,self.getInt('MaxPepLen')+1):
            edict[i] = pdb.addEntry({'PepSize':i,'Positive':0,'Negative':0,'Prob':1.0})
            if self.getBool('PepMWt'): edict[i*100.0] = pdb.addEntry({'PepSize':i*100.0,'Positive':0,'Negative':0,'Prob':1.0})
        ## ~ [1b] ~ For each recognition site of each protease, mark cuts with ":" ~~~~~~~~ ##
        poslist = []; neglist = []; sx = 0.0; stot = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
            sequence = seq.getSequence()
            for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
            frag = string.split(sequence,':')
            while '' in frag: frag.remove('')
            if not self.getBool('NTerm'): frag = frag[1:]
            for pep in frag[0:]:
                # With nrpep=T, skip fragments flagged redundant by
                # _positiveAndNegativePeptides().
                if self.getBool('NRPep') and pep in self.list['Redundant']: continue
                if pep not in poslist: poslist.append(pep)
        self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
        ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        px = 0.0; ptot = len(poslist)
        # Iterate a copy: unseen peptides are moved from poslist to neglist in-loop.
        for pep in poslist[0:]:
            self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
            # Lengths beyond MaxPepLen are pooled into the top class.
            plen = min(len(pep),self.getInt('MaxPepLen'))
            if pep in peplist: edict[plen]['Positive'] += 1
            else: edict[plen]['Negative'] += 1; poslist.remove(pep); neglist.append(pep)
            if self.getBool('PepMWt'):
                # MWt class: round up to next 100 Da, capped at MaxPepLen bins.
                pwt = 100.0 * min(int((rje_sequence.MWt(pep)+99)/100.0),self.getInt('MaxPepLen'))
                if pep in peplist: edict[pwt]['Positive'] += 1
                else: edict[pwt]['Negative'] += 1
        self.printLog('\r#PEP','Processing positive peptides (%s) complete.' % protease)
        ## ~ [1d] # Calculate peptide probabilities for protease combo ~~~~~~~~~~~~~~~~~~~~ ##
        # Empty classes (0/0) keep Prob = 0.0 via the ZeroDivisionError fallback.
        for entry in edict.values():
            try: entry['Prob'] = float(entry['Positive']) / float(entry['Positive']+entry['Negative'])
            except: entry['Prob'] = 0.0
        ### ~ [2] ~ Save File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pdb.saveToFile(pfile)
        return pdb
    except: self.errorLog('Problem during %s._peptideProbabilities().' % self); return None  # Setup failed
def run(self):  ### Main run method
    '''
    Main run method.

    Loads an SPF-format table (tab-delimited, taxonomy in Level_1..Level_7
    columns), normalises "unassigned"/"unclassified" taxon labels, resolves taxa
    that appear under multiple parents by renaming them, saves the modified
    table, then repeatedly compresses it to each taxonomic level, writing one
    .spf file per level.

    Returns None; re-raises after logging on error.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        infile = self.getStr('InFile')
        # Interactively re-prompt until the input file exists (blank aborts).
        while not rje.exists(infile):
            infile = rje.choice('File "%s" not found. Input file name? (Blank to quit):' % infile)
            if not infile: return self.printLog('#QUIT', 'Execution terminated!')
        db = rje_db.Database(self.log, self.cmd_list)
        db.basefile(rje.baseFile(infile))
        sdb = db.addTable(infile, mainkeys='#', delimit='\t', name='SPF.Mod')
        # Single-letter rank prefix for each taxonomy level column.
        levels = {
            'Level_1': 'k',
            'Level_2': 'p',
            'Level_3': 'c',
            'Level_4': 'o',
            'Level_5': 'f',
            'Level_6': 'g',
            'Level_7': 's'
        }
        # k__Bacteria	p__Proteobacteria	c__Alphaproteobacteria	o__Rhodospirillales	f__Rhodospirillaceae	g__	s__	denovo44
        # Unassigned	unclassified	unclassified	unclassified	unclassified	unclassified	unclassified	denovo49
        ### ~ [1] Modify Text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        dupnames = []   # (taxon, parent taxon) pairs sharing the same stripped name
        parents = {}    # Parent for each term
        renamed = []    # Taxa renamed due to multiple parents (counts suffix numbering)
        ex = 0.0
        etot = sdb.entryNum()
        for entry in sdb.entries():
            self.progLog('\r#SPF', 'Modifying SPF content: %.1f%%' % (ex / etot))
            ex += 100.0
            taxon = ''      # Taxon assigned at the previous (higher) level
            parent = ''     # Parent taxon for the current level
            #self.debug(entry)
            for lvl in ['Level_1', 'Level_2', 'Level_3', 'Level_4', 'Level_5', 'Level_6', 'Level_7']:
                entry[lvl] = string.replace(entry[lvl], 'unidentified', 'unclassified')
                #entry[lvl] = string.replace(entry[lvl],'Incertae_sedis','Incertae_sedis-%s' % levels[lvl])
                null = '%s__' % levels[lvl]     # Empty label for this rank, e.g. 'g__'
                #self.bugPrint(null)
                #self.bugPrint(entry[lvl])
                # Replace "empty" labels with an informative one derived from the
                # parent taxon (propagating '-unassigned' annotation down ranks).
                if entry[lvl] in [null, 'Unassigned', 'unclassified', '%sunclassified' % null, '%sunidentified' % null, '%sunculturedfungus' % null, '%sIncertae_sedis' % null, '%sunclassified_sp.' % null]:
                    if not taxon or taxon.endswith('unclassified'): entry[lvl] = '%sunclassified' % null
                    #elif taxon.endswith('unassigned)'): entry[lvl] = '%s%s' % (null,taxon[3:])
                    #elif taxon.endswith('unassigned)'): entry[lvl] = '%s(%s;%s-unassigned)' % (null,string.split(taxon,'(')[1][:-1],levels[lvl])
                    elif taxon.endswith('unassigned)'): entry[lvl] = '%s%s;%s-unassigned)' % (null, taxon[3:][:-1], levels[lvl])
                    else: entry[lvl] = '%s%s(%s-unassigned)' % (null, taxon[3:], levels[lvl])
                # A taxon name must have a single parent: reuse the mapping if this
                # parent was seen before, otherwise rename with a numeric suffix.
                if entry[lvl] in parents:
                    #self.debug(parents[entry[lvl]])
                    if parent in parents[entry[lvl]]: entry[lvl] = parents[entry[lvl]][parent]
                    else:
                        self.bugPrint(entry[lvl])
                        self.bugPrint(parents[entry[lvl]])
                        renamed.append(entry[lvl])
                        newtax = '%s%d' % (entry[lvl], renamed.count(entry[lvl]))
                        self.warnLog('%s had multiple parents (%s & %s) -> %s' % (entry[lvl], string.join(parents[entry[lvl]], '|'), parent, newtax))
                        parents[newtax] = {parent: newtax}
                        parents[entry[lvl]][parent] = newtax
                        entry[lvl] = newtax
                        self.deBug(parents[entry[lvl]])
                elif parent: parents[entry[lvl]] = {parent: entry[lvl]}
                parent = entry[lvl]
                # Record taxa whose rank-stripped names duplicate their parent's.
                if entry[lvl][3:] == taxon[3:]:
                    if (entry[lvl], taxon) not in dupnames: dupnames.append((entry[lvl], taxon))
                #self.bugPrint(entry[lvl])
                taxon = entry[lvl]
            #self.debug(entry)
        #self.debug(parents)
        self.printLog('\r#SPF', 'Modifying SPF content complete.')
        dupnames.sort()
        for (dupA, dupB) in dupnames: self.warnLog('Duplicate taxa names: %s & %s' % (dupA, dupB))
        ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sdb.saveToFile(savefields=sdb.list['Fields'][1:])   # Drop the '#' key field
        ### ~ [3] Compress to different taxonomic levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Pop one level per pass: compress the table to the remaining levels and
        # save, accumulating 'list' rules for the already-dropped levels.
        compress = ['Level_1', 'Level_2', 'Level_3', 'Level_4', 'Level_5', 'Level_6', 'Level_7', '#']
        dump = compress.pop(-1)
        rules = {'Observation Ids': 'list', dump: 'str'}
        sdb.dropField('Observation Ids')
        while compress:
            sdb.compress(compress, rules=rules, default='sum', best=[], joinchar='|')
            #if dump == '#': sdb.dropField(dump)
            sdb.saveToFile('%s.SPF.%s.%s.spf' % (rje.baseFile(infile), compress[-1], levels[compress[-1]]))
            dump = compress.pop(-1)
            rules[dump] = 'list'
        return
    except:
        self.errorLog(self.zen())
        raise   # Delete this if method error not terrible