def restAPI(self): ### Make a rest call and update RestIn with JobID if successful
    '''
    Make a rest call and update RestIn with JobID if successful.

    Submits the RestIn URL (with &rest=jobid appended) to get a job ID, then polls the
    server's check URL until the job leaves the Queued/Running states, doubling the wait
    between polls (capped at MaxRefresh).

    << Returns the JobID string on success; False on failure or exception.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Appending &rest=jobid makes the server return just the job identifier.
        restcall = '%s&rest=jobid' % self.getStr('RestIn')
        self.printLog('#REST',restcall)
        refresh = self.getInt('Refresh')    # Initial poll interval (seconds)
        ### ~ [1] Set job running ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        jobid = rje.chomp(urllib2.urlopen(restcall).read())
        self.printLog('#JOBID',jobid)
        ### ~ [2] Wait for completion ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        checkurl = '%scheck&jobid=%s' % (self.getStr('RestURL'),jobid)
        self.printLog('#CHECK',checkurl)
        check = rje.chomp(urllib2.urlopen(checkurl).read())
        # Poll with exponential backoff: double the interval each time, capped at MaxRefresh.
        while check in ['Queued','Running']:
            self.progLog('\r#RUN',check)
            time.sleep(refresh)
            refresh = min(self.getInt('MaxRefresh'),refresh*2)
            check = rje.chomp(urllib2.urlopen(checkurl).read())
        ### ~ [3] Return JobID if finished ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if check == 'Finished':
            self.printLog('\r#RUN','REST call complete: restin=%s' % jobid)
            # Subsequent processing uses the job ID in place of the original REST call URL.
            self.setStr({'RestIn':jobid})
            if not self.getStrLC('RestBase'): self.setStr({'RestBase':jobid})
            return jobid
        else: self.printLog('#FAIL','REST check error: %s' % check)
    except: self.errorLog('%s.restAPI error' % self)
    return False
def parseOMIM(self): ### Main parsing method
    '''
    Main parsing method.

    Streams the OMIM flat file named by self.info['Name'], tracking *RECORD*/*FIELD*
    markers, and stores per-gene records and coding mutations in self.dict['Records']
    (gene -> [record IDs]) and self.dict['Mutations'] (gene -> {subid:(disease,mutation)}).
    Calls self.saveMutations() when parsing completes.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Records'] = {}
        self.dict['Mutations'] = {}
        # Upper-case list of valid three-letter amino acid codes, used to filter mutations.
        aas = string.split(string.join(rje_sequence.aa_code_3.values()).upper())
        # oline doubles as loop sentinel: starts as the bool from os.path.exists, then holds
        # each line read; the while loop ends when readline() returns '' at EOF.
        oline = os.path.exists(self.info['Name'])
        # olen = total line count for the progress %; ox advances 100 per line consumed.
        (olen,ox,mx) = (len(open(self.info['Name'],'r').readlines()),0.0,0)
        OMIM = open(self.info['Name'],'r')
        ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        record = gene = subid = disease = mutation = ''
        av = False                  # Whether reading *FIELD* AV for mutation data
        while oline:
            oline = OMIM.readline()
            self.log.printLog('\r#OMIM','Processing OMIM: %.2f%% (%s genes)' % (ox/olen,rje.integerString(len(self.dict['Records']))),newline=False,log=False)
            ox += 100.0
            # Outside an AV field, only lines starting '*' (field markers) are of interest.
            if not av and oline[:1] != '*': continue
            line = rje.chomp(oline)
            while line[-1:] == ' ': line = line[:-1]    # Strip trailing spaces
            ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if line == '*RECORD*': (record,av) = ('',False)
            elif line == '*FIELD* NO':  # New record: next line holds the OMIM record number
                record = rje.chomp(OMIM.readline())
                gene = ''
                ox += 100.0
            ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* TI':  # New gene: last word of the title line is the symbol
                gene = string.split(rje.chomp(OMIM.readline()))[-1]
                subid = ''
                av = False
                ox += 100.0
            ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* AV': av = True    # Start of mutation records
            elif av and rje.matchExp('^(\.\d+)',line):  # New subid mutation record
                subid = rje.matchExp('^(\.\d+)',line)[0]
                disease = rje.chomp(OMIM.readline())
                ox += 100.0
                # Mutation line format: "GENE, XXXnnnYYY" (3-letter AA, position, 3-letter AA).
                try: mutation = rje.matchExp('^%s, (\D\D\D\d+\D\D\D)' % gene,rje.chomp(OMIM.readline()))[0]
                except: continue    # No mutation or not coding change
                ox += 100.0
                subaa = rje.matchExp('(\D\D\D)\d+(\D\D\D)',mutation)
                # Skip anything that is not a recognised AA -> AA substitution.
                if subaa[0] not in aas or subaa[1] not in aas: continue
                if gene not in self.dict['Records']: self.dict['Records'][gene] = [record]
                if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record]
                if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {}
                mx += 1
                self.dict['Mutations'][gene][subid] = (disease,mutation)
        ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        OMIM.close()
        self.log.printLog('\r#OMIM','Processing OMIM complete! (%s genes; %s mutations)' % (rje.integerString(len(self.dict['Records'])),rje.integerString(mx)))
        self.saveMutations()
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def setup(self): ### Main class setup method.
    '''
    Main class setup method.

    Loads protein descriptions (protdesc=FILE), then tries to reuse previous-run taxa
    and taxamap tables (taxbase=X, unless force=T); if both are found, setup is complete.
    Otherwise initialises the Taxonomy object for a fresh run.

    << Returns True if setup successful, else False.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # FIX: 'db' was used below without ever being defined, giving a NameError whenever
        # the previous-results branch ran. Bind the Database object first (cf. classify()).
        db = self.db()
        ## ~ [0a] Protein descriptions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['ProtDesc'] = {}
        if self.getStrLC('ProtDesc'):
            # Each line: protein identifier, whitespace, free-text description.
            for fline in open(self.getStr('ProtDesc'),'r').readlines():
                [prot,desc] = string.split(rje.chomp(fline),maxsplit=1)
                self.dict['ProtDesc'][prot] = desc
            #self.db().addTable(self.getStr('ProtDesc'),mainkeys=['protein'],datakeys='All',headers=['protein','description'],ignore=['#'],name='protdesc',expect=True)
        ## ~ [0b] Look for previous run results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        taxdb = self.db('taxa',add=True,forcecheck=True,mainkeys=['spcode'])
        if not taxdb and self.getStrLC('TaxBase') and not self.force():
            spfile = '%s.taxa.tdt' % self.getStr('TaxBase')
            taxdb = db.addTable(spfile,mainkeys=['spcode'],name='taxa',expect=False)
        mapdb = self.db('taxamap',add=True,forcecheck=True,mainkeys=['protein'])
        if not mapdb and self.getStrLC('TaxBase') and not self.force():
            spfile = '%s.taxamap.tdt' % self.getStr('TaxBase')
            mapdb = db.addTable(spfile,mainkeys=['protein'],name='taxamap',expect=False)
        if taxdb and mapdb:
            # Both previous-run tables present: format numeric fields and skip fresh setup.
            taxdb.dataFormat({'boot':'num'})
            mapdb.dataFormat({'boot':'num'})
            return True
        ## ~ [0c] Taxonomy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.obj['Taxonomy'] = rje_taxonomy.Taxonomy(self.log,self.cmd_list)
        self.obj['Taxonomy'].setup(force=False)
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
def setup(self): ### Main class setup method.
    '''
    Main class setup method.

    If RestIn is a URL, rewrites any cmd=file:FILE elements by inlining the file contents
    (with "&" converted to "+" to keep the URL intact); otherwise converts RestIn to a
    clean file path for direct parsing of an existing output file.

    << Returns True if setup successful, else False.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        restin = self.getStr('RestIn')
        ## ~ [1a] Check and modify URL if required ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if restin.startswith('http:'):
            #!# Check for rest URL and add if missing #!#
            parts = string.split(restin,'&')    # Split URL into cmd=value elements
            for pos in range(len(parts)):
                if '=' not in parts[pos]: continue
                (opt,value) = string.split(parts[pos],'=',1)
                if not value.startswith('file:'): continue
                # Conversion of cmd=file:FILE into cmd=CONTENT
                rfile = string.split(value,':',1)[1]
                #!# Consider adding max size constraint. Probably a URL size limit.
                if not rje.exists(rfile):
                    self.warnLog('File "%s" not found.' % rfile,quitchoice=True)
                    continue
                newpart = '%s=%s' % (opt,rje.chomp(string.join(open(rfile,'r').readlines(),'\\n')))
                if '&' in newpart:
                    # "&" inside file content would break the URL: swap for "+".
                    self.warnLog('%s "&" => "+" conversions for %s.' % (rje.iStr(newpart.count('&')),rfile))
                    newpart = string.replace(newpart,'&','+')
                parts[pos] = newpart
            self.setStr({'RestIn':string.join(parts,'&')})
        ## ~ [1b] Direct Parsing of output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            # Convert to file
            self.setStr({'RestIn':rje.makePath(restin,True)})
        return True     # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False    # Setup failed
def report(self): ### Run qstat to get job list then showstart on each job
    '''Run qstat to get job list then showstart on each job .'''
    try:
        ### ~ [1] ~ Read queued jobs from qstat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        queued = []     # Ordered (job ID, job name) pairs parsed from qstat output
        for qline in os.popen('qstat'):
            match = rje.matchExp('^(\d+)\.\S+\s+(\S+)',qline)
            if match: queued.append(match)
        ### ~ [2] ~ Report each job with its showstart info ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#QSTAT','%d jobs in queue.' % len(queued))
        for (qid,jobname) in queued:
            self.printLog('#JOB', '%s = %s' % (qid,jobname), timeout=False)
            for qline in os.popen('showstart %s' % qid):
                if rje.chomp(qline): self.printLog('#INFO', qline, timeout=False)
        self.printLog('#ZEN',rje_zen.Zen().wisdom())
    except: self.errorLog('QSub.report problem')
def classify(self): ### Generate summary tables for each protein class
    '''
    Generate summary tables for each protein class.

    For each file in self.list['Classify']: reads the first whitespace-separated field of
    each line as a protein ID, restricts a copy of the taxamap table to those proteins,
    and passes it to self.summaryScores() under the file's basename as class name.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        rankdb = self.db('taxamap')
        for cfile in self.list['Classify']:
            pclass = rje.baseFile(cfile, strip_path=True)   # Class named after file basename
            clist = []
            for fline in open(cfile, 'r').readlines():
                # FIX: blank lines made string.split(...)[0] raise IndexError, which the
                # outer except turned into an abort of the whole method. Guard first.
                fields = string.split(rje.chomp(fline), maxsplit=1)
                if fields and fields[0]: clist.append(fields[0])
            self.printLog('#CLASS', '%s "%s" class proteins read from %s' % (rje.iLen(clist), pclass, cfile))
            if not clist:
                self.warnLog('No proteins read from %s' % (cfile))
                continue
            # Copy the taxamap table and keep only this class's proteins.
            classdb = db.copyTable(rankdb, pclass)
            classdb.dropEntriesDirect('protein', clist, inverse=True)
            if not classdb.entries():
                self.warnLog('No "%s" proteins found in TaxaMap table' % (pclass))
                continue
            self.summaryScores(classdb, pclass, 'MinClass')
    except: self.errorLog('%s.classify() error' % self.prog())
def readAAProp(self, filename=None): ### Reads AA Property Matrix from file
    '''
    Reads AA Property Matrix from file.

    Populates self.alphabet (list of single-letter AA codes from the PROP header line) and
    self.prop ({property name: {aa: value}}) from a whitespace-formatted matrix file, then
    extends the alphabet with 'X'/'-' if missing and calls self.makePropDif().

    >> filename:str = Filename. If None, will use self.info['Name']
    '''
    try:
        ### <a> ### Load and read
        if filename: self.info['Name'] = filename
        else: filename = self.info['Name']
        readtxt = 'Reading AA Properties from %s...' % filename
        self.progLog('\r#AAPROP', readtxt)
        proplines = self.loadFromFile(filename, v=2)
        ### <b> ### Process
        self.alphabet = []
        self.prop = {}
        ## <i> ## Properties and alphabet
        for line in proplines:
            line = rje.chomp(line)
            if line.find('#') == 0:     # Comment line
                continue
            elif line.find('PROP') == 0:    # Header line - has amino acids
                # Strip the leading 'PROP' token, then peel off one AA letter at a time.
                line = rje.matchExp('^\S+(\s.+)', line)[0]
                while re.search('^\s+\S.*', line):
                    (aa, line) = rje.matchExp('^\s+(\S)(.*)', line)
                    self.alphabet.append(aa)
                readtxt += ' ...%s' % string.join(self.alphabet)
                self.progLog('\r#AAPROP', readtxt)
            elif re.search('^\S', line) and self.alphabet:  # Property line
                (aaproperty, line) = rje.matchExp('^(\S+)(\s.+)', line)
                readtxt += ' ...%s' % aaproperty
                self.progLog('\r#AAPROP', readtxt)
                self.prop[aaproperty] = {}
                # Values are read positionally: one single-character value per alphabet AA.
                for aa in self.alphabet:
                    (p, line) = rje.matchExp('^\s+(\S)(.*)', line)
                    self.prop[aaproperty][aa] = p
                #self.verbose(2,3,'...%s' % self.prop[property],0)
        readtxt += ' ...Done!'
        self.printLog('\r#AAPROP', readtxt)
    except IOError:
        self.log.errorLog(
            'AA Property matrix file %s missing?' % self.info['Name'], True)
        raise
    except:
        self.log.errorLog(
            'Major Problem reading AA Property matrix(%s)' % self.info['Name'], True)
        return
    # Ensure wildcard and gap characters are part of the alphabet before differencing.
    add = []
    if 'X' not in self.alphabet: add.append('X')
    if '-' not in self.alphabet: add.append('-')
    if add:
        add = self.alphabet + add
        self.useAlphabet(alphabet=add)
    self.makePropDif()
def qsub(self): ### Creates job and calls with qsub
    '''
    Creates job and calls with qsub.

    Builds a PBS batch script (shebang, #PBS resource directives, optional email/module/
    pre-call lines, cd + program call), writes it to <Job>.job, then submits it with qsub
    (adding -W depend=afterany dependencies if set). In test mode the script is shown but
    not submitted.

    << Returns the qsub job ID string; False on test mode or error.
    '''
    try:### Basics ###
        hr = int(self.stat['Walltime'])
        wmin = int((0.5+(self.stat['Walltime'] - hr)*60.0))
        # FIX: rounding could produce wmin == 60 (e.g. walltime=1.999 -> "1:60:00", an
        # invalid PBS walltime). Roll over into the hour field instead.
        if wmin >= 60: hr += 1; wmin -= 60
        if self.opt['Report']: return self.report()
        jlist = ['#!/bin/bash',
                 '#PBS -N %s' % string.replace('%s.job' % self.info['Job'],'.job',''),  #,'#PBS -q batch',
                 '#PBS -l nodes=%d:ppn=%d' % (self.stat['Nodes'],self.stat['PPN']),
                 # FIX: minutes are now zero-padded (rje.preZero, as in the newer qsub
                 # method) so walltime is always a valid HH:MM:SS specification.
                 '#PBS -l walltime=%d:%s:00' % (hr,rje.preZero(wmin,60)),
                 '#PBS -l vmem=%dgb' % self.getInt('VMem'),'']  #10
        if self.getStr('Email'):
            jlist += ['#PBS -M %s' % self.getStr('Email'),'#PBS -m ae']
            if self.getBool('MailStart'): jlist[-1] = '#PBS -m bae'     # Also mail at job start
        jlist += ['### Define number of processors','NPROCS=`wc -l < $PBS_NODEFILE`',
                  'echo Running on host `hostname`','echo Time is `date`','echo Directory is `pwd`', #2
                  'echo This jobs runs on the following processors:','echo `cat $PBS_NODEFILE`','', #5
                  'echo This job has allocated $NPROCS cpus','']
        self.printLog('#PPN','%d Node(s) requested: %d PPN.' % (self.getInt('Nodes'),self.getInt('PPN')))
        self.printLog('#VMEM','%s GB VMem requested.' % (self.getStat('VMem')))
        if self.getBool('ModPurge'):
            jlist.append('module purge')
            self.printLog('#MOD','Modules purged (modpurge=T)')
        for mod in self.list['Modules']:
            if mod.lower() not in ['','none']: jlist.append('module add %s' % mod)
        if self.list['Modules']: self.printLog('#MOD','Modules added: %s' % string.join(self.list['Modules'],'; '))
        for pcall in self.list['PreCall']:
            self.printLog('#PCALL',pcall)
            jlist.append(pcall)
        #x#jlist = ['#!/bin/sh']  # New Iridis shell script method!
        ### Directory & Program ###
        jlist.append('cd %s' % self.info['QPath'])
        pcall = self.info['Program']
        if self.opt['RjePy']: pcall = 'python ' + self.info['PyPath'] + pcall
        jlist.append(pcall)
        ### Output and call ###
        job = string.replace('%s.job' % self.info['Job'],'.job.job','.job')     # Avoid double .job suffix
        open(job,'w').write(string.join(jlist,'\n'))
        self.printLog('#DIR',self.info['QPath'])
        self.printLog('#RUN',jlist[-1])
        #qsub = 'qsub %s -S /bin/sh -l walltime=%d:%d:00,nodes=%d:ppn=2' % (job,hr,wmin,self.stat['Nodes'])
        qsub = 'qsub %s -S /bin/bash' % (job)
        if self.list['Depend']:
            qsub += ' -W depend=afterany'
            #for id in self.list['Depend']: qsub += ':%s.bio-server' % id
            for id in self.list['Depend']: qsub += ':%s.%s' % (id,self.getStr('DependHPC'))
        self.printLog('#JOB',qsub)
        if self.test():
            # Test mode: show the generated script without queueing anything.
            self.printLog('#TEST','Test mode: will not place job in queue.')
            self.verbose(0,1,string.join(['>>>>>']+jlist+['<<<<<',''],'\n'))
            return False
        qrun = os.popen(qsub).read()
        self.printLog('#QSUB',qrun)
        qid = string.split(qrun,'.')[0]     # Numeric job ID from "NNNN.host" qsub output
        self.printLog('#SHOW','Attempt showstart %s in %s sec' % (qid,self.stat['Pause']),log=False)
        time.sleep(self.stat['Pause'])
        for qline in os.popen('showstart %s' % qrun):   #qid):
            if rje.chomp(qline): self.printLog('#INFO', qline, timeout=False)
        return qid
    except: self.errorLog('Error in qsub()'); return False
def parseTMHMM(tmline): ### Returns a dictionary of TMHMM data from a TMHMM line
    '''
    Returns a dictionary of TMHMM data from a TMHMM line.
    The first whitespace-separated field is stored under 'Seq'; every remaining
    field is a key=value pair stored under its own key.
    '''
    fields = string.split(rje.chomp(tmline))
    parsed = {'Seq':fields[0]}
    for pair in fields[1:]:
        (pkey,pval) = string.split(pair,'=')
        parsed[pkey] = pval
    return parsed
def readAAProp(self,filename=None): ### Reads AA Property Matrix from file
    '''
    Reads AA Property Matrix from file.

    Populates self.alphabet (single-letter AA codes from the PROP header) and self.prop
    ({property: {aa: value}}), then extends the alphabet with 'X'/'-' if missing and calls
    self.makePropDif().

    >> filename:str = Filename. If None, will use self.info['Name']
    '''
    try:
        ### <a> ### Load and read
        if filename: self.info['Name'] = filename
        else: filename = self.info['Name']
        readtxt = 'Reading AA Properties from %s...' % filename
        self.progLog('\r#AAPROP',readtxt)
        proplines = self.loadFromFile(filename,v=2)
        ### <b> ### Process
        self.alphabet = []
        self.prop = {}
        ## <i> ## Properties and alphabet
        for line in proplines:
            line = rje.chomp(line)
            if line.find('#') == 0:     # Comment line
                continue
            elif line.find('PROP') == 0:    # Header line - has amino acids
                # Drop the 'PROP' token then peel single AA letters off one at a time.
                line = rje.matchExp('^\S+(\s.+)',line)[0]
                while re.search('^\s+\S.*',line):
                    (aa,line) = rje.matchExp('^\s+(\S)(.*)',line)
                    self.alphabet.append(aa)
                readtxt += ' ...%s' % string.join(self.alphabet)
                self.progLog('\r#AAPROP',readtxt)
            elif re.search('^\S',line) and self.alphabet:   # Property line
                (aaproperty,line) = rje.matchExp('^(\S+)(\s.+)',line)
                readtxt += ' ...%s' % aaproperty
                self.progLog('\r#AAPROP',readtxt)
                self.prop[aaproperty] = {}
                # One single-character value per alphabet AA, read positionally.
                for aa in self.alphabet:
                    (p,line) = rje.matchExp('^\s+(\S)(.*)',line)
                    self.prop[aaproperty][aa] = p
                #self.verbose(2,3,'...%s' % self.prop[property],0)
        readtxt += ' ...Done!'
        self.printLog('\r#AAPROP',readtxt)
    except IOError:
        self.log.errorLog('AA Property matrix file %s missing?' % self.info['Name'],True)
        raise
    except:
        self.log.errorLog('Major Problem reading AA Property matrix(%s)' % self.info['Name'],True)
        return
    # Ensure wildcard and gap characters are in the alphabet before differencing.
    add = []
    if 'X' not in self.alphabet: add.append('X')
    if '-' not in self.alphabet: add.append('-')
    if add:
        add = self.alphabet + add
        self.useAlphabet(alphabet=add)
    self.makePropDif()
def loadPPI(self): ### Load pairwise interaction data
    '''
    Load pairwise interaction data.
    Reads protein pairs (first two fields per line) from self.info['PPIFile'] and stores
    a symmetrical adjacency mapping in self.dict['PPI']. Returns False if the file is missing.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ppifile = self.info['PPIFile']
        if not rje.checkForFile(ppifile): return False
        ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ppidict = self.dict['PPI']
        for fline in open(ppifile,'r').readlines():
            fields = string.split(rje.chomp(fline))
            if len(fields) < 2: continue    # Skip lines without a protein pair
            (hub,spoke) = fields[:2]
            # Store the interaction in both directions.
            for (p1,p2) in [(hub,spoke),(spoke,hub)]:
                if p1 not in ppidict: ppidict[p1] = []
                if p2 not in ppidict[p1]: ppidict[p1].append(p2)
            self.progLog('\r#PPI','Loading PPI data: %s proteins' % rje.integerString(len(ppidict)))
        self.printLog('\r#PPI','Loaded PPI data for %s proteins' % rje.integerString(len(ppidict)))
    except:
        self.errorLog(rje_zen.Zen().wisdom()); raise    # Delete this if method error not terrible
def setup(self): ### Main class setup method.
    '''
    Main class setup method.

    Loads protein descriptions (protdesc=FILE), then tries to reuse previous-run taxa and
    taxamap tables (taxbase=X, unless force=T); if both are found, setup is complete.
    Otherwise initialises the Taxonomy object for a fresh run.

    << Returns True if setup successful, else False.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # FIX: 'db' was used below without ever being defined, giving a NameError whenever
        # the previous-results branch ran. Bind the Database object first (cf. classify()).
        db = self.db()
        ## ~ [0a] Protein descriptions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['ProtDesc'] = {}
        if self.getStrLC('ProtDesc'):
            # Each line: protein identifier, whitespace, free-text description.
            for fline in open(self.getStr('ProtDesc'), 'r').readlines():
                [prot, desc] = string.split(rje.chomp(fline), maxsplit=1)
                self.dict['ProtDesc'][prot] = desc
            #self.db().addTable(self.getStr('ProtDesc'),mainkeys=['protein'],datakeys='All',headers=['protein','description'],ignore=['#'],name='protdesc',expect=True)
        ## ~ [0b] Look for previous run results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        taxdb = self.db('taxa', add=True, forcecheck=True, mainkeys=['spcode'])
        if not taxdb and self.getStrLC('TaxBase') and not self.force():
            spfile = '%s.taxa.tdt' % self.getStr('TaxBase')
            taxdb = db.addTable(spfile, mainkeys=['spcode'], name='taxa', expect=False)
        mapdb = self.db('taxamap', add=True, forcecheck=True, mainkeys=['protein'])
        if not mapdb and self.getStrLC('TaxBase') and not self.force():
            spfile = '%s.taxamap.tdt' % self.getStr('TaxBase')
            mapdb = db.addTable(spfile, mainkeys=['protein'], name='taxamap', expect=False)
        if taxdb and mapdb:
            # Both previous-run tables present: format numeric fields and skip fresh setup.
            taxdb.dataFormat({'boot': 'num'})
            mapdb.dataFormat({'boot': 'num'})
            return True
        ## ~ [0c] Taxonomy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.obj['Taxonomy'] = rje_taxonomy.Taxonomy(
            self.log, self.cmd_list)
        self.obj['Taxonomy'].setup(force=False)
        return True  # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False  # Setup failed
def report(self): ### Run qstat to get job list then showstart on each job
    '''
    Run qstat to get job list then showstart on each job .
    Parses job IDs and names from `qstat` output, logs a summary, and reports the
    non-blank `showstart` output lines for each queued job.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        qidlist = []    # Job IDs in queue order
        qidjob = {}     # {job ID: job name}
        ### ~ [2] ~ Read in List of IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for qline in os.popen('qstat'):
            # Non-job lines (headers etc.) fail the match/unpack and are skipped.
            try:
                (qid,job) = rje.matchExp('^(\d+)\.\S+\s+(\S+)',qline)
                qidlist.append(qid)
                qidjob[qid] = job
            except: continue
        ### ~ [3] ~ Report ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#QSTAT','%d jobs in queue.' % len(qidlist))
        for qid in qidlist:
            self.printLog('#JOB', '%s = %s' % (qid,qidjob[qid]), timeout=False)
            for qline in os.popen('showstart %s' % qid):
                if rje.chomp(qline): self.printLog('#INFO', qline, timeout=False)
        self.printLog('#ZEN',rje_zen.Zen().wisdom())
    except: self.errorLog('QSub.report problem')
def mapEnsGO(self,spec='HUMAN',gokey='EnsGO',fixhead=True): ### Extracts EnsEMBL GO mapping data from a BioMart download
    '''
    Extracts EnsEMBL GO mapping data from a BioMart download.

    Looks for ens_<spec>.<GO|GO.BP|GO.CC|GO.MF>.tdt files under EnsGOPath and maps each
    gene to its GO terms via self.addGeneGO(), resolving alternative IDs through
    self.dict['AltID'] and collecting unknown terms as obsolete.

    >> spec:str ['HUMAN'] = species code used in the BioMart file names.
    >> gokey:str ['EnsGO'] = key under self.dict for the gene->GO mapping.
    >> fixhead:bool [True] = detect the gene/GO key columns from the file header.
    << Returns False if no mapping files are found.
    '''
    ### ~ [1] ~ Setup paths and files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if gokey not in self.dict: self.dict[gokey] = {}
    ensmap = []     # Existing BioMart download files to parse
    for gtype in ['GO','GO.BP','GO.CC','GO.MF']:
        gfile = self.info['EnsGOPath'] + 'ens_%s.%s.tdt' % (spec,gtype)
        if os.path.exists(gfile): ensmap.append(gfile)
    if not ensmap:
        self.errorLog('EnsEMBL-GO mapping file (%s) missing' % self.info['EnsGOPath'],printerror=False)
        return False
    ### ~ [2] ~ Parse Gene-GO Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mainkeys = ['Ensembl Gene ID','GO ID']      # Default key columns if fixhead=False
    for gfile in ensmap:
        if fixhead:
            # BioMart header names vary between exports: pick the gene ID column and
            # whichever GO accession column is present (BP/MF/CC variants included).
            headers = string.split(rje.chomp(open(gfile,'r').readlines()[0]),'\t')
            if 'Ensembl Gene ID' in headers: mainkeys = ['Ensembl Gene ID']
            else: mainkeys = headers[:1]
            if 'GO Term Accession' in headers: mainkeys.append('GO Term Accession')
            elif 'GO Term Accession (bp)' in headers: mainkeys.append('GO Term Accession (bp)')
            elif 'GO Term Accession (mf)' in headers: mainkeys.append('GO Term Accession (mf)')
            elif 'GO Term Accession (cc)' in headers: mainkeys.append('GO Term Accession (cc)')
            elif 'GO ID' in headers: mainkeys.append('GO ID')
            else: mainkeys.append(headers[2])   # Fall back on third column
            self.printLog('#HEAD','%s' % (string.join(mainkeys,' / ')))
        self.progLog('\r#GO','Mapping EnsEMBL GO...')
        ensdata = rje.dataDict(self,gfile,mainkeys)
        (mx,mtot) = (0.0,len(ensdata))
        obselete_go = []    # GO IDs not found in self.go() or AltID (sic: "obsolete")
        # NOTE(review): 'map' and 'id' below shadow Python builtins (pre-existing style).
        for map in ensdata:
            self.progLog('\r#GO','Mapping EnsEMBL GO: %.2f%%' % (mx/mtot)); mx += 100.0
            # dataDict keys are the joined key-column values: split back into gene + GO.
            try: (gene,go) = string.split(map)
            except: continue    # no GO!
            ## Update dictionaries ##
            if go[:3] == 'GO:': go = go[3:]     # Strip 'GO:' prefix to bare accession
            if go in self.go(): self.addGeneGO(gene,go,gokey)
            elif go in self.dict['AltID']:
                # Alternative ID: map the gene to every primary ID it resolves to.
                for id in self.dict['AltID'][go]: self.addGeneGO(gene,id,gokey)
            elif go not in obselete_go: obselete_go.append(go)
        self.printLog('\r#GO','Mapping EnsEMBL GO from %s complete.' % os.path.basename(gfile))
def classify(self): ### Generate summary tables for each protein class
    '''
    Generate summary tables for each protein class.

    For each file in self.list['Classify']: reads the first whitespace-separated field of
    each line as a protein ID, restricts a copy of the taxamap table to those proteins,
    and passes it to self.summaryScores() under the file's basename as class name.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        rankdb = self.db('taxamap')
        for cfile in self.list['Classify']:
            pclass = rje.baseFile(cfile,strip_path=True)    # Class named after file basename
            clist = []
            for fline in open(cfile,'r').readlines():
                # FIX: blank lines made string.split(...)[0] raise IndexError, which the
                # outer except turned into an abort of the whole method. Guard first.
                fields = string.split(rje.chomp(fline),maxsplit=1)
                if fields and fields[0]: clist.append(fields[0])
            self.printLog('#CLASS','%s "%s" class proteins read from %s' % (rje.iLen(clist),pclass,cfile))
            if not clist:
                self.warnLog('No proteins read from %s' % (cfile))
                continue
            # Copy the taxamap table and keep only this class's proteins.
            classdb = db.copyTable(rankdb,pclass)
            classdb.dropEntriesDirect('protein',clist,inverse=True)
            if not classdb.entries():
                self.warnLog('No "%s" proteins found in TaxaMap table' % (pclass))
                continue
            self.summaryScores(classdb,pclass,'MinClass')
    except: self.errorLog('%s.classify() error' % self.prog())
def loadPPI(self): ### Load pairwise interaction data
    '''
    Load pairwise interaction data.
    Reads protein pairs (first two fields per line) from self.info['PPIFile'] and stores a
    symmetrical adjacency mapping in self.dict['PPI']. Returns False if the file is missing.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not rje.checkForFile(self.info['PPIFile']): return False
        ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for line in open(self.info['PPIFile'], 'r').readlines():
            # Lines without at least two fields fail the unpack and are skipped.
            try: [pa, pb] = string.split(rje.chomp(line))[:2]
            except: continue
            # Store the interaction in both directions.
            for ppi in [(pa, pb), (pb, pa)]:
                if ppi[0] not in self.dict['PPI']: self.dict['PPI'][ppi[0]] = []
                if ppi[1] not in self.dict['PPI'][ppi[0]]: self.dict['PPI'][ppi[0]].append(ppi[1])
            self.progLog(
                '\r#PPI', 'Loading PPI data: %s proteins' % rje.integerString(len(self.dict['PPI'])))
        self.printLog(
            '\r#PPI', 'Loaded PPI data for %s proteins' % rje.integerString(len(self.dict['PPI'])))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise  # Delete this if method error not terrible
def readHMMPFamSearch(self,resfile=None,readaln=False): ### Reads HMM PFam Search Results into objects
    '''
    Reads HMM Search Results into objects.

    Parses an hmmpfam results file into one search object per PFam domain (results arrive
    per query sequence, so they are regrouped by domain in the `pfam` dict). Optionally
    (CleanRes=T) rewrites the file as a reduced version containing only sequences with hits.

    >> resfile:str = Results File (set as self.info['OutFile'])
    >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently always False !!!
    << Returns True on success, False on missing/invalid file or error.
    '''
    try:
        ### Setup ###
        if not resfile or not os.path.exists(resfile):
            self.log.errorLog('Results file "%s" missing!' % resfile,printerror=False)
            return False
        ## Make RegExp for starting next alignment ##
        # NOTE(review): re_hit is built but never used below (alignment reading disabled).
        re_hit = string.join(['^(\S+):','domain','(\d+)','of','(\d+),','from','(\d+)','to','(\d+):','score','(\S+),','E','=','(\S+)'],'\s+')
        ## Search dictionary as results come back per sequence, not per HMM! ##
        pfam = {}       # Dictionary of {PFam name:search}
        hitx = 0        # Total number of hits
        hitlist = []    # List of sequences processed from file (may or may not include zero hit sequences)
        ### Read in Search results ###
        # hmmpfam output starts with the program name: anything else is the wrong file type.
        if open(resfile,'r').readline().find('hmmpfam') != 0:
            self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile,printerror=False)
            if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)',default='N'):
                os.unlink(resfile)
                self.printLog('#DEL','Dodgy results file "%s" deleted.' % resfile)
            return False
        hitname = None  # Current query sequence name
        i = 0; hx = 0; seqx = 0     # hx = sequences read; seqx = original sequence count
        RESFILE = open(resfile,'r')
        #x#resline = self.loadFromFile(resfile,chomplines=True)
        #x#while i < len(resline):
        line = RESFILE.readline()
        # newres accumulates lines for the reduced (CleanRes) rewrite; newresout flags
        # whether the current sequence had any hits and should be written out.
        newres = [rje.chomp(line)]; newresout = True; newresfile = '%s.partial' % resfile
        if os.path.exists(newresfile): os.unlink(newresfile)
        while line:
            self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
            line = rje.chomp(line)
            #print line
            ## New Sequence ##
            if rje.matchExp('^Query sequence:\s+(\S+)',line):
                # Flush the previous sequence's lines to the reduced file if it had hits.
                if newres and newresout and self.opt['CleanRes']: open(newresfile,'a').write(string.join(newres,'\n'))
                newres = ['',line]; newresout = False
                hitname = rje.matchExp('^Query sequence:\s+(\S+)',line)[0]; hx += 1
                #x#if hitname not in hitlist: hitlist.append(hitname)
            ## One Line Data for hits ##
            elif line.find('Parsed for domains:') == 0:
                #x#i += 3 # Skip two complete lines
                # Consume the two header/divider lines, then read domain rows until the
                # table pattern stops matching ("[no hits above thresholds]" ends it).
                newres += [line,rje.chomp(RESFILE.readline()),rje.chomp(RESFILE.readline())]
                line = rje.chomp(RESFILE.readline()); newres.append(line)
                #Model    Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                #-------- ------- ----- -----    ----- -----      -----  -------
                #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
                # ... else ...
                # [no hits above thresholds]
                while rje.matchExp(string.join(['^(\S+)','\S+','(\d+)','(\d+)\D.+','(\S+)','(\S+)\s*$'],'\s+'),line):
                    newresout = True    # This sequence has at least one hit: keep it
                    (dom,start,end,score,eval) = rje.matchExp(string.join(['^(\S+)','\S+','(\d+)','(\d+)\D.+','(\S+)','(\S+)\s*$'],'\s+'),line)
                    if not pfam.has_key(dom):
                        pfam[dom] = self._addSearch()
                        pfam[dom].info['Name'] = dom
                    hit = pfam[dom]._addHit()
                    hit.info['Name'] = hitname
                    aln = hit._addAln()
                    aln.setStat({'SbjStart':string.atoi(start),'SbjEnd':string.atoi(end),'Expect':string.atof(eval),'BitScore':string.atof(score)})
                    hitx += 1
                    self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                    line = rje.chomp(RESFILE.readline()); newres.append(line)
            ## End of Protein ##
            elif line[:2] == '//': hitname = None; newres.append(line)
            # NOTE(review): '(%d)' below looks like it should be '(\d+)' — as written this
            # match likely never fires; the second matchExp does the real parse.
            elif rje.matchExp('End of rje_hmm reduced results file: (%d) sequences in original',line):
                seqx = string.atoi(rje.matchExp('End of rje_hmm reduced results file: (\d+) sequences in original',line)[0])
            elif newres: newres.append(line)
            #x#i += 1
            line = RESFILE.readline()
        # Flush the final sequence and, if cleaning, replace the original results file
        # with the reduced version plus a footer recording the original sequence count.
        if newres and newresout and self.opt['CleanRes']: open(newresfile,'a').write(string.join(newres,'\n'))
        if not seqx: seqx = hx
        if self.opt['CleanRes']:
            open(newresfile,'a').write(string.join(['','End of rje_hmm reduced results file: %d sequences in original' % seqx],'\n'))
            os.unlink(resfile)
            os.rename(newresfile,resfile)
            self.printLog('\r#RED','Results file %s replaced with reduced version (%s Hits only)' % (resfile,rje.integerString(hitx)))
        self.printLog('\r#RES','Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(seqx),rje.integerString(len(pfam)),rje.integerString(hitx)))
        return True
    except:
        self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
        return False
def qsub(self): ### Creates job and calls with qsub
    '''
    Creates job and calls with qsub. Returns qsub job ID or 0 if jobwait=True and job completed.

    Builds a PBS batch script (#PBS directives, optional monitor/email/module/pre-call
    lines, cd + program call, completion echo), writes it to <Job>.job and submits it.
    With jobwait=T, polls qstat until the job's output file appears and reports
    success/failure based on the trailing 'Job complete' marker.
    '''
    try:
        ### Basics ###
        hr = int(self.stat['Walltime'])
        # NOTE(review): rounding can still give min == 60 here (walltime=1.999 -> 1:60:00).
        min = int((0.5 + (self.stat['Walltime'] - hr) * 60.0))
        if self.opt['Report']: return self.report()
        jobstr = string.replace('%s.job' % self.info['Job'], '.job', '')
        jlist = [
            '#!/bin/bash',
            '#PBS -N %s' % jobstr,  #,'#PBS -q batch',
            '#PBS -l nodes=%d:ppn=%d' % (self.stat['Nodes'], self.stat['PPN']),
            '#PBS -l walltime=%d:%s:00' % (hr, rje.preZero(min, 60)),   # Zero-padded minutes
            '#PBS -l vmem=%dgb' % self.getInt('VMem'),
            '#PBS -l mem=%dgb' % self.getInt('VMem'), ''
        ]  #10
        #if not os.popen('hostname').read().startswith('katana.science.unsw.edu.au'):
        #    jlist[-2] = '#PBS -l mem=%dgb' % self.getInt('VMem')
        if self.getBool('Monitor'):
            # Monitoring streams output live (-k oed), incompatible with waiting for the job.
            if self.getBool('JobWait'):
                self.warnLog(
                    'Cannot run with wait=T and monitor=T: switched monitor=F'
                )
                self.setBool({'Monitor': False})
            else:
                jlist += ['#PBS -k oed']
        if self.getStr('Email'):
            jlist += ['#PBS -M %s' % self.getStr('Email'), '#PBS -m ae']
            if self.getBool('MailStart'): jlist[-1] = '#PBS -m bae'     # Also mail at job start
        jlist += [
            '### Define number of processors', 'NPROCS=`wc -l < $PBS_NODEFILE`',
            'echo Running on host `hostname`', 'echo Time is `date`', 'echo Directory is `pwd`',  #2
            'echo This jobs runs on the following processors:', 'echo `cat $PBS_NODEFILE`', '',  #5
            'echo This job has allocated $NPROCS cpus', ''
        ]
        self.printLog(
            '#PPN', '%d Node(s) requested: %d PPN.' % (self.getInt('Nodes'), self.getInt('PPN')))
        self.printLog('#VMEM', '%s GB VMem requested.' % (self.getStat('VMem')))
        if self.getBool('ModPurge'):
            jlist.append('module purge')
            self.printLog('#MOD', 'Modules purged (modpurge=T)')
        for mod in self.list['Modules']:
            if mod.lower() not in ['', 'none']: jlist.append('module add %s' % mod)
        if self.list['Modules']:
            self.printLog(
                '#MOD', 'Modules added: %s' % string.join(self.list['Modules'], '; '))
        for pcall in self.list['PreCall']:
            self.printLog('#PCALL', pcall)
            jlist.append(pcall)
        #x#jlist = ['#!/bin/sh']  # New Iridis shell script method!
        ### Directory & Program ###
        jlist.append('cd %s' % self.info['QPath'])
        pcall = self.info['Program']
        if self.opt['RjePy']: pcall = 'python ' + self.info['PyPath'] + pcall
        jlist.append(pcall)
        ### Completion message
        # 'Job complete' marker at the end is what the JobWait polling checks for below.
        jlist += ['', 'echo ---', 'qstat -f $PBS_JOBID', 'echo ---']
        jlist += ['', 'echo', 'echo Time is `date`', 'echo Job complete']
        ### Output and call ###
        job = '{0}.job'.format(jobstr)  #string.replace('%s.job' % self.info['Job'],'.job.job','.job')
        open(job, 'w').write(string.join(jlist, '\n'))
        self.printLog('#DIR', self.info['QPath'])
        self.printLog('#RUN', pcall)
        #qsub = 'qsub %s -S /bin/sh -l walltime=%d:%d:00,nodes=%d:ppn=2' % (job,hr,min,self.stat['Nodes'])
        qsub = 'qsub'
        if self.getBool('StartBash'): qsub += ' -S /bin/bash'
        if self.list['Depend']:
            qsub += ' -W depend=afterany'
            #for id in self.list['Depend']: qsub += ':%s.bio-server' % id
            myhost = self.getStr('DependHPC')
            if not self.getStrLC('DependHPC'): myhost = string.split(os.popen('hostname').read())[0]
            for id in self.list['Depend']: qsub += ':%s.%s' % (id, myhost)
        qsub += ' %s' % (job)
        self.printLog('#JOB', qsub)
        if self.test():
            # Test mode: show the generated script without queueing anything.
            self.printLog('#TEST', 'Test mode: will not place job in queue.')
            self.verbose(
                0, 1, string.join(['>>>>>'] + jlist + ['<<<<<', ''], '\n'))
            return False
        qrun = os.popen(qsub).read()
        self.printLog('#QSUB', qrun)
        qid = string.split(qrun, '.')[0]    # Numeric job ID from "NNNN.host" qsub output
        showstart = 'qstat -T'
        if os.popen('hostname').read().startswith(
                'katana.science.unsw.edu.au'):
            showstart = 'showstart'
        self.printLog('#SHOW',
                      'Attempt %s %s in %s sec' % (showstart, qrun, self.stat['Pause']),
                      log=False)
        time.sleep(self.stat['Pause'])
        for qline in os.popen('%s %s' % (showstart, qrun)):  #qid):
            if rje.chomp(qline): self.printLog('#INFO', qline, timeout=False)
        ### Wait for job to be completed
        if self.getBool('JobWait'):
            if self.getBool('Monitor'): raise ValueError('Cannot run with wait=T and monitor=T')
            self.printLog('#WAIT', 'Waiting for job {0} to finish'.format(qid))
            ofile = '{0}.o{1}'.format(
                string.replace('%s.job' % self.info['Job'], '.job', ''), qid)
            running = False
            # Poll until the PBS output file appears or the job leaves the queue.
            while not rje.exists(ofile):
                qstat = string.atoi(
                    os.popen("qstat | grep '^{0}' -c".format(
                        qid)).read().split()[0])
                if not qstat:
                    self.printLog(
                        '#QSTAT', 'Job {0} disappeared from qstat'.format(qid))
                    break
                elif not running:
                    try:
                        # NOTE(review): string.split() returns a LIST, so qstat == 'R' can
                        # never be True — the "running" log line appears unreachable.
                        qstat = string.split(
                            os.popen("qstat | grep '^{0}'".format(
                                qid)).read().split()[4])
                        if qstat == 'R':
                            running = True
                            self.printLog('#QSTAT', 'Job {0} running...'.format(qid))
                    except: pass
                time.sleep(max(1, self.getInt('Pause')))
            # Give the filesystem up to 300s for the output file to materialise.
            owait = 300
            while owait and not rje.exists(ofile):
                owait -= 1
                time.sleep(1)
            if rje.exists(ofile):
                if 'Job complete' in os.popen(
                        'tail -n 1 {0}'.format(ofile)).read():
                    self.printLog(
                        '#DONE', '{0} job ({1}) complete.'.format(jobstr, qid))
                    return 0
                else:
                    self.printLog(
                        '#FAIL', '{0} job ({1}) failed to finish.'.format(
                            jobstr, qid))
                    return qid
            else:
                self.printLog(
                    '#FAIL', '{0} job ({1}) failed to generate {2}.'.format(
                        jobstr, qid, ofile))
                return qid
        # NOTE(review): with jobwait=F no explicit return is visible here, so the method
        # returns None despite the docstring promising the job ID — confirm against the
        # full file and add 'return qid' if intended.
    except:
        self.errorLog('Error in qsub()')
        return False
def parseOMIM(self):    ### Main parsing method
    '''
    Main parsing method. Reads the OMIM flat file named by self.info['Name'] and populates:
    - self.dict['Records'] = {gene:[record numbers]}
    - self.dict['Mutations'] = {gene:{subid:(disease,mutation)}}
    Only single amino acid substitutions (XxxNNNXxx with valid 3-letter codes) are kept.
    Calls self.saveMutations() on completion. Re-raises on error after logging.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Records'] = {}
        self.dict['Mutations'] = {}
        # Upper-case list of valid 3-letter amino acid codes, used to filter mutation strings.
        aas = string.split(string.join(rje_sequence.aa_code_3.values()).upper())
        # oline doubles as loop sentinel: True if file exists, then holds each line read.
        oline = os.path.exists(self.info['Name'])
        # olen = total line count for progress %; ox counts 100 per line consumed; mx = mutation count.
        (olen,ox,mx) = (len(open(self.info['Name'],'r').readlines()),0.0,0)
        OMIM = open(self.info['Name'],'r')
        ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        record = gene = subid = disease = mutation = ''
        av = False      # Whether reading *FIELD* AV for mutation data
        while oline:
            oline = OMIM.readline()
            self.log.printLog('\r#OMIM','Processing OMIM: %.2f%% (%s genes)' % (ox/olen,rje.integerString(len(self.dict['Records']))),newline=False,log=False)
            ox += 100.0
            # Outside an AV block, only '*'-prefixed field markers are of interest.
            if not av and oline[:1] != '*': continue
            line = rje.chomp(oline)
            while line[-1:] == ' ': line = line[:-1]    # Strip trailing spaces
            ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if line == '*RECORD*': (record,av) = ('',False)
            elif line == '*FIELD* NO':  # New record: record number is on the following line
                record = rje.chomp(OMIM.readline())
                gene = ''
                ox += 100.0     # Account for the extra readline() in the progress counter
            ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* TI':  # New gene: symbol is last word of the following line
                gene = string.split(rje.chomp(OMIM.readline()))[-1]
                subid = ''
                av = False
                ox += 100.0
            ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* AV': av = True    # Start of mutation records
            elif av and rje.matchExp('^(\.\d+)',line):  # New subid mutation record, e.g. '.0001'
                subid = rje.matchExp('^(\.\d+)',line)[0]
                disease = rje.chomp(OMIM.readline())    # Next line names the disease
                ox += 100.0
                # Following line should be 'GENE, XxxNNNXxx'; skip entry if not a coding change.
                try: mutation = rje.matchExp('^%s, (\D\D\D\d+\D\D\D)' % gene,rje.chomp(OMIM.readline()))[0]
                except: continue    # No mutation or not coding change
                ox += 100.0
                subaa = rje.matchExp('(\D\D\D)\d+(\D\D\D)',mutation)
                # Reject anything that is not a recognised aa->aa substitution.
                if subaa[0] not in aas or subaa[1] not in aas: continue
                if gene not in self.dict['Records']: self.dict['Records'][gene] = [record]
                if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record]
                if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {}
                mx += 1
                self.dict['Mutations'][gene][subid] = (disease,mutation)
        ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        OMIM.close()
        self.log.printLog('\r#OMIM','Processing OMIM complete! (%s genes; %s mutations)' % (rje.integerString(len(self.dict['Records'])),rje.integerString(mx)))
        self.saveMutations()
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def mapSeq(self,seqlist,blast,search,outputmap=True):   ### Performs actual mapping of sequence
    '''
    Performs actual mapping of sequence.
    >> seqlist:SeqList object containing Sequence Object to be mapped
    >> blast:BLAST_Run object to perform BLAST and GABLAM
    >> search:Current BLAST search object for mapping
    >> outputmap:boolean = Whether to output mapping into a file [True]
    << returns shortName() of mapped sequence (or None if none); False on error
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seq = seqlist.getSeq(format='tuple')    # (name, sequence) tuple for the query
        mapseq = self.obj['MapDB']
        hits = blast.db('Hit').indexEntries('Query',search)
        self.printLog('#HITS','%s vs %s = %d hits' % (search,blast.str['DBase'],len(hits)))
        hitseq = {}; hitdata = {}   # Hit name -> sequence tuple / BLAST table entry
        for entry in hits:
            hitseq[entry['Hit']] = mapseq.getDictSeq(entry['Hit'],format='tuple')
            hitdata[entry['Hit']] = entry
        # resdict collects the output row; 'Method' stays 'Failed' unless a mapping succeeds.
        resdict = {'Query':search,'Hit':None,'Method':'Failed','Query_Species':rje_sequence.specCodeFromName(seq[0])}
        ### ~ [1] Order Hits and Check Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (hits,hitdict) = self.orderHits(seq,hits,hitseq)
        self.debug(hits)
        self.debug(hitdict)
        ### ~ [2] Attempt mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Try each configured mapping method in order; stop at the first that returns a hit.
        for method in self.list['Mapping']:
            resdict['Hit'] = self.mapHit(seq,hits,hitdict,method.lower())
            if resdict['Hit']:
                resdict['Method'] = method[:1].upper() + method[1:].lower()
                break
            elif method == 'gablam' and (len(hits) > 0): resdict['Method'] = 'Rejected'     # Hits existed but failed GABLAM
        self.debug(resdict)
        ### ~ [3] Output! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if resdict['Hit']:
            hit = resdict['Hit']['Hit']     # resdict['Hit'] is the BLAST table entry for Hit
            shortname = hitdict[hit]['Data']['ShortName']   # This is just hit!
            self.printLog('#MAP','%s mapped to %s (by %s)' % (string.split(seq[0])[0],shortname,resdict['Method']))
            ## Update Stats ##
            self.debug('')
            resdict['BlastRank'] = hitdata[hit]['Rank']
            for key in hitdict[hit]: resdict[key] = hitdict[hit][key]
            ## Fasta and Redundancy ##
            # Only write each mapped sequence to the fasta output once.
            if shortname in self.list['Mapped']: self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
            else:
                self.list['Mapped'].append(shortname)
                if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (hitseq[hit][0],hitseq[hit][1]))
            resdict['Hit_Species'] = hitdict[hit]['Data']['SpecCode']
            resdict['Hit'] = shortname
        else:
            ### ~ [2] GREP-based search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Fallback: exact substring search of the database fasta with grep (sequence as pattern).
            if 'grep' in self.list['Mapping']:
                greplist = []; hitseq = ''
                self.printLog('#GREP','grep %s %s -B 1' % (seq[1],blast.str['DBase']),log=False)
                for line in os.popen('grep %s %s -B 1' % (seq[1],blast.str['DBase'])).readlines():
                    if line[:1] == '>': greplist.append(string.split(line[1:])[0])  # Preceding fasta header = hit name
                    elif not hitseq: hitseq = rje.chomp(line)
                if greplist:
                    shortname = greplist.pop(0)     # First grep hit is used for mapping
                    resdict['Hit'] = shortname
                    resdict['Method'] = 'Grep'
                    resdict['Qry_ID'] = '100.0'     # Exact substring => 100% identity over the query
                    resdict['Qry_Len'] = len(seq[1])
                    resdict['Hit_Len'] = len(hitseq)
                    resdict['Hit_ID'] = 100.0 * len(hitseq) / len(seq[1])
                    try: resdict['Hit_Species'] = string.split(shortname,'_')[1]    # SPECIES code from ACC_SPEC naming
                    except: pass
                    if shortname in self.list['Mapped']: self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                    else:
                        self.list['Mapped'].append(shortname)
                        if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (shortname,hitseq))
                for extra in greplist: self.printLog('#GREP','Warning! Query "%s" also hit "%s" with grep!' % (string.split(seq[0])[0],extra))
            # If still unmapped: optionally keep the query itself (combine=T) or write to the miss file.
            if not resdict['Hit'] and self.bool['Combine']:
                ## Fasta and Redundancy ##
                shortname = string.split(seq[0])[0]
                if shortname in self.list['Mapped']: self.printLog('#FAS','%s already in output - not duplicating in %s' % (shortname,self.str['MapFas']))
                else:
                    self.list['Mapped'].append(shortname)
                    if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
            elif outputmap: open(self.str['MissFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
            self.printLog('#MISS','%s mapping %s' % (resdict['Query'],resdict['Method']))
        if outputmap: rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
        return resdict['Hit']
    except:
        self.errorLog('Fudgesticks! SeqMapper.mapSeq(%s) has died!' % seq[0],quitchoice=True)
        return False
def readHMMPFamSearch(self,resfile=None,readaln=False):     ### Reads HMM PFam Search Results into objects
    '''
    Reads HMM Search Results into objects. Results arrive per *sequence*, so searches are keyed
    by PFam domain name in a local dictionary and built up as hits are encountered. If
    self.opt['CleanRes'] is True, the results file is replaced by a reduced version containing
    only sequences with hits.
    >> resfile:str = Results File (set as self.info['OutFile'])
    >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently always False !!!
    << returns True on success; False on missing/invalid file or error.
    '''
    try:
        ### Setup ###
        if not resfile or not os.path.exists(resfile):
            self.log.errorLog('Results file "%s" missing!' % resfile,printerror=False)
            return False
        ## Make RegExp for starting next alignment ##
        # NOTE(review): re_hit is built but never used in this method - retained for compatibility.
        re_hit = string.join(['^(\S+):','domain','(\d+)','of','(\d+),','from','(\d+)','to','(\d+):','score','(\S+),','E','=','(\S+)'],'\s+')
        ## Search dictionary as results come back per sequence, not per HMM! ##
        pfam = {}       # Dictionary of {PFam name:search}
        hitx = 0        # Total number of hits
        hitlist = []    # List of sequences processed from file (may or may not include zero hit sequences)
        ### Read in Search results ###
        if open(resfile,'r').readline().find('hmmpfam') != 0:
            self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile,printerror=False)
            if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)',default='N'):
                os.unlink(resfile)
                self.printLog('#DEL','Dodgy results file "%s" deleted.' % resfile)
            return False
        hitname = None  # Name of the sequence currently being processed
        i = 0
        hx = 0          # Number of sequences read
        seqx = 0        # Number of sequences in the original file (if stated in footer)
        RESFILE = open(resfile,'r')
        line = RESFILE.readline()
        newres = [rje.chomp(line)]      # Buffer of lines for the reduced ('.partial') output
        newresout = True                # Whether the buffered sequence block had any hits
        newresfile = '%s.partial' % resfile
        if os.path.exists(newresfile): os.unlink(newresfile)
        while line:
            self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
            line = rje.chomp(line)
            ## New Sequence ##
            if rje.matchExp('^Query sequence:\s+(\S+)',line):
                # Flush previous sequence block to the reduced file if it had hits.
                if newres and newresout and self.opt['CleanRes']: open(newresfile,'a').write(string.join(newres,'\n'))
                newres = ['',line]
                newresout = False
                hitname = rje.matchExp('^Query sequence:\s+(\S+)',line)[0]
                hx += 1
            ## One Line Data for hits ##
            elif line.find('Parsed for domains:') == 0:
                # Skip the two header lines, then read domain rows until the format breaks.
                newres += [line,rje.chomp(RESFILE.readline()),rje.chomp(RESFILE.readline())]
                line = rje.chomp(RESFILE.readline())
                newres.append(line)
                #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                #--------        ------- ----- -----    ----- -----      -----  -------
                #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
                # ... else ... [no hits above thresholds]
                while rje.matchExp(string.join(['^(\S+)','\S+','(\d+)','(\d+)\D.+','(\S+)','(\S+)\s*$'],'\s+'),line):
                    newresout = True
                    (dom,start,end,score,eval) = rje.matchExp(string.join(['^(\S+)','\S+','(\d+)','(\d+)\D.+','(\S+)','(\S+)\s*$'],'\s+'),line)
                    if not pfam.has_key(dom):
                        pfam[dom] = self._addSearch()
                        pfam[dom].info['Name'] = dom
                    hit = pfam[dom]._addHit()
                    hit.info['Name'] = hitname
                    aln = hit._addAln()
                    aln.setStat({'SbjStart':string.atoi(start),'SbjEnd':string.atoi(end),'Expect':string.atof(eval),'BitScore':string.atof(score)})
                    hitx += 1
                    self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                    line = rje.chomp(RESFILE.readline())
                    newres.append(line)
            ## End of Protein ##
            elif line[:2] == '//':
                hitname = None
                newres.append(line)
            # Footer of a previously-reduced file states the original sequence count.
            # Bug fix: pattern used literal '(%d)' (unexpanded format token), which can never match
            # a number, so seqx was never recovered; use the same '(\d+)' as the extraction below.
            elif rje.matchExp('End of rje_hmm reduced results file: (\d+) sequences in original',line):
                seqx = string.atoi(rje.matchExp('End of rje_hmm reduced results file: (\d+) sequences in original',line)[0])
            elif newres: newres.append(line)
            line = RESFILE.readline()
        if newres and newresout and self.opt['CleanRes']: open(newresfile,'a').write(string.join(newres,'\n'))
        if not seqx: seqx = hx      # No footer count: all sequences were in this file
        if self.opt['CleanRes']:
            open(newresfile,'a').write(string.join(['','End of rje_hmm reduced results file: %d sequences in original' % seqx],'\n'))
            os.unlink(resfile)
            os.rename(newresfile,resfile)
            self.printLog('\r#RED','Results file %s replaced with reduced version (%s Hits only)' % (resfile,rje.integerString(hitx)))
        self.printLog('\r#RES','Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(seqx),rje.integerString(len(pfam)),rje.integerString(hitx)))
        return True
    except:
        self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
        return False
def exonerate(self,qryfas, genome, model,exonerate='exonerate',bestn=0):    ### Runs exonerate and parses output
    '''
    Runs exonerate and parses output into lists for processing.
    >> qryfas:str = query fasta file name
    >> genome:str = target genome file name
    >> model:str = exonerate alignment model (--model); also used in the memsaver output file name
    >> exonerate:str = exonerate executable ['exonerate']
    >> bestn:int = --bestn setting (0 = do not set) [0]
    << returns dictionary:
    { query: {'gff':[outputlines], 'cigar':[outputlines], 'alignment':[outputlines],
              'vulgar':[[headerlist], {header:value}, {header:value}, ...] } }
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        EXFILE = None   # Output file handle, used in memsaver mode only
        exfile = '%s.%s' % (self.baseFile(),model)  # Used in memsaver mode
        query_dic = {}  # Results dictionary to return (see docstring)
        # Column headers for parsed 'vulgar:' lines.
        header_list = ['query_id', 'query_start', 'query_end', 'query_strand', 'target_id', 'target_start', 'target_end', 'target_strand', 'score', '<label, query_length, target_length> triplets']
        excmd = [exonerate, qryfas, genome, '--showtargetgff', '--showcigar']
        if model: excmd += ['--model', model]
        if bestn: excmd += ['--bestn', '%d' % bestn]
        if self.getStrLC('ExOpt'): excmd += string.split(self.getStr('ExOpt'))
        self.printLog('#RUN',string.join(excmd))
        extext = []     # In-memory output lines (non-memsaver mode)
        if self.getBool('MemSaver'):
            # Memsaver: write exonerate output to disk and stream it back, reusing an existing file if allowed.
            gzfile = '%s.gz' % exfile
            if rje.exists(gzfile): self.gUnzip(gzfile)
            if rje.exists(exfile) and not self.force(): self.printLog('#EXFILE','Found %s (force=F). Assuming complete.' % exfile)
            else:
                rje.backup(self,exfile)
                self.printLog('#SAVER','memsaver=T: Exonerate output directed to %s.' % exfile)
                EXFILE = open(exfile,'w')
                if subprocess.call(excmd, stdout=EXFILE): raise IOError('Exonerate call did not complete!')
                EXFILE.close()
                self.printLog('#EXFILE','%s generated.' % exfile)
            EXFILE = open(exfile,'r')
        else: extext = Popen(excmd, stdout=PIPE).stdout.readlines()
        ### ~ [2] Parse output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # State machine: 'output_format' tracks which section we are in; 'query' tracks the current query id.
        output_format = ''
        while extext or EXFILE:
            #line = process.stdout.readline().rstrip()
            if EXFILE:
                line = EXFILE.readline()
                if not line: break
                line = rje.chomp(line)
            else: line = rje.chomp(extext.pop(0))
            if line:
                if line.startswith(' Query:'):
                    query = line.split(':', 1)[1].split(' ')[1]
                    #for q in rje.sortKeys(query_dic):
                    #    self.bugPrint('%s: %s' % (q,rje.sortKeys(query_dic[q])))
                    #self.debug(query)
                if line == 'C4 Alignment:':
                    output_format = 'alignment'
                elif line == '# --- START OF GFF DUMP ---':
                    output_format = 'gff'
                elif line.startswith('vulgar:'):
                    output_format = 'vulgar'
                    fields = line.split(' ', 10)[1:]
                    # NOTE(review): assumes query already in query_dic (set via earlier alignment lines);
                    # would raise KeyError otherwise - confirm exonerate output ordering.
                    if output_format in query_dic[query]:
                        query_dic[query][output_format].append({})
                    else:
                        query_dic[query][output_format] = [header_list, {}]
                    for header, field in zip(header_list, fields):
                        query_dic[query][output_format][-1][header] = field
                    #self.debug(query_dic[query][output_format])
                elif line.startswith('cigar:'):
                    output_format = 'cigar'
                    # NOTE(review): same assumption as vulgar - query expected to be in query_dic already.
                    if output_format in query_dic[query]:
                        query_dic[query][output_format].append(line.replace('cigar: ', ''))
                    else:
                        query_dic[query][output_format] = [line.replace('cigar: ', '')]
                elif line == '------------' or line.startswith('Command line:') or line.startswith('Hostname:') or line == '# --- END OF GFF DUMP ---' or line == '#' or line.startswith('-- completed exonerate analysis'):
                    pass    # Separator/boilerplate lines: ignore
                elif output_format:
                    # Any other content line belongs to the current section of the current query.
                    if query in query_dic:
                        if output_format in query_dic[query]:
                            query_dic[query][output_format].append(line)
                        else: query_dic[query][output_format] = [line]
                    else: query_dic[query] = {output_format:[line]}
                #elif process.poll() is not None:
                #    break
            elif output_format == 'alignment':
                # Keep blank lines inside alignment blocks (formatting is significant there).
                try: query_dic[query][output_format].append(line)
                except: pass
            self.vPrint(line,v=1)
        ### ~ [3] Tidy up output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if EXFILE:
            EXFILE.close()
            if self.getBool('Cleanup'):
                os.unlink(exfile)
                self.printLog('#CLEAN','%s deleted.' % exfile)
            elif self.getBool('GZip'): self.gZip(exfile)
        return query_dic
    except: self.errorLog('%s.exonerate error' % self.prog()); raise
def loadTimePoints(self,filename):  ### Load TimePoints from file of various formats
    '''
    Load TimePoints from file of various formats into the 'TimePoints' database table.
    >> filename:str = input file. Three formats are auto-detected from the first line:
       (a) delimited text whose header starts with the 'TimePoint Name' field;
       (b) database tuple strings ("('...',...)" per line, optionally with leading key number);
       (c) glossary text containing '(TimePoint)' entries.
    << returns True if read OK; logs an error and returns None/False otherwise.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(filename): return self.errorLog('File %s missing!' % filename)
        data = open(filename,'r').readlines()
        db = self.db('TimePoints')
        ### ~ [2] Load from File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [2a] Delimited File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if string.split(data[0])[0] == 'TimePoint Name':
            # NOTE(review): string.split() splits on any whitespace, so the first token can never equal
            # 'TimePoint Name' (two words); this branch looks unreachable - confirm intended delimiter.
            # Bug fix: ftype assignment was commented out, causing a NameError at the summary printLog
            # below (swallowed by the blanket except, so the method returned False despite loading data).
            ftype = 'delimited text file'
            temp = self.db().addTable(filename,mainkeys=['TimePoint Name'],name='temp')
            for entry in temp.entries(): db.addEntry(entry)
            db.deleteTable(temp)
        ## ~ [2b] File of Database Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        elif data[0][0] == '(':
            ftype = 'database string'
            for line in data:
                line = rje.chomp(line)
                while line[-1:] == ' ': line = line[:-1]    # Strip trailing spaces
                # Strip the tuple punctuation and split on the ',' that separated quoted fields.
                pdata = string.split(string.replace(line[2:-3],', ',','),"','")
                if not pdata: continue
                if rje.matchExp('^(\d+)$',pdata[0]): pdata.pop(0)   # Database output with key ID numbers
                entry = {}
                for field in db.fields(): entry[field] = pdata[db.fields().index(field)]
                db.addEntry(entry)
        ## ~ [2c] Glossary Text File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            ftype = 'glossary text file'
            for line in data:
                if '(TimePoint)' not in line: continue
                # e.g. American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence
                # from the British Empire. Source: <http://...>[Wikipedia]. (Keywords: history)
                pdata = string.split(line,'. ')
                # '... ya' (years ago) dates contain a '. ' that was split; rejoin into the date field.
                if pdata[2][-2:] == 'ya': pdata[1] = '%s. %s' % (pdata[1],pdata.pop(2))
                entry = {'TimePoint Name':pdata[0]}
                try: entry['Source URL'] = rje.matchExp('Source: <(\S+)>',line)[0]
                except: self.errorLog('Cannot read Source URL')
                try: entry['TimePoint Description'] = rje.matchExp('^(\S.+\S) Source: <',string.join(pdata[2:],'. '))[0]
                except: self.errorLog('Cannot read TimePoint Description: %s' % line)
                if pdata[1][-2:] == 'ya': [entry['Year'],entry['yearUnit']] = string.split(pdata[1])[-2:]
                else:
                    try:
                        ydata = rje.matchExp('(\d+) (\S+), (\d+) (\S+)$',pdata[1])  # e.g. '1776 AD, 4 July'
                        if ydata:
                            for i in range(4): entry[['Year','yearUnit','month','day'][i]] = ydata[i]
                        else: (entry['Year'],entry['yearUnit']) = rje.matchExp('(\d+) (\S+)$',pdata[1])
                    except: self.errorLog('Cannot parse time from %s' % pdata[1])
                # Up to five keywords are stored in fixed fields; unused fields are set to 'blank'.
                kfield = ['keyword1','keyword2','keyword3','keyword4','keyword5']
                try:
                    keywords = string.split(rje.matchExp('\(Keywords: (\S.+)\)',pdata[-1])[0],', ')
                    while keywords and kfield: entry[kfield.pop(0)] = keywords.pop(0)
                    while kfield: entry[kfield.pop(0)] = 'blank'
                    if keywords: self.printLog('#ERR','%d extra Keywords (%s)!' % (len(keywords),string.join(keywords,', ')))
                except: self.errorLog('Cannot read Keywords (%s)' % pdata[-1])
                db.addEntry(entry)
        ### ~ [3] Summarise Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#TP','Timepoints read from %s: %s TimePoints total.' % (ftype,db.entryNum()))
        return True
    except: self.errorLog('%s.loadTimePoints(%s) error' % (self,filename)); return False
def parseGO(self,glines,clear=True,obselete=False):     ### Parses GO Data from list of glines from OBO file
    '''
    Parses GO Data from list of glines from OBO file. Populates self.dict['GO'] (id -> details),
    self.dict['AltID'] (alt id -> [primary ids]) and self.dict['Subset'] (subset -> name/terms),
    then builds children/GO-slim structures. NOTE: glines is consumed (pop) by this method.
    >> glines:list of text lines read from OBO file
    >> clear:opt [True] = Whether to clear self.dict before reading in data
    >> obselete:opt [False] = Whether to read in obselete terms
       NOTE(review): parameter appears unused here; obsolete terms are always dropped - confirm.
    << returns True/False depending on success
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if clear:
            self.dict['AltID'] = {}
            self.dict['GO'] = {}
            self.dict['Subset'] = {}
        id = 'subsets'  # Current term being parsed; starts truthy so header subsetdef lines are processed
        ### ~ [2] ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (gx,gtot) = (0.0,len(glines))
        while glines:
            self.printLog('\r#PARSE','Parsing %s GO terms: %.1f%%' % (rje.integerString(len(self.dict['GO'])),gx/gtot),newline=False,log=False)
            gx += 100.0
            ## ~ [2a] ~ Establish ID of current GO terms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gline = rje.chomp(glines.pop(0))
            if rje.matchExp('^id:\s+GO:(\d+)',gline):
                id = rje.matchExp('^id:\s+GO:(\d+)',gline)[0]
                self.dict['GO'][id] = {}
                continue
            elif not id: continue   # Between terms (e.g. after an obsolete term was dropped)
            elif rje.matchExp('^(\S+):\s+(\S.+)$',gline): (type,data) = rje.matchExp('^(\S+):\s+(\S.+)$',gline)
            elif gline[:1] in ['','[']: id = ''; continue   # Blank line or new stanza header ends the term
            ## ~ [2b] ~ Parse details ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Per-line try/except: a malformed line is logged and skipped without aborting the parse.
            try:
                if type == 'is_obsolete' and data.lower()[:4] == 'true':
                    self.dict['GO'].pop(id)     # Obsolete terms are discarded entirely
                    id = ''
                elif type in ['name','def']: self.dict['GO'][id][type] = data
                elif rje.matchExp('^subsetdef: (\S+) \"(\S.+)\"',gline):
                    (subset,desc) = rje.matchExp('^subsetdef: (\S+) \"(\S.+)\"',gline)
                    self.dict['Subset'][subset] = {'name':desc,'terms':[]}
                elif type == 'namespace':
                    # e.g. 'biological_process' -> 'bp' (first letters of each word)
                    g = string.split(data,'_')
                    self.dict['GO'][id]['type'] = '%s%s' % (g[0][0],g[1][0])
                elif type in ['is_a','relationship']:
                    # Parent links: relationship lines are re-typed by their relation (part_of etc.)
                    parent = rje.matchExp('GO:(\d+)',data)[0]
                    if type != 'is_a': type = string.split(data)[0]
                    if type not in self.list['ParentTerms']: self.list['ParentTerms'].append(type)
                    if type not in self.dict['GO'][id]: self.dict['GO'][id][type] = []
                    self.dict['GO'][id][type].append(parent)
                elif type == 'subset': self.dict['Subset'][string.split(gline)[1]]['terms'].append(id)
                elif type == 'alt_id':
                    alt_id = rje.matchExp('GO:(\d+)',data)[0]
                    if alt_id in self.dict['AltID']: self.dict['AltID'][alt_id].append(id)
                    else: self.dict['AltID'][alt_id] = [id]
                elif type in ['xref','synonym']:
                    if type not in self.dict['GO'][id]: self.dict['GO'][id][type] = []
                    self.dict['GO'][id][type].append(data)
            except: self.errorLog('GO.parseGO(%s) error' % gline)
        self.printLog('\r#PARSE','Parsed %s GO terms and %d subsets.' % (rje.integerString(len(self.dict['GO'])),len(self.dict['Subset'])))
        ### ~ [3] ~ Tidy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.makeChildren()
        self.makeGOSlim()
        for subset in self.dict['Subset']: self.dict['Subset'][subset]['terms'].sort()
        self.list['ParentTerms'].sort()
        return True
    except:
        self.log.errorLog('GO.parseGO() failed')
        return False
def loadTimePoints(self, filename):  ### Load TimePoints from file of various formats
    '''
    Load TimePoints from file of various formats into the 'TimePoints' database table.
    >> filename:str = input file. Three formats are auto-detected from the first line:
       (a) delimited text whose header starts with the 'TimePoint Name' field;
       (b) database tuple strings ("('...',...)" per line, optionally with leading key number);
       (c) glossary text containing '(TimePoint)' entries.
    << returns True if read OK; logs an error and returns None/False otherwise.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(filename):
            return self.errorLog('File %s missing!' % filename)
        data = open(filename, 'r').readlines()
        db = self.db('TimePoints')
        ### ~ [2] Load from File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [2a] Delimited File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if string.split(data[0])[0] == 'TimePoint Name':
            # NOTE(review): string.split() splits on any whitespace, so the first token can never equal
            # 'TimePoint Name' (two words); this branch looks unreachable - confirm intended delimiter.
            # Bug fix: ftype assignment was commented out, causing a NameError at the summary printLog
            # below (swallowed by the blanket except, so the method returned False despite loading data).
            ftype = 'delimited text file'
            temp = self.db().addTable(filename,
                                      mainkeys=['TimePoint Name'],
                                      name='temp')
            for entry in temp.entries():
                db.addEntry(entry)
            db.deleteTable(temp)
        ## ~ [2b] File of Database Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        elif data[0][0] == '(':
            ftype = 'database string'
            for line in data:
                line = rje.chomp(line)
                while line[-1:] == ' ':
                    line = line[:-1]    # Strip trailing spaces
                # Strip the tuple punctuation and split on the ',' that separated quoted fields.
                pdata = string.split(string.replace(line[2:-3], ', ', ','),
                                     "','")
                if not pdata: continue
                if rje.matchExp('^(\d+)$', pdata[0]):
                    pdata.pop(0)  # Database output with key ID numbers
                entry = {}
                for field in db.fields():
                    entry[field] = pdata[db.fields().index(field)]
                db.addEntry(entry)
        ## ~ [2c] Glossary Text File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            ftype = 'glossary text file'
            for line in data:
                if '(TimePoint)' not in line: continue
                # e.g. American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence
                # from the British Empire. Source: <http://...>[Wikipedia]. (Keywords: history)
                pdata = string.split(line, '. ')
                # '... ya' (years ago) dates contain a '. ' that was split; rejoin into the date field.
                if pdata[2][-2:] == 'ya':
                    pdata[1] = '%s. %s' % (pdata[1], pdata.pop(2))
                entry = {'TimePoint Name': pdata[0]}
                try:
                    entry['Source URL'] = rje.matchExp('Source: <(\S+)>', line)[0]
                except:
                    self.errorLog('Cannot read Source URL')
                try:
                    entry['TimePoint Description'] = rje.matchExp(
                        '^(\S.+\S) Source: <', string.join(pdata[2:], '. '))[0]
                except:
                    self.errorLog('Cannot read TimePoint Description: %s' % line)
                if pdata[1][-2:] == 'ya':
                    [entry['Year'], entry['yearUnit']] = string.split(pdata[1])[-2:]
                else:
                    try:
                        ydata = rje.matchExp('(\d+) (\S+), (\d+) (\S+)$', pdata[1])  # e.g. '1776 AD, 4 July'
                        if ydata:
                            for i in range(4):
                                entry[['Year', 'yearUnit', 'month', 'day'][i]] = ydata[i]
                        else:
                            (entry['Year'], entry['yearUnit']) = rje.matchExp('(\d+) (\S+)$', pdata[1])
                    except:
                        self.errorLog('Cannot parse time from %s' % pdata[1])
                # Up to five keywords are stored in fixed fields; unused fields are set to 'blank'.
                kfield = ['keyword1', 'keyword2', 'keyword3', 'keyword4', 'keyword5']
                try:
                    keywords = string.split(
                        rje.matchExp('\(Keywords: (\S.+)\)', pdata[-1])[0], ', ')
                    while keywords and kfield:
                        entry[kfield.pop(0)] = keywords.pop(0)
                    while kfield:
                        entry[kfield.pop(0)] = 'blank'
                    if keywords:
                        self.printLog('#ERR', '%d extra Keywords (%s)!'
                                      % (len(keywords), string.join(keywords, ', ')))
                except:
                    self.errorLog('Cannot read Keywords (%s)' % pdata[-1])
                db.addEntry(entry)
        ### ~ [3] Summarise Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#TP', 'Timepoints read from %s: %s TimePoints total.'
                      % (ftype, db.entryNum()))
        return True
    except:
        self.errorLog('%s.loadTimePoints(%s) error' % (self, filename))
        return False
def splitMascot(self):  ### Reads the MASCOT file and splits into header, hits and unmatched files.
    '''
    Reads the MASCOT file and splits into header, hits and unmatched files.
    Produces: BASE.header.txt (header lines), 'mascot' and 'nohits' database tables (saved as CSV)
    and, if itraq=T, an 'itraq' table of quantitation ratios.
    << returns True on success (or the #FILE printLog result if output exists and force=F); False on error.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        infile = self.getStr('MASCOT')
        if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('MASCOT')))
        #x#self.deBug(self.basefile())
        headfile = '%s.header.txt' % self.basefile()
        hitsfile = '%s.mascot.csv' % self.basefile()
        peptfile = '%s.nohits.csv' % self.basefile()
        # Skip if the split output is already newer than the input (unless force=T).
        if rje.isYounger(self.getStr('MASCOT'),hitsfile) == hitsfile and not self.force():
            return self.printLog('#FILE','%s file found (force=F)' % hitsfile)
        ### ~ [1] Split MASCOT~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headlines = []      # Header section lines (written to headfile)
        csvhead = []        # Parsed CSV field names for the results tables
        mdb = None          # Current results table ('mascot', then 'nohits')
        mx = 0              # Line counter
        itraq = []          # iTRAQ ratio labels (e.g. '114/113') appended to csvhead
        prot_data = {}      # prot_hit_num -> {prot_acc, prot_desc} cache for iTRAQ summary rows
        for mline in open(self.getStr('MASCOT'),'r').readlines():
            mx += 1     # Index of next line in case needed for iTRAQ reading!
            ## ~ [1a] Skip down until Header found ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not headlines and mline.find('Header') < 0: continue
            ## ~ [1b] Add Header lines to headlines until results headers found ~~~~~~~~~~~~~~~ ##
            if not csvhead and mline.find('prot_hit_num') < 0: headlines.append(mline); continue
            ## ~ [1c] Sort out MASCOT results headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if mline.find('prot_hit_num') >= 0:
                ## ~ Read Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                open(headfile,'w').writelines(headlines)
                csvhead = rje.readDelimit(string.join(string.split(rje.chomp(mline))),',')
                while '' in csvhead: csvhead.remove('')
                ## ~ Sort out iTRAQ headers (missing) ~~~~~~~~~ ##
                # iTRAQ labels are not in the header row: peek at the next data line for 'x/y' ratios.
                if self.getBool('iTRAQ'):
                    iline = open(self.getStr('MASCOT'),'r').readlines()[mx]
                    for isplit in rje.readDelimit(iline,',')[len(csvhead):]:    # Should be start of iTRAQ data
                        if '/' in isplit: itraq.append(isplit)
                    self.printLog('#ITRAQ',string.join(itraq))
                    csvhead += itraq
                    idb = db.addEmptyTable('itraq',['prot_hit_num','prot_acc','prot_desc','itraq','ratio','n','geomean','summary'],keys=['prot_hit_num','itraq'])
                    idb.info['Delimit'] = ','
                ## ~ Add emPAI header (also missing) ~~~~~~~~~~ ##
                if self.getBool('emPAI'): csvhead.append('empai')
                ## ~ Set up Database Table ~~~~~~~~~~~~~~~~~~~~ ##
                self.printLog('#HEAD',string.join(csvhead,'; '))
                mdb = db.addEmptyTable('mascot',csvhead,keys=['prot_hit_num','pep_query'])
                mdb.info['Delimit'] = ','
            elif mline.find('Peptide matches') >= 0:
                # Switch from protein hits to the unmatched-peptide ('nohits') section.
                mdb.saveToFile()
                if self.getBool('emPAI'): csvhead.remove('empai')
                mdb = db.addEmptyTable('nohits',csvhead,keys=['pep_query'])
                for field in mdb.fields():
                    if field[:4] == 'prot': mdb.dropField(field)    # No protein fields for unmatched peptides
                mdb.info['Delimit'] = ','
                continue
            elif rje.chomp(mline):
                #self.deBug('%s ... %s' % (mline[:20],mline.find('Peptide matches')))
                ## ~ Parse a data row ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                data = rje.readDelimit(mline,',')
                entry = {}; pretraq = True  # pretraq: still reading positional fields before iTRAQ/emPAI labels
                #self.deBug(csvhead); self.deBug(itraq);
                for d in range(len(csvhead)+len(itraq)):
                    if d >= len(data): break
                    if data[d] in itraq: dhead = data[d]; pretraq = False       # iTRAQ values follow their label
                    elif data[d] == 'emPAI': entry['empai'] = data[d+1]; pretraq = False
                    elif pretraq and d < len(csvhead): dhead = csvhead[d]       # Positional field
                    elif pretraq: continue  # Unmatched peptides will not have emPAI or iTRAQ data
                    #self.deBug('%s > %s' % (data[d],dhead))
                    if d and data[d-1] == 'emPAI': continue         # Value already stored with the label
                    elif data[d] in itraq + ['emPAI']: continue     # Labels themselves are not values
                    elif dhead not in entry: entry[dhead] = data[d]
                    #self.deBug('%s = %s' % (dhead,entry[dhead]))
                if entry['prot_acc']: prot_data[entry['prot_hit_num']] = {'prot_acc':entry['prot_acc'],'prot_desc':entry['prot_desc']}
                ## ~ iTRAQ quantitation summary rows ~~~~~~~~~~ ##
                if self.getBool('iTRAQ') and 'Quantitation summary for protein' in data:
                    d = data.index('Quantitation summary for protein') + 1
                    if entry['prot_hit_num'] in prot_data:
                        pacc = prot_data[entry['prot_hit_num']]['prot_acc']
                        pdesc = prot_data[entry['prot_hit_num']]['prot_desc']
                    else:
                        pacc = entry['prot_acc']
                        pdesc = entry['prot_desc']
                    # Each label is followed by: ratio, n, geomean, summary.
                    while d < len(data):
                        if data[d] in itraq:
                            idb.addEntry({'prot_hit_num':entry['prot_hit_num'],'prot_acc':pacc,'prot_desc':pdesc,
                                          'itraq':data[d],'ratio':data[d+1],'n':data[d+2],'geomean':data[d+3],'summary':data[d+4]})
                        d += 1
                #self.deBug(entry)
                if entry['prot_hit_num'] or entry['pep_query']: mdb.addEntry(entry)
        mdb.saveToFile()
        if self.getBool('iTRAQ'): idb.saveToFile()
        self.deBug('')
        return True
    except: self.errorLog('Error reading MASCOT file'); return False
def parsePileup(self,tname,filename,wtdb=None):     ### Extracts, filters and processes PileUp data
    '''
    Extracts, filters and processes PileUp data.

    Reads a whitespace-delimited pileup file line by line. Each line is expected to hold:
    locus, position, reference base, read count, read-base string, quality string.
    (The read-base encoding handled here - '.', ',', '^', '$', '+N'/'-N' indels, '*' -
    matches samtools pileup output; NOTE(review): confirm the producer is samtools.)
    Per position it: expands the read-base string into one entry per read, converts
    ASCII-33 (Phred+33) quality scores, drops reads below the QCut quality cutoff,
    calls the major allele, and streams one row straight to '<basefile>.<tname>.tdt'.
    A quality-score histogram is also written to '<basefile>.<tname>.QC.tdt'.

    >> tname:str = name for the new (empty) database table; also used in output filenames.
    >> filename:str = path of the pileup file to parse.
    >> wtdb [None] = optional wild-type table. If given, a 'WTFreq' field is added and
       filled from self.dict['WTMajor'] (frequency of the wild-type major allele).
    << returns the Table object on success, or None on any error.

    NOTE: rows are written directly to PILEOUT rather than via table.addEntry() (the
    addEntry call is commented out below), so the returned table holds the field/key
    definitions but no entries.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        table = self.db().addEmptyTable(tname,['Locus','Pos','Seq','N','QN','Major','MajFreq'],keys=['Locus','Pos'])
        qc = []                 # Quality-score histogram: qc[q-1] = count of bases with Phred score q
        if wtdb: table.addField('WTFreq')
        PILEUP = open(filename,'r'); px = 0; ex = 0    # px = pileup lines parsed; ex = entries written
        PILEOUT = open('%s.%s.tdt' % (self.baseFile(),tname),'w')
        rje.writeDelimit(PILEOUT,outlist=table.fields(),delimit='\t')   # Header row
        locus = None
        refseq = ''     #? What is this used for?  (accumulated but never read back in this method)
        majors = []     #? What is this used for?  (accumulated but never read back in this method)
        ### ~ [2] Process each entry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for line in PILEUP:
            # Split line up into data. Should be: locus, position, reference, no. reads, read data, qualscores
            data = string.split(rje.chomp(line))
            if not data: break      # NOTE(review): a blank line ABORTS parsing, it is not skipped - confirm intended
            self.progLog('\r#PARSE','Parsing %s: %s pos...' % (filename,rje.iStr(px)),rand=0.01); px += 1
            ## ~ [2a] Extract Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            entry = {'Locus':data[0],'Pos':int(data[1]),'Seq':data[2],'N':int(data[3]),'QN':0}
            # Reset per-locus accumulators when a new locus is reached
            if entry['Locus'] != locus: locus = entry['Locus']; refseq = ''; majors = []
            refseq += data[2]
            #entry => 'Ref','Pos','Seq','N','Reads','Qual'
            rseq = data[4]      # Raw read-base string; consumed left-to-right below
            reads = []          # One element per read covering this position
            delx = 0            # Count of deletion reads added (they inflate reads[] past entry['N'])
            while rseq:
                try:
                    # '.' / ',' = match to reference (forward/reverse strand)
                    if rseq[:1] in ['.',',']: reads.append(entry['Seq']); rseq = rseq[1:]
                    # '^' marks a read start; the following char is mapping quality - skip both
                    elif rseq[:1] == '^': rseq = rseq[2:]
                    #elif rseq[:1] == '*':
                    #    reads.append('-1%s' % entry['Seq'].upper())
                    #    rseq = rseq[1:]
                    # '+N<seq>' / '-N<seq>' = insertion/deletion of N bases following this position
                    elif rseq[:1] in ['-','+']:
                        ilen = string.atoi(rje.matchExp('^(\d+)',rseq[1:])[0])  # Indel length N
                        indel = rseq[len('%s' % ilen)+1:][:ilen]                # The indel sequence itself
                        #self.deBug('%s: %s' % (rseq,indel))
                        if rseq[:1] == '-':
                            # Deletion: appended as its own read (e.g. '-2AC'), tracked via delx
                            delx += 1
                            reads.append(rseq[:len('%s' % ilen)+ilen+1].upper())
                        # Insertion: appended to the PREVIOUS read rather than creating a new one
                        else: reads[-1] += indel.upper()
                        #self.deBug(reads[-1])
                        rseq = rseq[len('%s' % ilen)+ilen+1:]
                    # '$' marks a read end - no base, just skip
                    elif rseq[:1] in ['$']: rseq = rseq[1:]
                    else:
                        # Plain base call (mismatch to reference) or '*' (deletion placeholder)
                        if rseq[0].upper() not in 'ATGCN*': print ' ???', rseq[0].upper(), '???'
                        reads.append(rseq[0].upper()); rseq = rseq[1:]
                except:
                    # Malformed read string: log, show remainder, and abort this parse
                    self.errorLog('!')
                    self.deBug(rseq)
                    raise ValueError
            # Sanity check: reads[] should equal declared read count plus extra deletion entries
            if len(reads) != (entry['N'] + delx):
                self.deBug('%s = %d' % (data[4],entry['N']))
                self.deBug('%s = %d' % (reads,len(reads)))
                self.errorLog('Read versus Read Count mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                raise ValueError
            ## ~ [2b] Convert Quality Scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            qual = []
            for q in data[5]:
                # Gaps do not have a quality score, so fill these in first
                # (deletion reads start with '-'; give them QCut so they survive filtering)
                while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
                # Then append actual qv (ASCII-33 / Phred+33 encoding)
                qual.append(ord(q) - 33)
                # Grow histogram as needed, then count this score
                qc += [0] * (qual[-1] - len(qc)); qc[qual[-1]-1] += 1
            # Back-fill quality for any trailing gap reads after the quality string is exhausted
            while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
            # Replace deletion placeholders AFTER quality assignment so the '-' gap rule above still fires
            while '*' in reads: reads[reads.index('*')] = '-' #'-1%s' % entry['Seq'].upper()
            if len(reads) != len(qual):
                self.deBug('%s = %d' % (reads,len(reads)))
                self.deBug('%s = %d' % (qual,len(qual)))
                self.deBug(data)
                self.errorLog('Read versus Quality length mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                raise ValueError
            ## ~ [2c] Filter low quality ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Hard-coded debugging hook for a specific position of interest
            if entry['Pos'] in [190359]: #100,98901,183697,169284,
                self.deBug(qual)
                self.deBug(reads)
                self.deBug(qc)
            # Remove (from back) any reads than do not meet QV cutoff
            # (iterating in reverse keeps earlier indices valid while popping)
            for r in range(len(qual)-1,-1,-1):
                if qual[r] < self.getInt('QCut'): qual.pop(r); reads.pop(r)
            entry['QN'] = len(reads)    # Reads surviving the quality filter
            ## ~ [2d] Major Allele ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            alleles = {}    # Dictionary of {nt:count}
            # Setup major allele
            if reads: major = reads[0]
            else: major = '-'; alleles[major] = 0
            # Cycle through reads. Keep most abundant allele as major - or reference allele if tied.
            for read in reads:
                if read in alleles: alleles[read] += 1
                else: alleles[read] = 1
                if alleles[read] > alleles[major] or (read == entry['Seq'] and alleles[read] == alleles[major]): major = read
            entry['Major'] = major
            majors.append(major)
            # Major-allele frequency, floored at MinFreq for the minor fraction
            if reads: entry['MajFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[major]) / float(len(reads)))
            else: entry['MajFreq'] = 0.0
            # Optional wild-type comparison: frequency of the WT major allele at this position
            if wtdb:
                try:
                    wtmajor = self.dict['WTMajor'][locus][entry['Pos']-1]
                    if wtmajor in alleles and reads: entry['WTFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[wtmajor]) / float(len(reads)))
                    else: entry['WTFreq'] = 0.0
                    # Debug where the sample's major allele differs from wild-type (plus one hard-coded position)
                    if wtmajor != major: self.debug(entry)
                    elif locus == 'chrIV_S288C__BK006938.2' and entry['Pos'] == 271733: self.debug(entry)
                except: self.warnLog('WTFreq Error (%s:Pos=%d) [Probably no WT read mapped]' % (locus,entry['Pos'])); entry['WTFreq'] = 0.0
            # Hard-coded debugging hook (same position as above, post-filtering state)
            if entry['Pos'] in [190359]: #100,98901,183697,169284,
                self.deBug(qual)
                self.deBug(reads)
                self.deBug(alleles)
                self.deBug(entry)
                self.deBug(line)
            #table.addEntry(entry)
            # Stream the entry straight to file in table-field order (memory-light; table stays empty)
            outlist = []
            for field in table.fields(): outlist.append(entry[field])
            rje.writeDelimit(PILEOUT,outlist,delimit='\t'); ex += 1
        self.printLog('\r#PARSE','Parsed %s: %s entries from %s lines.' % (filename,rje.iStr(ex),rje.iStr(px)))
        PILEOUT.close()
        PILEUP.close()
        ### ~ [3] Save QC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Dump the accumulated Phred-score histogram (score, count) as a 2-column TSV
        QC = open('%s.%s.QC.tdt' % (self.baseFile(),tname),'w')
        QC.write('Qual\tCount\n')
        for q in range(len(qc)):
            try: QC.write('%d\t%d\n' % (q+1,qc[q]))
            except: self.errorLog('!')
        QC.close()
        return table
    except: self.errorLog('%s.parsePileup(%s) error' % (self,filename)); return None