Пример #1
0
 def makePPI(self):  ### Generates files for Human-HIV PPI analysis
     '''
     Generates files for Human-HIV PPI analysis.

     For each HIV accession indexed in the 'HHPIDMap' table, a directory named after the HIV gene is made and one
     fasta file is written per interactor entry: the HIV sequence followed by the contents of that interactor's
     existing `*.ppi.fas` file (located using the 'PPIDir' setting). Missing interactor files are logged and skipped.
     << False if no HIV sequences were loaded or an error occurred. Nothing (None) is returned on success.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(
             self.log, self.cmd_list +
             ['seqin=%s' % self.getStr('HIVSeq'), 'autoload=T'])
         if not seqlist.seqs(): return False
         seqmap = seqlist.seqNameDic('Max')
         mdb = self.db('HHPIDMap')
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for hivacc in mdb.index('AccHIV'):
             # Map HIV accession number on to a sequence using the part before the first '.'
             accnum = hivacc.split('.')[0]
             hivseq = seqmap[accnum]  # A missing accession raises KeyError, handled by the outer except
             # Short HIV gene name = upper case text before the first '_' of the sequence short name
             hivgene = hivseq.shortName().split('_')[0].upper()
             # Create directory named after HIV gene
             rje.mkDir(self, '%s/' % hivgene, log=True)
             # Copy human PPI files into the directory, prefixed with the HIV sequence
             etot = len(mdb.index('AccHIV')[hivacc])
             for entry in mdb.indexEntries('AccHIV', hivacc):
                 self.progLog(
                     '\r#PPI',
                     'Generating human-HIV PPI fasta files for %s %s PPI' %
                     (rje.iStr(etot), hivgene))
                 pfile = self.getStr('PPIDir') + entry['Symbol'] + '.ppi.fas'
                 if not rje.exists(pfile):
                     self.errorLog(
                         'Cannot find human PPI file for %s interactor "%s"'
                         % (entry['HIV'], entry['Symbol']),
                         printerror=False)
                     continue
                 outfile = '%s/%s.%s.ppi.fas' % (hivgene, hivgene.lower(),
                                                 entry['Symbol'])
                 # Context managers guarantee both handles are closed even if a read/write fails
                 with open(outfile, 'w') as FAS:
                     FAS.write('>%s\n%s\n' %
                               (hivseq.info['Name'], hivseq.getSequence()))
                     with open(pfile, 'r') as PPI:
                         FAS.write(PPI.read())
             # Reports using the last entry processed for this accession
             self.printLog(
                 '\r#PPI',
                 'Generated human-HIV PPI fasta files for %s %s (%s) PPI.' %
                 (rje.iStr(etot), entry['HIV'], hivgene))
     except:  # Bare except retained: all errors are routed through errorLog
         self.errorLog('%s.makePPI error' % self)
         return False
Пример #2
0
 def outputCards(self):  ### Outputs cards to delimited file
     '''
     Outputs cards to delimited file.

     Appends one delimited row per gene in self.dict['GeneCard'] to the self.info['CardOut'] file, using
     self.list['Headers'] as the fields. Each output card has its 'Alias' set to the gene and its 'Species' set to
     self.info['Species']. With the 'Restrict' option only genes in self.list['Genes'] are output; with 'Purify',
     genes whose 'Symbol' is neither the gene itself nor '!FAILED!' are skipped. When both options are set,
     self.list['Genes'] is first edited in place to replace such genes with their symbols.
     '''
     ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     cards = self.dict['GeneCard']
     genelist = self.list['Genes']
     if self.opt['Purify'] and self.opt['Restrict']:
         for gene in genelist[0:]:   # Loop over a copy as genelist itself is modified
             symbol = cards[gene]['Symbol']
             if symbol not in [gene,'!FAILED!']:  # Replace with symbol
                 genelist.remove(gene)
                 if symbol not in genelist: genelist.append(symbol)
     delimit = rje.delimitFromExt(filename=self.info['CardOut'])
     ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     (noens,noloci,ox) = (0,0,0)
     with open(self.info['CardOut'],'a') as CARDOUT:     # Closed even if output of a card fails
         for gene in rje.sortKeys(cards):
             card = cards[gene]
             if self.opt['Restrict'] and gene not in genelist: continue
             elif self.opt['Purify'] and card['Symbol'] not in [gene,'!FAILED!']: continue
             self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox)); ox += 1
             card['Alias'] = gene
             card['Species'] = self.info['Species']
             rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,card)
             if card['Symbol'] == gene:   # Not an alias
                 if 'EnsEMBL' not in card or not card['EnsEMBL']: noens += 1
                 if 'EnsLoci' not in card or not card['EnsLoci']: noloci += 1
     # NOTE: the number reported is the size of self.list['Genes'], not the number of rows written (ox)
     self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (len(self.list['Genes']),self.info['CardOut']))
     self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
Пример #3
0
 def emptyToBlank(self):     ### Replace empty values with 'blank' values
     '''
     Swap every empty-string value in the 'TimePoints' table for the string 'blank'.
     The number of values replaced is reported to the log (#DB).
     '''
     timedb = self.db('TimePoints')
     swapped = 0
     for row in timedb.entries():
         for fname in timedb.fields():
             if row[fname] != '': continue   # Only exact empty strings are replaced
             row[fname] = 'blank'
             swapped += 1
     self.printLog('#DB','%s empty values represented with blank values' % rje.iStr(swapped))
Пример #4
0
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] Check and modify URL if required ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.getStr('RestIn').startswith('http:'):
             #!# Check for rest URL and add if missing
             #!# Split on &
             restcmd = string.split(self.getStr('RestIn'),'&')
             for i in range(len(restcmd)):
                 if '=' not in restcmd[i]: continue
                 (opt,value) = string.split(restcmd[i],'=',1)
                 if value.startswith('file:'):   # Conversion of cmd=file:FILE into cmd=CONTENT
                     rfile = string.split(value,':',1)[1]
                     #!# Consider adding max size constraint. Probably a URL size limit.
                     if rje.exists(rfile):
                         restcmd[i] = '%s=%s' % (opt,rje.chomp(string.join(open(rfile,'r').readlines(),'\\n')))
                         if '&' in restcmd[i]:
                             self.warnLog('%s "&" => "+" conversions for %s.' % (rje.iStr(restcmd[i].count('&')),rfile))
                             restcmd[i] = string.replace(restcmd[i],'&','+')
                     else: self.warnLog('File "%s" not found.' % rfile,quitchoice=True)
             self.setStr({'RestIn':string.join(restcmd,'&')})
         ## ~ [1b] Direct Parsing of output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:   # Convert to file
             self.setStr({'RestIn':rje.makePath(self.getStr('RestIn'),True)})
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Пример #5
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method. Builds and saves the codon and nucleotide count tables.

     Tables added to the database object:
     - 'Code' = Codon -> AA table made from rje_sequence.genetic_code (indexed on AA).
     - 'Codons' = per-sequence codon counts, plus 'Len' (sum of the counts). Saved to file.
     - 'NT' = per-sequence counts of C/A/G/U (T is converted to U), 'Len' (sum of those counts) and per-position
       counts 'N|1', 'N|2', 'N|3' summed from the codon counts of that sequence. Saved to file.
     If no basefile is set, it is taken from the name of the input sequence list.
     << True if setup successful, False if an error occurred (error is logged).
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = self.obj['SeqList']
         if self.getStr('Basefile').lower() in ['','none']:
             self.str['Basefile'] = rje.baseFile(seqlist.getStr('Name'))
             self.obj['DB'].setInfo({'Basefile':self.str['Basefile']})
         bases = ['C','A','G','U']
         ## ~ [1a] Genetic Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         cdb = self.db().addEmptyTable('Code',['Codon','AA'],['Codon'])
         for codon in rje_sequence.genetic_code: cdb.addEntry({'Codon':codon,'AA':rje_sequence.genetic_code[codon]})
         cdb.index('AA')
         ### ~ [2] Calculate Codon Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         codons = rje.sortKeys(rje_sequence.genetic_code)
         db = self.db().addEmptyTable('Codons',['Seq','Len']+codons,['Seq'])
         sx = 0.0; seqx = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#COD','Calculating codon usage: %.2f%%' % (sx/seqx)); sx += 100.0
             entry = rje_sequence.codons(seq.getSequence(),{})
             entry['Len'] = sum(entry.values())      # Must be calculated before the non-count 'Seq' field is added
             entry['Seq'] = seq.getStr('AccNum')
             db.addEntry(entry)
         self.printLog('\r#COD','Codon usage calculated for %s sequences' % rje.iStr(seqx))
         db.fillBlanks(blank=0,fillempty=True)
         db.saveToFile()
         ### ~ [3] Calculate NT Count Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         posfields = []      # Per-position fields: N|1, N|2, N|3 for each base
         for i in [1,2,3]:
             for n in bases: posfields.append('%s|%d' % (n,i))
         ndb = self.db().addEmptyTable('NT',['Seq','Len']+bases+posfields,['Seq'])
         sx = 0.0; seqx = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#NT','Calculating NT Counts: %.2f%%' % (sx/seqx)); sx += 100.0
             entry = rje_sequence.aaFreq(seq.getSequence().replace('T','U'),{'C':0,'A':0,'G':0,'U':0},False)
             entry['Len'] = sum(entry.values())
             entry['Seq'] = seq.getStr('AccNum')
             centry = db.data(entry['Seq'])          # Codon counts for the same sequence
             for field in posfields: entry[field] = 0
             for codon in codons:
                 for i in [1,2,3]:
                     entry['%s|%d' % (codon[i-1],i)] += centry[codon]
             ndb.addEntry(entry)
         self.printLog('\r#NT','NT Counts calculated for %s sequences' % rje.iStr(seqx))
         ndb.saveToFile()
         return True     # Setup successful (previously fell through and returned None)
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Пример #6
0
 def depthChargeForker(self):  ### Main DepthCharge forking method
     '''
     Fork out each input sequence for DepthCharge analysis and load the results table.

     Unless force is set, sequences already listed with type 'all' in an existing results file are skipped. With
     force set, any existing results file is backed up first.
     << the '*.depthcharge.tdt' results table (start/end formatted as integers). None if an error was logged.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqin = self.seqinObj()
         self.list['ToFork'] = seqin.list['Seq'][0:]
         resfile = '{0}.depthcharge.tdt'.format(self.baseFile())

         def loadResults(**options):  ### Loads resfile as a db table with integer start/end
             table = self.db().addTable(resfile,
                                        ['seqname', 'start', 'end', 'type'],
                                        **options)
             table.dataFormat({'start': 'int', 'end': 'int'})
             return table

         if self.force():
             rje.backup(resfile, appendable=False)
         elif rje.exists(resfile):
             ddb = loadResults()
             complete = ddb.indexDataList('type', 'all', 'seqname')
             if complete:
                 # Drop sequences that already have an 'all' entry in the results
                 processed = [
                     seq for seq in self.list['ToFork']
                     if seqin.shortName(seq) in complete
                 ]
                 for seq in processed:
                     self.list['ToFork'].remove(seq)
                 if processed:
                     skiptxt = 'Skipping {0} previously processed sequences (force=F)'
                     self.printLog('#SKIP',
                                   skiptxt.format(rje.iStr(len(processed))))
             if not self.list['ToFork']:
                 self.printLog(
                     '#CHARGE',
                     'All sequences previously processed (force=F)')
                 return ddb
         # Start the initial set of forks, up to the 'Forks' setting
         while (len(self.list['Forked']) < self.getNum('Forks')
                and self.list['ToFork']):
             self.nextFork()
         ### ~ [2] ~ Work through each sequence and fork out ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.forking()
         donetxt = 'Forking of %s jobs completed.' % (rje.iStr(seqin.seqNum()))
         self.printLog('#FORK', donetxt, log=self.getBool('LogFork'))
         return loadResults(replace=True)
     except:
         self.errorLog('%s.depthChargeForker error' % self.prog())
Пример #7
0
 def emptyToBlank(self):  ### Replace empty values with 'blank' values
     '''
     Replace each empty-string value in the 'TimePoints' table with the string 'blank'.
     The number of values replaced is reported to the log (#DB).
     '''
     tpdb = self.db('TimePoints'); blanked = 0
     for record in tpdb.entries():
         for key in tpdb.fields():
             if record[key] == '': record[key] = 'blank'; blanked += 1
     logtxt = '%s empty values represented with blank values' % rje.iStr(blanked)
     self.printLog('#DB', logtxt)
Пример #8
0
 def run(self):  ### Main run method
     '''
     Main run method: set up and run the forking, logging any error.
     << True if self.list['Forked'] is empty afterwards, else False (with a warning).
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         startx = len(self.list['Forked'])   # Counted before setup() is called
         self.setup()
         ### ~ [2] ~ Forking ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.forking()
         donetxt = 'Forking of %s jobs completed.' % (rje.iStr(startx))
         self.printLog('#FORK',donetxt)
     except:  self.errorLog('Forker.run() Error')
     leftover = self.list['Forked']
     if not leftover: return True
     self.warnLog('%s fork jobs remain unforked.' % rje.iLen(leftover))
     return False
Пример #9
0
 def pileUpFDR(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
     '''
     Calculates statistics of genetic differences from parsed PileUp Tables.

     Reads the final column of each `*.pdiff.tdt` line as a p-value, keeps lines with p <= 0.05 and writes them to
     `*.fdr.tdt` in ascending p-value order with an extra 'p.FDR' column, calculated as
     (p * number of non-'?' reference positions) / (number of kept lines with a p-value that small or smaller).
     Reading stops at the first line whose final column is not a number. Nothing is done if the output file
     already exists, unless force is set.
     << None in all cases (errors are logged).
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         fdrfile = '%s.fdr.tdt' % self.baseFile()
         if not self.force() and os.path.exists(fdrfile): return
         sigpval = {}    # pval:[fpos] = file positions of significant lines
         npos = 0
         for refseq in self.dict['RefSeq'].values():
             npos += len(refseq) - refseq.count('?')
         # SAMSIG stays open for section [2], which seeks back to the stored line positions
         with open('%s.pdiff.tdt' % self.baseFile(),'r') as SAMSIG:
             ### ~ [1] Parse out stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             headers = SAMSIG.readline().split() + ['p.FDR']
             fpos = SAMSIG.tell(); fline = SAMSIG.readline(); px = 0
             while fline:
                 self.progLog('\r#SIG','Reading Pvalues: %s p <= 0.05...' % rje.iStr(px))
                 try: pval = float(fline.split()[-1])
                 except (ValueError,IndexError): break   # Blank line or non-numeric final column ends parsing
                 if pval <= 0.05:
                     if pval not in sigpval: sigpval[pval] = []
                     sigpval[pval].append(fpos); px += 1
                 fpos = SAMSIG.tell(); fline = SAMSIG.readline()
             self.printLog('\r#SIG','Reading Pvalues complete: %s p <= 0.05.' % rje.iStr(px))
             ### ~ [2] Calculate FDR and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             with open(fdrfile,'w') as SAMFDR:
                 rje.writeDelimit(SAMFDR, headers)
                 px = 0; sx = 0.0; stot = len(sigpval)
                 for pval in rje.sortKeys(sigpval):
                     self.progLog('\r#FDR','Calculating FDR: %.2f%%' % (sx/stot)); sx += 100.0
                     px += len(sigpval[pval])    # Cumulative number of lines with p <= pval
                     if pval: fdr = (pval * npos) / px
                     else: fdr = 0.0
                     for fpos in sigpval[pval]:
                         SAMSIG.seek(fpos)
                         rje.writeDelimit(SAMFDR,rje.readDelimit(SAMSIG.readline())+[rje.expectString(fdr)])
         self.printLog('\r#FDR','%s FDR lines output to %s' % (rje.iStr(px),fdrfile))
     except: self.errorLog('%s.pileUpFDR() error' % (self)); return None
Пример #10
0
 def saveReadMe(self,
                filename='pydocs.txt',
                append=False):  ### Prints docs for modules to file
     '''
     Prints docs for modules to file.

     For each source directory listed by the 'PyDoc' object, outputs the docstring of every entry in the 'Module'
     table whose key contains that directory name and whose `.py` file exists under the PyDoc 'PyPath' setting.
     >> filename:str = output file name
     >> append:boolean = whether to append to filename rather than start a new file with the readme header
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pydoc = self.obj['PyDoc']
         if append:
             self.printLog('#DOC', 'Appending docstrings to %s' % filename)
         else:
             rje.mkDir(self, filename)
             self.printLog('#DOC', 'Writing docstrings to %s' % filename)
         dx = 0
         # Context manager closes the output file even if output fails part way through
         with open(filename, 'a' if append else 'w') as PYDOC:
             if not append:
                 PYDOC.write(self.readMeHeader())
             db = self.db('Module')
             ### ~ [2] Output Docstrings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             for sourcedir in pydoc.list['SourceDir']:
                 PYDOC.write('-%s:\n\n' % sourcedir)
                 for pyfile in db.dataKeys():
                     entry = db.data(pyfile)
                     module = entry['Module']
                     if sourcedir not in pyfile:
                         continue
                     pypath = '%s%s%s.py' % (pydoc.getStr('PyPath'),
                                             rje.makePath(sourcedir), module)
                     if not os.path.exists(pypath):
                         continue
                     ## ~ [2a] ~ Module docstring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                     mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module,
                                                                  pyfile)
                     while len(mtxt) < 122:  # Pad the header with '~' at both ends
                         mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]
                     try:
                         PYDOC.write('%s\n\n%s\n' % (mtxt, entry['DocString']))
                     except:
                         self.errorLog('Cannot write DocString for %s' % module,
                                       printerror=False)
                         PYDOC.write('%s\n\nDocString Error!\n' % (mtxt))
                     dx += 1  # Counted whether the docstring or the error placeholder was written
                 PYDOC.write('\n\n\n')
         self.printLog(
             '#DOC', 'Output to %s complete: %s modules.' %
             (filename, rje.iStr(dx)))
     except:
         self.errorLog('Error in %s.saveReadMe()' % self.prog())
Пример #11
0
 def run(self):  ### Main run method
     '''
     Main run method: performs setup then forking, logging any error raised.
     << False (with a warning) if self.list['Forked'] still has content afterwards, else True.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         jobx = len(self.list['Forked'])  # Taken before setup() is called
         self.setup()
         ### ~ [2] ~ Forking ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.forking()
         self.printLog('#FORK', 'Forking of %s jobs completed.' % (rje.iStr(jobx)))
     except:
         self.errorLog('Forker.run() Error')
     unforked = self.list['Forked']
     if unforked:
         self.warnLog('%s fork jobs remain unforked.' % rje.iLen(unforked))
     return not unforked
Пример #12
0
 def batchRun(self,returnobj=False):     ### Execute batch mode runs
     '''
     Execute batch mode runs.

     Calls self.run() once for each file in self.list['BatchRun'], adding 'BatchArg=FILE' plus basefile/log
     commands (depending on the 'BatchBase' and 'BatchLog' settings) to the commandline options for that run.
     self.list['BatchRun'] is emptied during the runs to avoid recursion and is restored afterwards.
     >> returnobj:bool [False] = whether to return the objects returned by successful runs, rather than True
     << list of run results, one per batch file (failed runs keep whatever falsy value run() returned);
        False if an error occurred (error is logged).
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         barg = self.getStrLC('BatchArg')
         if not barg: raise ValueError('Cannot use batchrun=FILELIST if batcharg=None.')
         batchfiles = self.list['BatchRun'][0:]
         self.list['BatchRun'] = []  # Avoid recursive running!
         blog = self.getStr('BatchLog')
         if not blog.startswith('.'): blog = '.%s' % blog
         if not blog.endswith('.log'): blog = '%s.log' % blog
         rawcmd = self.cmd_list[0:]
         rawlog = self.log
         batchobj = []
         ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         failx = 0
         for (bx,bfile) in enumerate(batchfiles,1):
             self.printLog('#BATCH','Batch running %s of %s: %s=%s' % (rje.iStr(bx),rje.iLen(batchfiles),barg,bfile))
             ## Setup parameters
             bbase = rje.baseFile(bfile,strip_path=True)
             bcmd = ['%s=%s' % (barg,bfile)]
             if self.getBool('BatchBase'):
                 if blog == '.log': bcmd += ['basefile=%s' % bbase]
                 else: bcmd += ['basefile=%s%s' % (bbase,rje.baseFile(blog))]
             elif self.getStrLC('BatchLog'): bcmd += ['log=%s%s' % (bbase,blog)]
             else: bcmd += ['newlog=F']
             ## Setup Seqsuite object
             self.cmd_list = rawcmd + bcmd
             self.log = rje.setLog(self.log.obj['Info'],self,self.cmd_list)                 # Sets up Log object for controlling log file output
             ## Run
             try: runobj = self.run()
             finally: self.log = rawlog      # Always return to the main log, even if the run raises an error
             batchobj.append(runobj)
             ## Finish and Tidy
             if runobj:
                 if not returnobj: batchobj[-1] = True
                 info = runobj.log.obj['Info']
                 self.printLog('#RUN','%s V%s run finished.' % (info.program,info.version))
             else:
                 failx += 1      # Count every run reported as failed, not only those returning False
                 self.warnLog('Batch run failed (%s=%s).' % (barg,bfile))
         ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#BATCH','%s batch runs complete: %s failed.' % (rje.iLen(batchfiles),rje.iStr(failx)))
         self.list['BatchRun'] = batchfiles
         return batchobj
     except: self.errorLog('%s.batchRun error' % self); return False
Пример #13
0
 def expectedCodonUsage(self):     ### Calculate expected codon usage from Frequency data
     '''
     Calculate expected codon usage from Frequency data.

     Uses the existing 'Code', 'Codons' and 'NT' tables to make and save three tables:
     - 'NTPos' = nucleotide values per 'Pos', summed over all sequences and passed through rje.dictFreq().
     - 'Freq' = per-sequence codon values rescaled within each amino acid, combined with the NT values for
       that sequence.
     - 'Expected' = copy of 'Codons' in which each codon value is replaced by an expected value (see [2]).
     Nothing is returned. Any error is logged and not re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Index of the 'Code' table on 'AA': used below as a mapping of each amino acid to its codons
         aacode = self.db('Code').index('AA')
         nt = ['C','A','G','U']; codons = rje.sortKeys(rje_sequence.genetic_code)
         cdb = self.db('Codons'); ndb = self.db('NT')
         # Work on a copy of the NT table ('NTPos') so that the NT table itself is not altered
         nsumdb = self.db().copyTable(ndb,'NTPos',replace=True)
         nsumdb.dropField('Len')
         # Rename the plain base fields to 'N|All' so they have the same 'N|X' format as the 'N|1'..'N|3' fields
         for n in ['C','A','G','U']: nsumdb.renameField(n,'%s|All' % n)
         # NOTE(review): presumably reshapes the 'N|X' fields into one entry per X with a 'Pos' field - confirm
         nsumdb.reshapeLong('Pos',reshape=['C','A','G','U'])
         # Compress on 'Pos' with 'sum' as the default rule: presumably sums values across sequences - confirm
         nsumdb.compress(['Pos'],{'Pos':'str','Seq':'str'},default='sum')
         nsumdb.dropField('Seq'); nsumdb.addField('Total')
         for entry in nsumdb.entries():
             # 'Pos' and 'Total' are removed before rje.dictFreq() is called, then 'Pos' is restored
             # NOTE(review): rje.dictFreq() presumably converts the values to frequencies in place and sets a
             # 'Total' key (a 'Total' key is popped from its output in [2]) - confirm against rje
             pos = entry.pop('Pos'); entry.pop('Total')
             rje.dictFreq(entry)
             entry['Pos'] = pos
         nsumdb.saveToFile()
         # Only the entry with key '3' is used for the expected values: its values are applied to all three
         # bases of each codon below. NOTE(review): presumably codon position 3 - confirm this is intended
         nexentry = nsumdb.data('3')
         fdb = self.db().addEmptyTable('Freq',['Seq','Len']+nt+codons+['Total'],['Seq'])
         edb = self.db().copyTable(cdb,'Expected',replace=True)
         ### ~ [2] Calculate Frequencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         x = 0.0; etot = cdb.entryNum()
         for oldentry in cdb.entries():
             self.progLog('\r#FREQ','Calculating Frequencies: %.2f%%' % (x/etot)); x += 100.0
             # Combine into an empty dictionary so that changes are not made to the oldentry object itself
             entry = rje.combineDict({},oldentry)
             seq = entry['Seq']; entry['Total'] = entry.pop('Len')   # Codon 'Len' is output as 'Total'
             exentry = edb.data(seq)     # 'Expected' table entry for this sequence: updated directly
             ntentry = rje.combineDict({},ndb.data()[seq])
             ntentry.pop('Seq'); ntentry.pop('Len')
             rje.dictFreq(ntentry)
             ntentry['Len'] = ntentry.pop('Total')   # The 'Total' left by dictFreq is output as 'Len'
             for aa in aacode:
                 # ax = summed observed codon values for this amino acid; ex = summed raw expected values
                 ax = 0.0; ex = 0.0
                 for codon in aacode[aa]:
                     ax += entry[codon]
                     # Raw expected value = product of the nexentry values for the three bases of the codon
                     exentry[codon] = nexentry[codon[0]] * nexentry[codon[1]] * nexentry[codon[2]]
                     ex += exentry[codon]
                 for codon in aacode[aa]:
                     # Observed value relative to equal use of all codons for the amino acid (1.0 = equal use)
                     if ax: entry[codon] = len(aacode[aa]) * entry[codon] / ax
                     else: entry[codon] = 0.0
                     # Expected values are rescaled so the codons of the amino acid sum to the observed total
                     # NOTE(review): no guard for ex == 0; the ZeroDivisionError would end up in the except below
                     exentry[codon] = ax * (exentry[codon] / ex)
             fdb.addEntry(rje.combineDict(entry,ntentry))
         self.printLog('\r#Freq','Frequencies calculated for %s entries' % rje.iStr(etot))
         fdb.saveToFile(); edb.saveToFile()
     except: self.errorLog('%s.expectedCodonUsage error' % self)
Пример #14
0
 def tidyMotifNames(self, dbtable):  ### Tidy the motif names in given dbtable
     '''
     Replace each 'motif' value in dbtable with the core name given by the SLiMList object.
     >> dbtable:db table object = table to update. Left untouched if it has no 'motif' field.
     << number of motif names changed (0 if there is no 'motif' field). Errors are logged and re-raised.
     '''
     try:  ### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         slimlist = self.obj['SLiMList']
         if 'motif' in dbtable.fields():
             renamed = 0
             for row in dbtable.entries():
                 corename = slimlist.slimCoreName(row['motif'])
                 if corename == row['motif']: continue
                 row['motif'] = corename
                 renamed += 1
             logtxt = '%s motif names corrected for SLiMList splitting.' % rje.iStr(renamed)
             self.printLog('#MOTIF', logtxt)
             # Table keys are only remade if at least one name has changed
             if renamed: dbtable.remakeKeys()
             return renamed
         return 0
     except:
         self.errorLog('Problem during %s tidyMotifNames.' % self.prog())
         raise
Пример #15
0
 def run(self,save=True):  ### Main run method
     '''
     Main run method: parse the MITAB data and return the 'pairwise' table.
     >> save:bool [True] = whether to compress, report, save and index the pairwise table before returning it
     << the 'pairwise' db table, or False if setup fails. Errors are logged and re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.setup(): return False
         ### ~ [2] ~ Parse and process PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.parseMITAB()
         pairdb = self.db('pairwise')
         if not save: return pairdb
         pairfields = ['Hub','Spoke','HubTaxID','SpokeTaxID']
         pairdb.compress(pairfields,rules={'Evidence':'list','IType':'list'},joinchar='|')
         pairdb.dropField('#')
         symmetry = self.getBool('Symmetry')
         self.printLog('#PPI','%s unique pairwise PPI (Symmetry=%s)' % (rje.iStr(pairdb.entryNum()),symmetry))
         pairdb.saveToFile()
         # Index and report the '|'-joined fields: Evidence first, then IType
         for (field,logcode) in [('Evidence','#METHOD'),('IType','#ITYPE')]:
             pairdb.index(field,splitchar='|')
             pairdb.indexReport(field,logcode)
         return pairdb
     except:
         self.errorLog(self.zen())
         raise   # Delete this if method error not terrible
Пример #16
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method.

     Sets up the database object and self.list['Accuracy'], then:
     - Checks 'SMRTUnits' is one of reads/Gb/Mb (any case). If not, new units are picked from the size of
       'SMRTReads' (<10 = Gb; >10000 = reads; else Mb) and either set directly (self.i() < 0) or offered to the
       user. If declined when self.i() > 0, the user is asked for the units to use instead.
     - Converts 'SMRTReads' given in Gb or Mb into a number of reads using 'AvRead'.
     - Converts self.list['XnList'] into a sorted list of integers, dropping empty values, 0 and 1. Values that
       cannot be converted are logged as errors and dropped.
     << True if setup successful, False if an error occurred (error is logged).
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
         self.db().basefile(self.basefile())
         self.list['Accuracy'] = [0,1.0 - self.getNum('ErrPerBase')]
         ## ~ [1a] SMRTReads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         while self.getStrLC('SMRTUnits') not in ['reads','gb','mb']:
             # The unrecognised value is now inserted: previously the literal '%s' was output in its place
             txt = 'SMRTUnits "%s" not recognised' % self.getStr('SMRTUnits')
             if self.getNum('SMRTReads') < 10: smrtunits = 'Gb'
             elif self.getNum('SMRTReads') > 10000: smrtunits = 'reads'
             else: smrtunits = 'Mb'
             if self.i() < 0 or rje.yesNo('%s: switch to (%s) %s?' % (txt,self.getNum('SMRTReads'),smrtunits)):
                 self.setStr({'SMRTUnits':smrtunits})
             elif self.i() >0: self.setStr({'SMRTUnits':rje.choice('SMRTUnits (reads/Gb/Mb)?')})
             self.printLog('#UNITS','%s => %s' % (txt,self.getStr('SMRTUnits')))
         if self.getStrLC('SMRTUnits') in ['gb','mb']:
             smrttotal = self.getNum('SMRTReads') * {'gb':1e9,'mb':1e6}[self.getStrLC('SMRTUnits')]
             txt =  '%s %s @ %.3f kb/read' % (self.getNum('SMRTReads'),self.getStr('SMRTUnits'),self.getNum('AvRead')/1000.0)
             self.setNum({'SMRTReads':smrttotal/self.getNum('AvRead')})
             txt += ' => %s reads' % rje.iStr(int(self.getNum('SMRTReads')))
             self.printLog('#READS',txt)
         ## ~ [1b] XnList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         xnlist = []
         for xn in self.list['XnList']:
             if xn == '': continue
             try:
                 ixn = int(xn)
                 if xn not in [ixn,'%d' % ixn]: self.printLog('#XN','"%s" -> %dX' % (xn,ixn))
                 if ixn == 0: self.printLog('#XN','No point in 0X output: use 1-%Coverage.')
                 elif ixn == 1: self.printLog('#XN','No point in 1X output: use %Coverage.')
                 else: xnlist.append(ixn)
             except: self.errorLog('Could not process %s as part of XnList. (Integers only.)' % xn)
         xnlist.sort()
         # Joined directly, e.g. "2X, 5X, 10X." (splitting the list repr gave double spaces after each comma)
         if xnlist: self.printLog('#XN','XnList: %sX.' % 'X, '.join(['%d' % ixn for ixn in xnlist]))
         self.list['XnList'] = xnlist
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self.prog()); return False  # Setup failed
Пример #17
0
 def saveReadMe(self,filename='pydocs.txt',append=False):      ### Prints docs for modules to file
     '''
     Prints docs for modules to file.

     For each source directory listed by the 'PyDoc' object, outputs the docstring of every entry in the 'Module'
     table whose key contains that directory name and whose `.py` file exists under the PyDoc 'PyPath' setting.
     >> filename:str = output file name
     >> append:boolean = whether to append to filename rather than start a new file with the readme header
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pydoc = self.obj['PyDoc']
         if append: self.printLog('#DOC','Appending docstrings to %s' % filename)
         else:
             rje.mkDir(self,filename)
             self.printLog('#DOC','Writing docstrings to %s' % filename)
         dx = 0
         with open(filename,'a' if append else 'w') as PYDOC:    # Closed even if output fails part way through
             if not append: PYDOC.write(self.readMeHeader())
             db = self.db('Module')
             ### ~ [2] Output Docstrings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             for sourcedir in pydoc.list['SourceDir']:
                 PYDOC.write('-%s:\n\n' % sourcedir)
                 for pyfile in db.dataKeys():
                     entry = db.data(pyfile)
                     module = entry['Module']
                     if sourcedir not in pyfile: continue
                     if not os.path.exists('%s%s%s.py' % (pydoc.getStr('PyPath'),rje.makePath(sourcedir),module)): continue
                     ## ~ [2a] ~ Module docstring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                     mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module,pyfile)
                     while len(mtxt) < 122: mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]   # Pad header with '~'
                     try: PYDOC.write('%s\n\n%s\n' % (mtxt,entry['DocString']))
                     except:
                         self.errorLog('Cannot write DocString for %s' % module,printerror=False)
                         PYDOC.write('%s\n\nDocString Error!\n' % (mtxt))
                     dx += 1     # Counted whether the docstring or the error placeholder was written
                 PYDOC.write('\n\n\n')
         self.printLog('#DOC','Output to %s complete: %s modules.' % (filename,rje.iStr(dx)))
     except: self.errorLog('Error in %s.saveReadMe()' % self.prog())
Пример #18
0
    def inSilicoHybrid(self):  ### Filter and combine subreads from parent and output to fasta file.
        '''
        Filter and combine subreads from parent and output to fasta file.

        This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each
        parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing.)
        Each parent needs a `*.fofn` file of subread file names, `*.unique.tdt` unique subreads table and `*.smrt.tdt`
        SMRT cell identifier table.

        A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking the
        unique subreads from each parent by length. First, the longest subread from each parent are compared and the shortest
        selected to be the first subread of the diploid. (The shortest is taken to minimise length differences between the
        two parents.) Next, the longest subread from the next parent that is no longer than the previous subread is added.
        This cycles, picking a read from the parent with fewest cumulative bases each cycle. The longest subread that is
        no longer than the previous subread is selected. This continues until one parent runs out of subreads. Additional
        subreads will be added from the other parent if they reduce the difference in cumulative output for each parent.

        Final output will be a `*.subreads.fasta` file in which each parent has a similar total sequence content and for
        which the subread length distributions should also be similar. This is to overcome biases in resulting diploid
        assemblies, where one parent has higher quality data than the other.

        NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has much
        higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to reads with a
        minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may need to be
        relaxed.

        << Returns True if the filtered fasta file was written and summarised. Any error is reported via errorLog() and
        False is returned.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent1: %s' % self.getStr('Parent1'))
            base1 = rje.baseFile(self.getStr('Parent1'))
            parent1 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent1'),
                 'basefile=%s' % base1])
            parent1.setup()
            udb1 = parent1.udb()
            cdb = parent1.db('smrt', add=True, mainkeys=['Name'])
            cdb.dataFormat({'SMRT': 'int'})
            cx = cdb.entryNum()  # Number of Parent1 SMRT cells = offset for Parent2 SMRT IDs
            ## ~ [0b] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent2: %s' % self.getStr('Parent2'))
            base2 = rje.baseFile(self.getStr('Parent2'))
            parent2 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent2'),
                 'basefile=%s' % base2])
            parent2.setup()
            udb2 = parent2.udb()
            cdb2 = parent2.db('smrt', add=True, mainkeys=['Name'])
            cdb2.dataFormat({'SMRT': 'int'})
            # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1
            for entry in cdb2.entries() + udb2.entries():
                entry['SMRT'] = entry['SMRT'] + cx
            cdb = parent1.db().mergeTables(cdb, cdb2)
            ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #'
            )
            minlen = self.getInt('LenFilter')
            minrq = self.getNum('RQFilter')
            rqstr = '%s' % minrq
            # rqstr[2:] drops the first two characters of the RQ string (the leading "0." of e.g. 0.84)
            filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(), minlen,
                                             rqstr[2:])
            ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqbatch = []  # List of SeqList objects
            self.printLog(
                '#BATCH', '%s sequence files to process.' %
                rje.iLen(parent1.list['Batch'] + parent2.list['Batch']))
            for seqfile in parent1.list['Batch'] + parent2.list['Batch']:
                seqcmd = self.cmd_list + [
                    'seqmode=file', 'autoload=T', 'summarise=F',
                    'seqin=%s' % seqfile, 'autofilter=F'
                ]
                seqbatch.append(rje_seqlist.SeqList(self.log, seqcmd))
            self.printLog(
                '#BATCH',
                '%s sequence files to summarise.' % rje.iLen(seqbatch))
            if not seqbatch:
                raise IOError(
                    'No batch input fasta files found! Make sure parentN=FILE settings given *.fofn.'
                )
            ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elists = [
                udb1.sortedEntries('Len', reverse=True),
                udb2.sortedEntries('Len', reverse=True)
            ]
            plen = [0, 0]  # Summed lengths for each parent
            pseq = [0, 0]  # Total sequence number for each parent
            prq = [0, 0]  # Total sequence RQ for each parent (convert to mean)
            if not elists[0] or not elists[1]:
                raise ValueError(
                    'No Unique ZMW subreads for one or both parents!')
            lastlen = max(elists[0][0]['Len'],
                          elists[1][0]['Len'])  # Length of last selected read
            # Strip leading reads that fail the RQ filter so the first comparison uses usable reads
            for elist in elists:
                while elist and elist[0]['RQ'] < minrq:
                    elist.pop(0)
            if not elists[0] or not elists[1]:
                raise ValueError(
                    'No Unique ZMW subreads for one or both parents!')
            nextp = 0  # Index of next parent to use
            # Start with the parent whose longest usable read is the shorter of the two
            if elists[0][0]['Len'] < elists[1][0]['Len']: nextp = 1

            ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            zmwlist = []  # List of (smrt,zmw,pos) meeting filtering criteria
            ux = 0.0
            utot = len(elists[0]) + len(elists[1])
            while lastlen:
                self.progLog('\r#DIP',
                             'Diploidising subreads: %.2f%%' % (ux / utot))
                elist = elists[nextp]
                while elist and elist[0]['RQ'] < minrq:
                    elist.pop(0)
                    ux += 100.0
                # Lists are sorted by length, so once the head is too short nothing else can pass
                if elist and elist[0]['Len'] < minlen:
                    ux += 100.0 * len(elist)
                    elist = []
                if not elist:
                    nextp = 1 - nextp
                    break  # Finish
                entry = elist.pop(0)
                ux += 100.0
                zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
                # Switch parent whenever the other one does not have more cumulative bases
                if plen[1 - nextp] <= plen[nextp]: nextp = 1 - nextp
                lastlen = entry['Len']
            ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            #i# One parent is exhausted: top up from the other while each read narrows the gap in total bases.
            #i# Every read is popped exactly once, so the loop always terminates, and the RQ filter is applied to
            #i# every candidate read rather than only to the head of the list.
            elist = elists[nextp]
            while elist:
                self.progLog('\r#DIP',
                             'Diploidising subreads: %.2f%%' % (ux / utot))
                entry = elist.pop(0)
                ux += 100.0
                if entry['RQ'] < minrq: continue  # Fails RQ filter: try the next read
                if entry['Len'] < minlen: break  # Sorted by length: no remaining read can pass
                pdiff = rje.modulus(plen[0] - plen[1])
                ediff = rje.modulus(plen[nextp] + entry['Len'] -
                                    plen[1 - nextp])
                if ediff >= pdiff: break  # Adding this read would not reduce the imbalance: finish
                zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
            self.printLog(
                '\r#DIP',
                'Diploidising subreads complete: %s subreads to output.' %
                rje.iLen(zmwlist))
            # Fail with an explicit message rather than a ZeroDivisionError in the mean RQ calculation below
            if not pseq[0] or not pseq[1]:
                raise ValueError(
                    'No subreads selected for one or both parents! Check LenFilter and RQFilter settings.')
            self.printLog(
                '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' %
                (self.getStr('Parent1'), rje.iStr(pseq[0]), rje.iStr(plen[0]),
                 1.0 * plen[0] / self.getInt('GenomeSize'), prq[0] / pseq[0]))
            self.printLog(
                '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' %
                (self.getStr('Parent2'), rje.iStr(pseq[1]), rje.iStr(plen[1]),
                 1.0 * plen[1] / self.getInt('GenomeSize'), prq[1] / pseq[1]))
            ## ~ [1c] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            rje.backup(self, filtfile)
            zmwset = set(zmwlist)  # Hashed lookup: one membership test per input sequence
            sx = 0.0
            sn = len(seqbatch)
            fx = 0
            SEQOUT = open(filtfile, 'w')
            try:
                for seqlist in seqbatch:
                    #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784
                    seqnum = seqlist.seqNum()
                    if not seqnum:  # Empty input file: nothing to extract and no division by zero
                        sx += 100.0
                        continue
                    si = 100.0 / seqnum
                    for seq in seqlist.seqs():
                        self.progLog('\r#OUT',
                                     'Extracting subreads: %.2f%%' % (sx / sn))
                        sx += si
                        (name, sequence) = seqlist.getSeq(seq)
                        # Name is smrt/zmw/pos with an optional fourth (RQ=X) field that is not needed here
                        namedata = string.split(string.replace(name, '/', ' '))
                        if len(namedata) == 4: namedata = namedata[:3]
                        [smrt, zmw, pos] = namedata  # ValueError for any other field count
                        if (cdb.data(smrt)['SMRT'], int(zmw), pos) not in zmwset:
                            continue
                        SEQOUT.write('>%s\n%s\n' % (name, sequence))
                        fx += 1
            finally:
                SEQOUT.close()  # Must be flushed and closed before the file is re-read in [2]
            self.printLog(
                '\r#OUT',
                'Saved %s filtered subreads to %s.' % (rje.iStr(fx), filtfile))

            ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seqcmd = self.cmd_list + [
                'seqmode=file', 'autoload=T', 'summarise=T',
                'seqin=%s' % filtfile, 'autofilter=F'
            ]
            rje_seqlist.SeqList(self.log, seqcmd)

            return True
        except:
            self.errorLog('%s.run error' % self.prog())
            return False
Пример #19
0
    def filterSPCode(self):     ### Filters species codes according to mincount and shared taxa at different levels.
        '''
        Filters species codes according to bootstrap support, shared taxa at different levels and minimum score.

        Works in place on the entries of the 'spcode' table, whose 'spcode' field is a '|'-separated list of codes:
        1. Entries with spcode 'None' have 'boot' set to the NoneBoot setting and are otherwise left alone.
        2. Entries with 'boot' below the BootFilter setting have spcode replaced by 'Uncertain'.
        3. For the remaining entries, any code that is a parent taxon of another code in the same entry is removed,
           as is any code with no mapped parent codes when the entry still lists other codes.
        4. Each surviving code accumulates a score across entries (the entry 'boot' if BootWeight is set, else 1.0,
           divided by the number of codes left in the entry). If MinScore > 0, codes scoring below MinScore are then
           removed, unless that would leave an entry with no code at all, in which case the entry is left unchanged.
        Errors are reported through errorLog(); nothing is returned.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.db()
            tax = self.obj['Taxonomy']
            bootfilter = self.getNum('BootFilter')
            minscore = self.getNum('MinScore')
            ### ~ [2] ~ Filter parent taxa and unmapped codes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            specdb = self.db('spcode')
            parents = {}    # Dictionary of {spcode:[parent spcodes]}; empty list = no mapped parents
            taxsum = {}     # Dictionary of {spcode:summed score}
            # Reduced according to low abundance and/or higher level taxa of species in clade
            fx = 0; bfx = 0; ufx = 0
            for ekey in specdb.dataKeys():
                entry = specdb.data(ekey)
                if entry['spcode'] == 'None':
                    entry['boot'] = self.getNum('NoneBoot')
                    continue
                if entry['boot'] < bootfilter:
                    self.printLog('#FILT','%s: filtered -> "Uncertain" (bootstrap %s < bootfilter=%s).' % (entry['protein'],entry['boot'],bootfilter))
                    entry['spcode'] = 'Uncertain'; bfx += 1
                    continue
                spcodes = string.split(entry['spcode'],'|')
                for spcode in spcodes[0:]:      # Iterate over a copy: spcodes is edited inside the loop
                    #i# A code already removed as the parent of an earlier code needs no more work. Processing it
                    #i# again could call spcodes.remove() on a missing item and abort the whole filter.
                    if spcode not in spcodes: continue
                    if spcode not in parents:
                        parents[spcode] = []
                        #i# A failed mapping leaves the parent list empty and falls through to the unmapped
                        #i# filter below, so a code is treated the same way for the first protein as for later ones.
                        try: taxid = tax.mapToTaxID(spcode,nodeonly=True,warn=False)[0]
                        except Exception: taxid = None
                        while taxid is not None and taxid in tax.dict['Parent']:
                            taxid = tax.dict['Parent'][taxid]
                            parsp = tax.getSpCode(taxid,invent=False,warn=False)
                            if parsp: parents[spcode].append(parsp)
                    if not parents[spcode] and len(spcodes) > 1:
                        self.printLog('#FILT','%s: filtered unmapped spcode %s.' % (entry['protein'],spcode))
                        spcodes.remove(spcode); ufx += 1
                    for parsp in parents[spcode]:
                        if parsp in spcodes:
                            self.printLog('#FILT','%s: filtered %s as parent of %s.' % (entry['protein'],parsp,spcode))
                            spcodes.remove(parsp); fx += 1
                # Share the weight of this entry equally between its remaining codes
                if self.getBool('BootWeight'): taxweight = entry['boot']
                else: taxweight = 1.0
                for taxon in spcodes:
                    if taxon not in taxsum: taxsum[taxon] = 0.0
                    taxsum[taxon] += taxweight / len(spcodes)
                entry['spcode'] = string.join(spcodes,'|')
            self.printLog('#FILT','Filtered %s species codes with co-occurring child taxa' % rje.iStr(fx))
            self.printLog('#FILT','Filtered %s unmapped species codes with co-occurring mapped taxa' % rje.iStr(ufx))
            if bootfilter > 0.0: self.printLog('#FILT','Filtered %s proteins with bootstrap < bootfilter=%s' % (rje.iStr(bfx),bootfilter))

            ### ~ [3] ~ Filter low scoring codes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            fx = 0
            for ekey in specdb.dataKeys():
                entry = specdb.data(ekey)
                if entry['spcode'] in ['None','Uncertain']: continue
                spcodes = string.split(entry['spcode'],'|')
                for spcode in spcodes[0:]:      # Iterate over a copy: spcodes is edited inside the loop
                    if minscore > 0 and minscore > taxsum[spcode]:
                        self.printLog('#FILT','%s: filtered %s < minscore=%s.' % (entry['protein'],spcode,minscore))
                        spcodes.remove(spcode); fx += 1
                # Never leave a protein without any code: keep the unfiltered list instead
                if spcodes: entry['spcode'] = string.join(spcodes,'|')
                else: self.printLog('#FILT','%s filter aborted: no spcode left!' % (entry['protein']))
            self.printLog('#FILT','Filtered %s species codes failing to meet minscore=%s.' % (rje.iStr(fx),minscore))

        except: self.errorLog('%s.filterSPCode error' % self.prog())
Пример #20
0
    def taxaMap(self):      ### Maps species codes onto different taxonomic ranks.
        '''
        Maps species codes onto different taxonomic ranks.

        Builds and saves two tables:
        - 'taxa' (key: spcode) = the TaxID, name and genus/family/order/class/phylum taxon for each species code.
        - 'taxamap' (key: protein) = the 'spcode' table entry for each protein plus one field per rank (and 'desc').
        If the ProtDesc setting is given, proteins listed in self.dict['ProtDesc'] that are missing from the 'spcode'
        table are first added to it with spcode 'None'. Errors are reported through errorLog(); nothing is returned.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.db()
            tax = self.obj['Taxonomy']
            specdb = self.db('spcode')
            ranks = ['genus','family','order','class','phylum']
            rankfields = ['protein'] + ranks + specdb.fields()[1:]
            ## ~ [1a] Add proteins that have a description but no species code entry ~~~~~~~~~~~~~ ##
            if self.getStrLC('ProtDesc'):
                rankfields.append('desc')
                addx = 0
                for prot in self.dict['ProtDesc']:
                    # Skip blank names and what are presumably header values
                    if prot.lower() in ['','protein','gene']: continue
                    newprot = {'protein':prot,'spcode':'None','boot':self.getNum('NoneBoot')}
                    if specdb.makeKey(newprot) in specdb.dataKeys(): continue
                    specdb.addEntry(newprot)
                    addx += 1
                self.printLog('#PROT','Added %s proteins from %s without trees.' % (rje.iStr(addx),self.getStr('ProtDesc')))
            ## ~ [1b] Output tables and per-rank {spcode:taxon} lookups ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            rankdb = db.addEmptyTable('taxamap',rankfields,['protein'])
            rankmap = {}    # {rank:{spcode:taxon}}, pre-seeded with the special codes
            for rank in ranks:
                rankmap[rank] = {'None':'None','Unmapped':'Unmapped','Uncertain':'Uncertain'}
            taxdb = db.addEmptyTable('taxa',['spcode','taxid','name']+ranks,['spcode'])

            ### ~ [2] Map each protein ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            stot = specdb.entryNum()
            sx = 0.0
            for entry in specdb.entries():
                self.progLog('\r#SPEC','Processing species: %.2f%%' % (sx/stot))
                sx += 100.0
                try: entry['desc'] = self.dict['ProtDesc'][entry['protein']]
                except: entry['desc'] = ''
                spcodes = string.split(entry['spcode'],'|')
                ## ~ [2a] Map any new species codes onto each rank ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for spcode in spcodes:
                    if spcode in rankmap['genus']: continue     # Already mapped (or a special code)
                    tentry = {'spcode':spcode}
                    try:
                        taxid = tax.mapToTaxID(spcode,nodeonly=True,warn=False)[0]
                        rank = tax.dict['Rank'][taxid]
                        tentry['taxid'] = taxid
                        tentry['name'] = tax.getSpecies(taxid)
                    except:
                        self.warnLog('Unable to map species code "%s" to TaxID -> "Unmapped"' % spcode)
                        taxid = 'Unmapped'
                        rank = 'genus'
                    # taxid/rank carry over between ranks: each rank resumes climbing where the last stopped
                    for (ri,nextrank) in enumerate(ranks):
                        wanted = ranks[ri:]
                        while rank not in wanted and taxid in tax.dict['Parent']:
                            taxid = tax.dict['Parent'][taxid]
                            rank = tax.dict['Rank'][taxid]
                        taxon = 'Unmapped'
                        if taxid in tax.dict['Parent']: taxon = tax.getSpecies(taxid)
                        if rank != nextrank:    # No taxon at exactly this rank in the lineage
                            if self.getBool('Monophyly'): taxon = 'Uncertain'
                            else: taxon = '%s %s.' % (taxon,nextrank[:3])
                        rankmap[nextrank][spcode] = taxon
                        tentry[nextrank] = taxon
                    taxdb.addEntry(tentry)
                ## ~ [2b] Combine the taxa of all species codes for this protein ~~~~~~~~~~~~~~~~~ ##
                rentry = {}
                for nextrank in ranks:
                    taxa = []
                    unmapped = ''
                    for spcode in spcodes:
                        ranktax = rankmap[nextrank][spcode]
                        if ranktax in taxa: continue
                        if 'unmapped' in ranktax.lower():
                            if unmapped: self.warnLog('Two Unmapped %s taxa: %s & %s' % (nextrank,unmapped,ranktax))
                            unmapped = ranktax   #i# Should only be one
                        taxa.append(ranktax)
                    # 'None' and unmapped taxa are only kept when there is nothing else to report
                    if len(taxa) > 1 and 'None' in taxa:
                        self.warnLog('None in: %s' % string.join(rje.sortUnique(taxa),'|'))
                        taxa.remove('None')
                    if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                    if len(taxa) > 1 and self.getBool('Monophyly'):
                        rentry[nextrank] = 'Uncertain'
                    else:
                        rentry[nextrank] = string.join(rje.sortUnique(taxa),'|')
                rankdb.addEntry(rje.combineDict(rentry,entry))
            self.printLog('\r#SPEC','%s proteins with species codes processed.' % rje.iStr(stot))
            rankdb.saveToFile()
            taxdb.saveToFile()
        except: self.errorLog('%s.taxaMap error' %  self.prog())
Пример #21
0
 def topTerms(self,slimx=20,parents=False,total='Total',countkey='counts'):  ### Selects top terms for GO slim set
     '''
     Selects top terms for GO slim set.
     >> slimx:int [20] = Desired min. number of terms for each GO domain.
     >> parents:bool [False] = Whether parents and children both allowed in list
     >> total:str ['Total'] = Sample containing Total counts for assessment
     >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
     - self.go(id)[countkey] = {Sample:count}
     << returns a list of GO IDs that meet criteria. An empty list is returned if an error is logged.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         #x#self.opt['DeBug'] = True
         terms = []                          # List of terms
         dom = {'cc':{},'bp':{},'mf':{}}     # Dictionary of {domain:{count:[IDs]}}
         for goid in self.go():
             godata = self.go(goid)
             n = godata[countkey][total]
             gotype = godata['type']
             if n not in dom[gotype]: dom[gotype][n] = [goid]
             else: dom[gotype][n].append(goid)
         ### ~ [2] ~ Generate Top Terms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.deBug(dom)
         for gotype in dom:
             dterms = []                     # Terms for this domain only
             n = 0                           # Lowest count used; reset so an empty domain does not report a stale value
             dkeys = rje.sortKeys(dom[gotype])   # Counts, low to high
             dkeys.reverse()                 # Counts, high to low
             (dx,dtot) = (0.0,len(dkeys))
             while dkeys and len(dterms) < slimx: # Keep looping
                 self.deBug('%s: %s' % (gotype,dterms))
                 self.progLog('#TOP','Generating top %d %s terms: %.1f%%' % (slimx,gotype,dx/dtot))
                 dx += 100.0
                 n = dkeys.pop(0)            # Remove from list
                 dterms += dom[gotype][n]    # Add terms to term list
                 if parents: continue        # Don't care if parents and children all mixed up
                 for goid in dterms[0:]:     # Iterate over a copy: dterms is edited inside the loop
                     if goid not in dterms: continue             # Previously-removed parent
                     for par in self.parents(goid):              # Check all parents
                         if par in dterms: dterms.remove(par)    # Remove parent term
             self.printLog('\r#TOP','Identified %s top %s terms: >= %s genes' % (rje.iLen(dterms),gotype,rje.iStr(n)))
             terms += dterms                 # Found a stable list of terms
         self.deBug(terms)
         return terms
     except: self.errorLog('Major problem with GO.topTerms()')
     return []
Пример #22
0
 def gopher(self):  ### Sets up data for GOPHER run
     '''
     Sets up data for GOPHER run.

     Formats the `*.ygob.fas` and `*.yeast.fas` BLAST databases, then converts each YGOB pillar in
     self.list['Pillars'] into the short sequence names of its members found in self.obj['SeqList']. For each
     pillar member that is listed in self.list['YeastSeq'] (or, if that list is empty, has SpecCode 'YEAST'), the
     converted pillar is written to `BLAST/<acc>.blast.id`, one name per line. The pillar ID to accession mapping
     is stored in self.dict['PillarMap']. Errors are logged and then re-raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.mkDir(self, 'BLAST/')
         rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(
             fasfile='%s.ygob.fas' % self.info['Basefile'],
             protein=True,
             force=False)
         rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(
             fasfile='%s.yeast.fas' % self.info['Basefile'],
             protein=True,
             force=False)
         seqdict = self.obj['SeqList'].seqNameDic('AccNum')
         ymap = self.dict['PillarMap'] = {}  # {pillar ID:accession used in fasta file}
         ### ~ [2] Convert Pillars to BLAST IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (px, ptot) = (0.0, len(self.list['Pillars']))
         ox = 0  # Number of BLAST ID files written
         for pillar in self.list['Pillars']:
             self.progLog(
                 '\r#YGOB',
                 'Converting YGOB Pillars for GOPHER: %.2f%%' % (px / ptot))
             px += 100
             newpillar = []  # Short names of pillar members found in the fasta file
             for yid in pillar:
                 # Use a Sequence object to parse the accession number out of the pillar ID
                 seq = rje_sequence.Sequence(self.log, self.cmd_list)
                 seq.opt['Yeast'] = True
                 seq.info['Name'] = yid
                 seq.extractDetails(gnspacc=True)
                 ygob = seq.info['AccNum']
                 if ygob in self.dict['Rename']:
                     acc = self.dict['Rename'][ygob]
                 else:
                     acc = ygob
                 ymap[yid] = acc
                 if acc not in seqdict:
                     self.printLog(
                         '\r#GENE',
                         'Non-coding gene %s (%s)? Cannot find in fasta file'
                         % (acc, yid))
                     continue
                 try:
                     newpillar.append(seqdict[acc].shortName())
                 except:
                     # Same output as the old print statement, but valid syntax in Python 2 and 3
                     print('%s %s %s' % (yid, ygob, acc))
                     self.errorLog(rje_zen.Zen().wisdom())
             if not newpillar: continue
             idtext = string.join(newpillar, '\n')  # Same content for every file of this pillar
             for ygob in pillar:
                 acc = ymap[ygob]
                 if acc not in seqdict: continue
                 if acc in self.list['YeastSeq'] or (
                         not self.list['YeastSeq']
                         and seqdict[acc].info['SpecCode'] == 'YEAST'):
                     idfile = rje.makePath('BLAST/%s.blast.id' % acc,
                                           wholepath=True)
                     IDOUT = open(idfile, 'w')
                     try:
                         IDOUT.write(idtext)
                     finally:
                         IDOUT.close()  # Do not leave a handle open for every pillar member
                     ox += 1
         # printLog (not progLog) so the final summary is treated as a completed log line, not transient progress
         self.printLog(
             '\r#YGOB',
             'Converted YGOB Pillars for GOPHER: %s BLAST ID files.' %
             rje.iStr(ox))
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Пример #23
0
    def alignmentToLocal(self,alignment=[],protqry=False):    ### Converts alignment into local hits table
        '''
        Converts alignment into local hits table.
        >> alignment:list of alignment text strings parsed from exonerate output.
        >> protqry:bool[False] = Whether query is protein
        << returns local database table.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            vfields = ['Qry','Hit','AlnID','Score','Expect','Length','Identity','Positives','QryStart','QryEnd','HitStart','HitEnd','QrySeq','HitSeq','AlnSeq','Rank','Phase','HitStrand']
            vdb = self.db().addEmptyTable('local',vfields,['Qry','Hit','AlnID'])

            ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            '''
                     Query: FAXD1_NOTSC (P82807) Venom prothrombin activator notecarin-D1 [Notechis scutatus scutatus]
                    Target: ahap_PSETE__EBS10XV2AHAP187 haploidB edges=694320..157489 left=833615 right=281503 ver=1.9 style=4:[revcomp]
                     Model: protein2genome:local
                 Raw score: 1170
               Query range: 19 -> 295
              Target range: 12312786 -> 12307250
            
                   20 : AlaGluSerAsnValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg :       37
                        ..!...|||   ||||||||||||||||||||||||||||||||||||||||||
                        CysSerSerLeuValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg
             12312786 : TGTTCTTCTTTAGTATTCTTAAAAAGCAAAGTGGCAAATAGATTTTTGCAAAGA : 12312735
            
                  264 : {G}  >>>> Target Intron 7 >>>>  {ly}GluIleAspIleSerArg :      270
                        {|}           1304 bp           {||}|||||||||||||||!!!
                        {G}++                         ++{ly}GluIleAspIleSerSer
             12308652 : {G}gt.........................ag{GG}GAAATAGACATATCAAGC : 12307328
            
                  289 : ValProProAsnTyrTyrTyr :      295
                        |||||| !!!..||| !!|||
                        ValProAlaThrTyrAspTyr
             12307273 : GTTCCTGCCACGTATGACTAT : 12307251
            '''
            qry = None
            hit = None
            alnx = {}
            ventry = {}
            parsing = alignment[0:]
            rank = 1

            while parsing:
                line = parsing.pop(0)
                #self.bugPrint(line)
                # Query
                if rje.matchExp('Query: (\S+)',line):
                    if ventry: vdb.addEntry(ventry)
                    ventry = {'Qry':rje.matchExp('Query: (\S+)',line)[0],'QrySeq':'','HitSeq':'','AlnSeq':'','Rank':rank}
                    rank += 1
                # Hit
                if rje.matchExp('Target: (\S+)',line):
                    ventry['Hit'] = rje.matchExp('Target: (\S+)',line)[0]
                    qh = (ventry['Qry'],ventry['Hit'])
                    if qh in alnx: alnx[qh] += 1
                    else: alnx[qh] = 1
                    ventry['AlnID'] = alnx[qh]
                # Score
                if rje.matchExp('core: (\S+)',line):
                    ventry['Score'] = int(rje.matchExp('core: (\S+)',line)[0])
                # Alignment
                if rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line):
                    adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line)
                    #self.bugPrint('= new aln: %s ->  %s' % (adata[0],adata[2]))
                    start = int(adata[0])
                    end = int(adata[2])
                    aln = adata[1]
                    x = line.find(aln)
                    if 'QryStart' not in ventry: ventry['QryStart'] = start
                    ventry['QryEnd'] = end
                    ventry['QrySeq'] += aln
                    #self.bugPrint('^%s$' % ventry['QrySeq'])

                    line = parsing.pop(0)
                    #self.bugPrint(line)
                    #self.bugPrint(']%s[' % aln)
                    #self.bugPrint(']%s[' % line[x:x+len(aln)])
                    ventry['AlnSeq'] += line[x:x+len(aln)]
                    #self.debug('^%s$' % ventry['AlnSeq'])

                    #self.bugPrint(parsing[0])
                    adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                    if not adata:
                        #self.deBug(parsing[0])
                        adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                    if not adata: raise ValueError('Partial alignment! Truncated output?')
                    #self.bugPrint('+ hit aln: %s ->  %s' % (adata[0],adata[2]))
                    start = int(adata[0])
                    end = int(adata[2])
                    aln = adata[1]
                    if 'HitStart' not in ventry: ventry['HitStart'] = start
                    ventry['HitEnd'] = end
                    ventry['HitSeq'] += aln
            if ventry: vdb.addEntry(ventry)
            ## Seq Check
            for ventry in vdb.entries():
                #self.bugPrint('^%s$' % ventry['QrySeq'])
                #self.bugPrint('^%s$' % ventry['AlnSeq'])
                #self.bugPrint('^%s$' % ventry['HitSeq'])
                if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                    self.debug(ventry)
                    raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))

            ### ~ [3] Split on introns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.obj['DNAHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F','dna=T'])
            self.obj['ProtHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F'])

            #i# Protein Position Conversion
            if protqry:
                for ventry in vdb.entries():
                    # 1->1, 2->4, 3->7 = 1+3*(n-1)
                    ventry['QryStart'] = 1+3*(ventry['QryStart']-1)
                    if ventry['QrySeq'].startswith('{'):
                        codend = ventry['QrySeq'].find('}')
                        # {X} = phase 2, find = 2
                        if codend == 2: ventry['QryStart'] += 2
                        # {XX} = phase 1, find = 3
                        elif codend == 3: ventry['QryStart'] += 1
                        else: raise ValueError('QrySeq {} bracket mismatch!: %s' % ventry)
                    ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1

            vdb.newKey(['Qry','Rank','Hit','AlnID'])
            for vkey in vdb.dataKeys():
                ventry = vdb.data(vkey)
                #i# Make a combined hitseq to output to fasta
                #># phap_PSETE__EBS10XV2PHAP187.FAXD1_NOTSC.XXX
                hitname = '%s.ex%s %s %s-%s' % (ventry['Qry'],ventry['Rank'],ventry['Hit'],rje.iStr(ventry['HitStart']),rje.iStr(ventry['HitEnd']))
                hitseq = ''
                phase = (ventry['QryStart'] + 2) % 3
                alnx = 1
                vkeyentries = [ventry]
                dirn = 1
                if ventry['HitEnd'] < ventry['HitStart']:
                    dirn = -1
                    ventry['HitStrand'] = '-'
                else: ventry['HitStrand'] = '+'
                for seq in ['HitSeq','QrySeq','AlnSeq']:
                    ventry[seq] = string.replace(ventry[seq],'}','')
                    ventry[seq] = string.replace(ventry[seq],'{','')
                while rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq']):
                    intron = rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq'])[0]
                    x = ventry['QrySeq'].find(intron)
                    y = x + len(intron)
                    intronlen = int(rje.matchExp('(\d+) bp',ventry['AlnSeq'][x:y])[0])
                    #i# Create a new entry of the first exon
                    newentry = rje.combineDict({},ventry)
                    for seq in ['HitSeq','QrySeq','AlnSeq']:
                        newentry[seq] = newentry[seq][:x]
                    newentry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx); alnx += 1
                    newentry['QryEnd'] = newentry['QryStart'] + len(newentry['QrySeq']) - string.count(newentry['QrySeq'],'-') - 1
                    newentry['HitEnd'] = newentry['HitStart'] + (len(newentry['HitSeq']) - string.count(newentry['HitSeq'],'-') - 1) * dirn
                    newentry['Length'] = x
                    newentry['Identity'] = string.count(newentry['AlnSeq'],'|')
                    vkeyentries.append(vdb.addEntry(newentry))
                    hitseq += newentry['HitSeq']
                    #i# Update ventry to be the rest of the hit
                    for seq in ['HitSeq','QrySeq','AlnSeq']:
                        ventry[seq] = ventry[seq][y:]
                    ventry['QryStart'] = newentry['QryEnd'] + 1
                    if protqry: ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1
                    ventry['HitStart'] = newentry['HitEnd'] + intronlen * dirn
                #i# Calculate length and identity of final exon
                ventry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx)
                ventry['Length'] = len(ventry['AlnSeq'])
                ventry['Identity'] = string.count(ventry['AlnSeq'],'|')
                #i# Add sequence hits
                hitname += ' (%d alignment blocks)' % alnx
                hitseq += ventry['HitSeq']
                hitseq = string.replace(hitseq,'-','')
                protseq = rje_sequence.dna2prot('%s%s' % ('N' * phase,hitseq))
                self.obj['ProtHits']._addSeq(hitname,protseq)
                if ventry['HitStart'] > ventry['HitEnd']: hitseq = rje_sequence.reverseComplement(hitseq)
                self.obj['DNAHits']._addSeq(hitname,hitseq)

                #i# Update AlnID for proper float sorting
                for ventry in vkeyentries:
                    (vcore,vx) = string.split(ventry['AlnID'],'.')
                    ventry['AlnID'] = '%s.%s' % (vcore,rje.preZero(int(vx),alnx))
                    #self.debug(ventry)
            vdb.dataFormat({'AlnID':'string'})
            vdb.remakeKeys()
            self.debug(vdb.dataKeys())

            ## Seq Check
            for ventry in vdb.entries():
                #self.bugPrint('^%s$' % ventry['QrySeq'])
                #self.bugPrint('^%s$' % ventry['AlnSeq'])
                #self.bugPrint('^%s$\n' % ventry['HitSeq'])
                if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                    self.debug(ventry)
                    raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))

            udb = self.reduceLocal(byqry=True)
            udb.rename('unique')
            udb.newKey(['Qry','Rank','Hit','AlnID'])
            self.debug(vdb.dataKeys())

            #i# Calculate exon phase
            for ventry in vdb.entries() + udb.entries(): ventry['Phase'] = (ventry['QryStart'] - 1) % 3

            #i# Protein Position Conversion
            if protqry:
                for ventry in vdb.entries():
                    ventry['QryStart'] = (ventry['QryStart']+2)/3
                    ventry['QryEnd'] = (ventry['QryEnd']+2)/3
                for ventry in udb.entries():
                    ventry['QryStart'] = (ventry['QryStart']+2)/3
                    ventry['QryEnd'] = (ventry['QryEnd']+2)/3

            #vdb.remakeKeys()
            return vdb

        except: self.errorLog('%s.alignmentToLocal error' % self.prog()); raise
Пример #24
0
 def taxDict(self,taxid,store=False,skipuni=False):    ### Extracts taxonomy details from SpecFile for taxid
     '''
     Extracts taxonomy details for taxid from SpecFile, falling back on NameMap for the scientific name.
     >> taxid = single taxon ID, or a list of taxon IDs. A list is sorted in place and each element processed in turn.
     >> store:bool [False] = passed on to the per-element calls for a list; not otherwise used within this method.
     >> skipuni:bool [False] = whether to skip the grep of SpecFile and only use the NameMap lookup.
     << for a single taxid: dictionary with whichever of 'spcode', 'name' and 'common' could be found (empty if none).
        A taxid already present in self.dict['TaxDict'] returns the stored value without any file access.
     << for a list: dictionary of {taxid:dictionary as above}.
     Any exception is logged with self.errorLog() and re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxdict = {}
         ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         #i# Only list-like input has a sort() method, so the sort doubles as the "is this a list?" test
         tlist = True
         try: taxid.sort()
         except Exception: tlist = False
         if tlist:
             tx = 0.0; ttot = len(taxid); mx = 0
             for t in taxid:
                 self.progLog('\r#SPEC','Extracting Uniprot species details: %.1f%%' % (tx/ttot)); tx += 100.0
                 #i# skipuni is passed on so that list input honours it in the same way as a single taxid
                 taxdict[t] = self.taxDict(t,store,skipuni)
                 if not taxdict[t]: mx += 1
             self.printLog('\r#SPEC','Extracted Uniprot/NCBI species details for %s TaxID: %s missing' % (rje.iStr(ttot),rje.iStr(mx)))
             return taxdict
         ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxid = '%s' % taxid    # TaxID is handled as a string from here on
         if taxid in self.dict['TaxDict']: return self.dict['TaxDict'][taxid]
         if not skipuni:
             greplines = os.popen('grep -A 1 " %s:" %s' % (taxid, self.getStr('SpecFile'))).readlines()
             for entry in greplines:
                 nmatch = rje.matchExp(r'^(\S+)\s+\S+\s+(\d+):\s+N=(\S.+)\s*$',entry)
                 if nmatch and nmatch[1] != taxid: break # Next taxon
                 if nmatch: taxdict['spcode'] = nmatch[0]; taxdict['name'] = nmatch[2]
                 else:
                     cmatch = rje.matchExp(r'C=(\S.+)\s*$',entry)
                     if cmatch: taxdict['common'] = cmatch[0]
         #if not taxdict and taxid in self.list['RankID']: self.warnLog('Cannot find TaxID "%s" in %s!' % (taxid,self.getStr('SpecFile')),'Missing_TaxID',suppress=True)
         ## ~ [2b] ~ Adding missing scientific names from NameMap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not taxdict:
             for entry in os.popen('grep -i -e "^%s\t" %s' % (taxid, self.getStr('NameMap'))).readlines():
                 tdata = entry.split('\t|\t')
                 if not tdata[3].startswith('scientific name'): continue
                 tname = tdata[1]
                 #i# First scientific name wins: later hits are only reported. (taxid is a string here, hence %s.)
                 if 'name' in taxdict: self.warnLog('TaxID %s hits "%s" and "%s"!' % (taxid, taxdict['name'],tname))
                 else: taxdict['name'] = tname
         return taxdict
     except: self.errorLog('%s.taxDict() error' % (self)); raise
Пример #25
0
 def gopher(self):  ### Sets up data for GOPHER run
     '''
     Sets up data for GOPHER run.
     - Makes the BLAST/ directory and formats *.ygob.fas and *.yeast.fas (Basefile prefix) as protein BLAST databases.
     - Maps each ID of each pillar in self.list['Pillars'] onto an accession number (via self.dict['Rename'] where
       the ID has been renamed) and stores the mapping in self.dict['PillarMap'].
     - For each pillar member that is in self.list['YeastSeq'] (or has SpecCode YEAST if that list is empty), writes
       the short names of all pillar members found in the sequence list to BLAST/ACC.blast.id.
     Any exception is logged with self.errorLog() and re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.mkDir(self,'BLAST/')
         for fastype in ['ygob','yeast']:
             rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile='%s.%s.fas' % (self.info['Basefile'],fastype),protein=True,force=False)
         seqdict = self.obj['SeqList'].seqNameDic('AccNum')
         ymap = self.dict['PillarMap'] = {}
         ### ~ [2] Convert Pillars to BLAST IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (px,ptot) = (0.0,len(self.list['Pillars'])); ox = 0
         for pillar in self.list['Pillars']:
             self.progLog('\r#YGOB','Converting YGOB Pillars for GOPHER: %.2f%%' % (px/ptot)); px += 100
             newpillar = []      # Short names of the pillar members that are present in the sequence list
             for yid in pillar:
                 #i# A temporary Sequence object is used purely to parse the accession number out of the pillar ID
                 seq = rje_sequence.Sequence(self.log,self.cmd_list)
                 seq.opt['Yeast'] = True
                 seq.info['Name'] = yid
                 seq.extractDetails(gnspacc=True)
                 ygob = seq.info['AccNum']
                 acc = self.dict['Rename'].get(ygob,ygob)
                 ymap[yid] = acc
                 if acc not in seqdict: self.printLog('\r#GENE','Non-coding gene %s (%s)? Cannot find in fasta file' % (acc,yid)); continue
                 try: newpillar.append(seqdict[acc].shortName())
                 except Exception:
                     #i# Best-effort: report the offending IDs and carry on with the rest of the pillar
                     print('%s %s %s' % (yid,ygob,acc))
                     self.errorLog(rje_zen.Zen().wisdom())
             if not newpillar: continue
             for ygob in pillar:
                 acc = ymap[ygob]
                 if acc not in seqdict: continue
                 if acc in self.list['YeastSeq'] or (not self.list['YeastSeq'] and seqdict[acc].info['SpecCode'] == 'YEAST'):
                     idfile = rje.makePath('BLAST/%s.blast.id' % acc,wholepath=True)
                     #i# Context manager guarantees the ID file is flushed and closed before moving on
                     with open(idfile,'w') as IDFILE: IDFILE.write('\n'.join(newpillar))
                     ox += 1
         #i# Completion message uses printLog, matching the other methods in this file (progLog is used for progress)
         self.printLog('\r#YGOB','Converted YGOB Pillars for GOPHER: %s BLAST ID files.' % rje.iStr(ox))
     except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
Пример #26
0
 def sgd2sp(self):   ### Reformats yeast sequence names and outputs new data for GOPHER
     '''
     Reformats yeast sequence names and outputs new data for GOPHER.
     - Reads UniProt entries from Basefile.dat, or extracts (and saves) them using the XRef table's UniProt index.
     - Renames each YEAST sequence in self.obj['SeqList'] with its XRef details. Where a matching UniProt sequence
       is found, the name is switched to UniprotID__AccNum and self.dict['Rename'][SGD] is set to the accession.
     - Saves *.ygob.fas and *.yeast.fas if they do not already exist, and sets self.list['YeastSeq'].
     Any exception is logged with self.errorLog() and re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = self.obj['SeqList']
         uniprot = rje_uniprot.UniProt(self.log,self.cmd_list+['datout=None'])
         xdb = self.db('XRef')
         self.dict['Rename'] = {}
         ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         datfile = '%s.dat' % self.info['Basefile']
         if not os.path.exists(datfile) or self.opt['Force']:
             uniprot.readUniProt(clear=True,acclist=rje.sortKeys(xdb.index('UniProt')),cleardata=False)
             uniprot.saveUniProt(datfile)
         else:
             uniprot.readUniProt(datfile,clear=True,cleardata=False)
         ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniseq = {}     # {UniProt accession number:sequence object}
         for uentry in uniprot.entries():
             unisequence = uentry.obj['Sequence']
             uniseq[unisequence.info['AccNum']] = unisequence
         self.printLog('\r#USEQ','%s UniProt Sequences extracted (%s Ensembl AccNum)' % (rje.iStr(len(uniseq)), rje.iStr(len(xdb.index('UniProt')))))
         ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         yeastseqs = []      # YEAST sequence objects, in input order
         donex = 0.0
         seqtot = seqlist.seqNum()
         for yseq in seqlist.seqs():
             self.progLog('\r#SEQ','Reformatting sequence names: %.2f%%' % (donex/seqtot))
             donex += 100.0
             if yseq.info['SpecCode'] != 'YEAST': continue
             yeastseqs.append(yseq)
             sgd = yseq.info['AccNum']
             newname = yseq.info['Name']
             try:
                 for xentry in xdb.indexEntries('EnsG',sgd):
                     acc = xentry['UniProt']
                     #i# Cross-reference details are added to the name even if the UniProt match then fails
                     newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (yseq.info['Name'],xentry['Gene'],xentry['EnsG'],xentry['SGD'],acc or '-')
                     if not acc: continue
                     if acc not in uniseq:
                         self.printLog('\r#UNIERR','Unable to find UniProt sequence %s (%s)' % (acc,sgd))
                         continue
                     if uniseq[acc].info['Sequence'] != yseq.info['Sequence']:
                         self.printLog('\r#SEQERR','%s sequence <> %s sequence' % (sgd,acc))
                         continue
                     #i# Sequences agree: swap the first word of the name for UniprotID__AccNum and stop looking
                     words = newname.split()
                     words[0] = '%s__%s' % (xentry['UniprotID'],acc)
                     newname = ' '.join(words)
                     self.dict['Rename'][sgd] = acc
                     break
             except: self.errorLog('%s problem' % sgd)
             yseq.info['Name'] = newname
             yseq.extractDetails(gnspacc=True)
         self.printLog('\r#SEQ','Reformatting sequence names complete.')
         ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #i# Existing fasta files are never overwritten
         if not rje.exists('%s.ygob.fas' % self.info['Basefile']):
             ygobfas = '%s.ygob.fas' % self.info['Basefile']
             seqlist.saveFasta(seqfile=ygobfas)
         if not rje.exists('%s.yeast.fas' % self.info['Basefile']):
             yeastfas = '%s.yeast.fas' % self.info['Basefile']
             seqlist.saveFasta(seqs=yeastseqs,seqfile=yeastfas)
         self.list['YeastSeq'] = seqlist.accList(yeastseqs)
     except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
Пример #27
0
 def makePPI(self):  ### Generates files for Human-HIV PPI analysis
     '''
     Generates files for Human-HIV PPI analysis.
     For each HIV accession in the HHPIDMap table's AccHIV index, makes a directory named after the HIV gene and,
     for each interaction entry, writes GENE/gene.SYMBOL.ppi.fas containing the HIV sequence followed by the
     contents of the human PPI fasta file PPIDir + SYMBOL + '.ppi.fas'. Missing PPI files are logged and skipped.
     << False if no HIV sequences could be loaded or if an error is caught (errors are logged, not raised).
        NOTE: a successful run falls off the end of the method and so returns None.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % self.getStr('HIVSeq'),'autoload=T'])
         if not seqlist.seqs(): return False
         seqmap = seqlist.seqNameDic('Max')
         mdb = self.db('HHPIDMap')
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for hivacc in mdb.index('AccHIV'):
             # map HIV accession numbers on to sequences seqNameDic (accession version suffix is dropped)
             accnum = hivacc.split('.')[0]
             hivseq = seqmap[accnum]
             # extract short HIV name from sequence ID
             hivgene = hivseq.shortName().split('_')[0].upper()
             # create directory named after HIV gene
             rje.mkDir(self,'%s/' % hivgene,log=True)
             # copy human PPI files into directories, adding HIV gene
             etot = len(mdb.index('AccHIV')[hivacc])
             for entry in mdb.indexEntries('AccHIV',hivacc):
                 self.progLog('\r#PPI','Generating human-HIV PPI fasta files for %s %s PPI' % (rje.iStr(etot),hivgene))
                 pfile = self.getStr('PPIDir') + entry['Symbol'] + '.ppi.fas'
                 if not rje.exists(pfile):
                     self.errorLog('Cannot find human PPI file for %s interactor "%s"' % (entry['HIV'],entry['Symbol']),printerror=False)
                     continue
                 #i# Read the human PPI file first so that a failed read cannot leave a truncated output file behind
                 with open(pfile,'r') as PPI: ppifas = PPI.read()
                 with open('%s/%s.%s.ppi.fas' % (hivgene,hivgene.lower(),entry['Symbol']),'w') as FAS:
                     FAS.write('>%s\n%s\n' % (hivseq.info['Name'],hivseq.getSequence()))
                     FAS.write(ppifas)
             #i# entry is the last interaction entry processed for this HIV accession
             self.printLog('\r#PPI','Generated human-HIV PPI fasta files for %s %s (%s) PPI.' % (rje.iStr(etot),entry['HIV'],hivgene))
     except: self.errorLog('%s.makePPI error' % self); return False
Пример #28
0
    def filterSPCode(self):  ### Filters species codes according to mincount and shared taxa at different levels.
        '''
        Filters species codes according to mincount and shared taxa at different levels.
        Works through the entries of the 'spcode' table, whose 'spcode' field is a '|'-joined list of species codes:
        - Entries with spcode 'None' have 'boot' set to the NoneBoot setting and are otherwise left alone.
        - Entries with 'boot' below the BootFilter setting have spcode replaced by 'Uncertain'.
        - Codes with no ancestor species codes ("unmapped") are dropped if the entry has other codes.
        - Codes that are ancestors of another code of the same entry are dropped.
        - Each surviving code scores 1/N per entry (N = codes left in that entry), multiplied by 'boot' if the
          BootWeight setting is on. Codes with a summed score below MinScore (when MinScore > 0) are then dropped,
          unless this would leave the entry with no codes.
        Entries are modified in place. Nothing is returned: errors are logged with self.errorLog() and not raised.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            tax = self.obj['Taxonomy']
            specdb = self.db('spcode')
            bootfilter = self.getNum('BootFilter')
            minscore = self.getNum('MinScore')
            bootweight = self.getBool('BootWeight')
            ### ~ [2] ~ Filter unmapped and parent codes, and sum scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            parents = {}  # Dictionary of {spcode:[species codes of ancestor taxa]}
            taxsum = {}  # Dictionary of {spcode:summed score}
            # Reduced according to low abundance and/or higher level taxa of species in clade
            fx = 0
            bfx = 0
            ufx = 0
            for ekey in specdb.dataKeys():
                entry = specdb.data(ekey)
                if entry['spcode'] == 'None':
                    entry['boot'] = self.getNum('NoneBoot')
                    continue
                if entry['boot'] < bootfilter:
                    self.printLog('#FILT', '%s: filtered -> "Uncertain" (bootstrap %s < bootfilter=%s).' % (entry['protein'], entry['boot'], bootfilter))
                    entry['spcode'] = 'Uncertain'
                    bfx += 1
                    continue
                spcodes = entry['spcode'].split('|')
                for spcode in spcodes[0:]:  # Iterate over a copy: spcodes is edited inside the loop
                    #i# A code already removed as the parent of an earlier code must not be processed again:
                    #i# trying to remove it a second time would raise ValueError and abort the whole filter.
                    if spcode not in spcodes: continue
                    if spcode not in parents:
                        parents[spcode] = []
                        #i# A code that cannot be mapped keeps an empty parent list and drops through to the
                        #i# unmapped filter below, so it is treated the same on first and later encounters.
                        try:
                            taxid = tax.mapToTaxID(spcode, nodeonly=True, warn=False)[0]
                        except Exception:
                            taxid = None
                        while taxid is not None and taxid in tax.dict['Parent']:
                            taxid = tax.dict['Parent'][taxid]
                            parsp = tax.getSpCode(taxid, invent=False, warn=False)
                            if parsp: parents[spcode].append(parsp)
                    if not parents[spcode] and len(spcodes) > 1:
                        self.printLog('#FILT', '%s: filtered unmapped spcode %s.' % (entry['protein'], spcode))
                        spcodes.remove(spcode)
                        ufx += 1
                    for parsp in parents[spcode]:
                        if parsp in spcodes:
                            self.printLog('#FILT', '%s: filtered %s as parent of %s.' % (entry['protein'], parsp, spcode))
                            spcodes.remove(parsp)
                            fx += 1
                for taxon in spcodes:
                    if taxon not in taxsum: taxsum[taxon] = 0.0
                    taxweight = entry['boot'] if bootweight else 1.0
                    #i# float() guards against integer division under Python 2 should boot be stored as an integer
                    taxsum[taxon] += taxweight / float(len(spcodes))
                entry['spcode'] = '|'.join(spcodes)
            self.printLog('#FILT', 'Filtered %s species codes with co-occurring child taxa' % rje.iStr(fx))
            self.printLog('#FILT', 'Filtered %s unmapped species codes with co-occurring mapped taxa' % rje.iStr(ufx))
            if bootfilter > 0.0:
                self.printLog('#FILT', 'Filtered %s proteins with bootstrap < bootfilter=%s' % (rje.iStr(bfx), bootfilter))

            ### ~ [3] ~ Filter codes failing to meet MinScore ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            fx = 0
            for ekey in specdb.dataKeys():
                entry = specdb.data(ekey)
                if entry['spcode'] in ['None', 'Uncertain']: continue
                spcodes = entry['spcode'].split('|')
                for spcode in spcodes[0:]:
                    if minscore > 0 and minscore > taxsum[spcode]:
                        self.printLog('#FILT', '%s: filtered %s < minscore=%s.' % (entry['protein'], spcode, minscore))
                        spcodes.remove(spcode)
                        fx += 1
                #i# An entry is never left without any species code: its spcode field is then kept unchanged
                if spcodes: entry['spcode'] = '|'.join(spcodes)
                else:
                    self.printLog('#FILT', '%s filter aborted: no spcode left!' % (entry['protein']))
            self.printLog('#FILT', 'Filtered %s species codes failing to meet minscore=%s.' % (rje.iStr(fx), minscore))

        except:
            self.errorLog('%s.filterSPCode error' % self.prog())
Пример #29
0
 def pileUpStats(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
     '''
     Calculates statistics of genetic differences from parsed PileUp Tables.
     - Returns self.pileUpFDR() straight away if BASEFILE.pdiff.tdt exists and self.force() is not set.
     - Loads per-position N, QN and MajFreq from BASEFILE.WT.tdt, and N, QN, Major, MajFreq and WTFreq from
       BASEFILE.Mut.tdt, into per-locus lists (index = position-1; positions missing from a file are zero-filled).
       Reference sequences in self.dict['RefSeq'] are extended/filled in from the third column of the Mut table.
     - Writes one line per reference position to BASEFILE.pdiff.tdt, with p.Over and p.Under from rje.binomial()
       and p.Diff = min(1.0, 2 * min(p.Over, p.Under)). Positions are skipped according to the MajDif and MajMut
       settings, or (with a warning) where the loaded data does not reach that position.
     - Calls self.pileUpFDR() on completion.
     << None unless the early return is taken. Errors are logged with self.errorLog() and not raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         statfile = '%s.pdiff.tdt' % self.baseFile()
         if not self.force() and os.path.exists(statfile): return self.pileUpFDR()
         ## ~ [0a] Load WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         wtdata = {}     # Load lists of data for compiling
         for locus in self.dict['RefSeq']:
             wtdata[locus] = {}
             for field in ['N','QN','MajFreq']: wtdata[locus][field] = []
         fields = []
         with open('%s.WT.tdt' % self.baseFile(),'r') as WTDATA:
             for line in WTDATA:
                 data = rje.readDelimit(line)
                 if not fields: fields = data[0:]; continue      # First line with content gives the field names
                 locus = data[0]
                 pos = int(data[1])
                 ldata = wtdata[locus]
                 #i# Zero-fill positions missing from the file. The fill is judged from this locus' own list
                 #i# length, so each locus is aligned independently (a single shared counter misaligns all
                 #i# loci after the first).
                 while len(ldata['N']) < pos - 1:
                     ldata['N'].append(0); ldata['QN'].append(0); ldata['MajFreq'].append(0.0)
                 for field in ['N','QN']: ldata[field].append(int(data[fields.index(field)]))
                 ldata['MajFreq'].append(float(data[fields.index('MajFreq')]))
         ## ~ [0b] Load Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         mutdata = {}     # Load lists of data for compiling
         for locus in self.dict['RefSeq']:
             mutdata[locus] = {}
             for field in ['N','QN','Major','MajFreq','WTFreq']: mutdata[locus][field] = []
         fields = []
         with open('%s.Mut.tdt' % self.baseFile(),'r') as MUTDATA:
             for line in MUTDATA:
                 data = rje.readDelimit(line)
                 if not fields: fields = data[0:]; continue
                 locus = data[0]
                 refseq = self.str['RefSeq'] = self.dict['RefSeq'][locus]
                 pos = int(data[1])
                 #i# Extend the reference up to pos (unknown positions = '?'), or fill in a '?' placeholder
                 try:
                     if pos > len(refseq):
                         refseq += '?' * (pos - 1 - len(refseq)) + data[2]
                         self.str['RefSeq'] = self.dict['RefSeq'][locus] = refseq
                     elif refseq[pos-1] == '?':
                         refseq = refseq[:pos-1] + data[2] + refseq[pos:]
                         self.str['RefSeq'] = self.dict['RefSeq'][locus] = refseq
                 except Exception: self.warnLog('Problem mapping Pos %s onto %snt %s RefSeq' % (rje.iStr(pos),rje.iLen(self.str['RefSeq']),locus))
                 ldata = mutdata[locus]
                 while len(ldata['N']) < pos - 1:    # Per-locus zero-fill, as for the WT data
                     ldata['N'].append(0); ldata['QN'].append(0); ldata['Major'].append('-'); ldata['MajFreq'].append(0.0); ldata['WTFreq'].append(0.0)
                 for field in ['N','QN']: ldata[field].append(int(data[fields.index(field)]))
                 for field in ['MajFreq','WTFreq']: ldata[field].append(float(data[fields.index(field)]))
                 ldata['Major'].append(data[fields.index('Major')])
         ## ~ [0c] Integrity check ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #!# Need a new check with locus info #!#
         #i# NB. List lengths need not match the reference length: positions with 0 mapped reads are not output.
         ### ~ [1] Assess and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         headers = ['Locus','Pos','Ref','WT.N','WT.QN','WT.Major','WT.MajFreq','Mut.N','Mut.QN','Mut.Major','Mut.MajFreq','Mut.WTFreq','p.Over','p.Under','p.Diff']
         majdif = self.getBool('MajDif'); majmut = self.getBool('MajMut')
         nodifx = 0; nomutx = 0; sx = 0
         with open(statfile,'w') as SAMSIG:
             SAMSIG.write('%s\n' % '\t'.join(headers))
             for locus in rje.sortKeys(self.dict['RefSeq']):
                 refseq = self.str['RefSeq'] = self.dict['RefSeq'][locus]
                 wtmajor = self.list['WTMajor'] = self.dict['WTMajor'][locus]
                 wt = wtdata[locus]; mut = mutdata[locus]
                 for i in range(len(refseq)):
                     try:
                         sigdata = [locus,i+1,refseq[i],wt['N'][i],wt['QN'][i],wtmajor[i],wt['MajFreq'][i],
                                    mut['N'][i],mut['QN'][i],mut['Major'][i],mut['MajFreq'][i],mut['WTFreq'][i]]
                     except Exception: self.warnLog('Incomplete data for %s:%s (no pdiff output)' % (locus,rje.iStr(i+1))); continue
                     if majdif and wtmajor[i] == mut['Major'][i]: nodifx += 1; continue   # Was: sigdata += [1.0,1.0]
                     if majmut and refseq[i] == mut['Major'][i]: nomutx += 1; continue
                     wtfreq = wt['MajFreq'][i]; mutfreq = mut['WTFreq'][i]; mutqn = mut['QN'][i]
                     if not wtfreq:    # No Data for WT
                         if mutfreq: sigdata += [0.0,1.0]
                         else: sigdata += [1.0,1.0]
                     elif mutfreq > wtfreq:
                         obs = int((mutqn * mutfreq) + 0.5)     # Round to the nearest whole number of reads
                         sigdata.append(rje.binomial(obs,mutqn,wtfreq,usepoisson=False,callobj=self))
                         sigdata.append(1.0)
                     elif mutfreq < wtfreq:
                         obs = int((mutqn * mutfreq) + 0.5)
                         sigdata.append(1.0)
                         sigdata.append(1.0 - rje.binomial(obs+1,mutqn,wtfreq,usepoisson=False,callobj=self))
                     else: sigdata += [1.0,1.0]
                     sigdata.append(min(1.0,2*min(sigdata[-2:])))   # p.Diff from the smaller of p.Over and p.Under
                     rje.writeDelimit(SAMSIG,sigdata); sx += 1
         ptxt = '%s lines output to *.pdiff.tdt' % rje.iStr(sx)
         if majdif: ptxt += '; %s positions skipped where WTMajor==MutMajor (majdif=T)' % rje.iStr(nodifx)
         if majmut: ptxt += '; %s positions skipped where Ref==MutMajor (majmut=T)' % rje.iStr(nomutx)
         self.printLog('#PDIFF','%s.' % ptxt)
         ### ~ [2] FDR Correction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.pileUpFDR()
     except: self.errorLog('%s.pileUpStats() error' % (self)); return None
Пример #30
0
 def summaryScores(self, rankdb=None, sumstr='taxasum', minsum='MinSum'):  ### Generates summary scores from rank table.
     '''
     Generates summary scores from rank table.
     >> rankdb = table of entries with 'boot' and one '|'-joined taxa field per rank ['taxamap' table if not given]
     >> sumstr:str ['taxasum'] = name of the summary table to add to the database
     >> minsum:str ['MinSum'] = name of the numeric setting giving the minimum count for a taxon to be kept.
        Taxa below it are pooled as "Other".
     For each of genus, family, order, class and phylum, each entry contributes 1/N to the count and boot/N to the
     bootstrap-weighted count of each of its N taxa (taxa in self.list['TaxFilter'] get nothing). The summary table
     is then ranked within each rank by count and by bootwt, and saved to file.
     Nothing is returned: errors are logged with self.errorLog() and not raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         if not rankdb: rankdb = self.db('taxamap')
         sumfields = ['rank', 'taxon', 'count', 'bootwt', 'meanboot', 'perc', 'wtperc']
         sumdb = db.addEmptyTable(sumstr, sumfields, ['rank', 'taxon'])
         ### ~ [2] Normalise to reduced levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for rank in ['genus', 'family', 'order', 'class', 'phylum']:
             self.printLog('\r#RANK', 'Normalising %s data.' % rank)
             taxsum = {}     # Summed counts per taxon
             taxwt = {}      # Bootstrap-weighted summed counts per taxon
             bootsum = {}    # Sum of bootstrap values per taxon (for mean boot)
             bootx = {}      # Number of bootstrap values per taxon (for mean boot)
             ranksum = 0.0   # Rank total of the counts
             wtsum = 0.0     # Rank total of the bootstrap-weighted counts
             for entry in rankdb.entries():
                 taxa = entry[rank].split('|')
                 ntaxa = len(taxa)
                 for taxon in taxa:
                     if taxon in self.list['TaxFilter']: continue
                     for tdict in (taxsum, taxwt, bootsum):
                         tdict.setdefault(taxon, 0.0)
                     bootx.setdefault(taxon, 0)
                     #i# Filtered taxa still count towards N, so the shares of an entry need not sum to 1
                     share = 1.0 / ntaxa
                     wtshare = entry['boot'] / ntaxa
                     taxsum[taxon] += share
                     ranksum += share
                     bootsum[taxon] += entry['boot']
                     bootx[taxon] += 1
                     taxwt[taxon] += wtshare
                     wtsum += wtshare
             #i# Pool low-count taxa as "Other", working through the taxa in sorted order
             otherx = 0
             for taxon in rje.sortKeys(taxsum):
                 if taxon == 'Other' or not taxsum[taxon] < self.getNum(minsum): continue
                 for tdict in (taxsum, taxwt, bootsum, bootx):
                     tdict.setdefault('Other', 0.0)
                     tdict['Other'] += tdict.pop(taxon)
                 otherx += 1
             self.printLog('#MINSUM', '%s %s taxa converted to "Other" (count < minsum=%s)' % (rje.iStr(otherx), rank, self.getNum(minsum)))
             for taxon in taxsum:
                 sumentry = {'rank': rank, 'taxon': taxon}
                 sumentry['count'] = rje.dp(taxsum[taxon], 1)
                 sumentry['perc'] = rje.sf(100.0 * taxsum[taxon] / ranksum)
                 sumentry['bootwt'] = rje.dp(taxwt[taxon], 1)
                 sumentry['meanboot'] = rje.dp(bootsum[taxon] / bootx[taxon], 3)
                 sumentry['wtperc'] = rje.sf(100.0 * taxwt[taxon] / wtsum)
                 sumdb.addEntry(sumentry)
         ## ~ [2a] Rank taxa by counts such that highest is Rank 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         for rankfield in ['count', 'bootwt']:
             sumdb.rankFieldByIndex('rank', rankfield, rev=True, absolute=True, lowest=True)
         ## ~ [2b] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         sumdb.saveToFile()
     except:
         self.errorLog('%s.summaryScores error' % self.prog())
Пример #31
0
 def batchRun(self, returnobj=False):  ### Execute batch mode runs
     '''
     Execute batch mode runs: self.run() is called once for each file in self.list['BatchRun'].

     For every batch file, "<BatchArg>=<file>" is appended to a copy of the original command list, together with
     a basefile=, log= or newlog=F command chosen from the BatchBase and BatchLog settings. A new log object is
     set up for the duration of each run and the original log is put back afterwards.

     returnobj: if True, the value returned by each successful self.run() is kept in the returned list; if False
         (default) each successful result is replaced by True.
     Returns the list of per-file results (failed runs keep whatever falsy value self.run() returned), or False
     if an exception was raised and logged.
     '''
     try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         barg = self.getStrLC('BatchArg')
         if not barg:
             raise ValueError(
                 'Cannot use batchrun=FILELIST if batcharg=None.')
         batchfiles = self.list['BatchRun'][0:]
         self.list['BatchRun'] = []  # Avoid recursive running!
         # Normalise the batch log suffix to the form ".xxx.log" (or just ".log")
         blog = self.getStr('BatchLog')
         if not blog.startswith('.'): blog = '.%s' % blog
         if not blog.endswith('.log'): blog = '%s.log' % blog
         rawcmd = self.cmd_list[0:]
         rawlog = self.log
         batchobj = []
         ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         bx = 0
         for bfile in batchfiles:
             bx += 1
             self.printLog(
                 '#BATCH', 'Batch running %s of %s: %s=%s' %
                 (rje.iStr(bx), rje.iLen(batchfiles), barg, bfile))
             ## Setup parameters
             bbase = rje.baseFile(bfile, strip_path=True)
             bcmd = ['%s=%s' % (barg, bfile)]
             if self.getBool('BatchBase'):
                 if blog == '.log': bcmd += ['basefile=%s' % bbase]
                 else:
                     bcmd += ['basefile=%s%s' % (bbase, rje.baseFile(blog))]
             elif self.getStrLC('BatchLog'):
                 bcmd += ['log=%s%s' % (bbase, blog)]
             else:
                 bcmd += ['newlog=F']
             ## Setup Seqsuite object
             # NOTE(review): self.cmd_list is not reset to rawcmd after the loop, so it keeps the last batch
             # file's commands - confirm whether callers rely on that.
             self.cmd_list = rawcmd + bcmd
             self.log = rje.setLog(
                 self.log.obj['Info'], self, self.cmd_list
             )  # Sets up Log object for controlling log file output
             ## Run
             try:
                 batchobj.append(self.run())
             finally:
                 # Always restore the original log, even if run() raises, so that the error handler below
                 # (and any later output) reports to the original log rather than the per-batch log.
                 self.log = rawlog
             ## Finish and Tidy
             runobj = batchobj[-1]
             if runobj:
                 if not returnobj: batchobj[-1] = True
                 info = runobj.log.obj['Info']
                 self.printLog(
                     '#RUN',
                     '%s V%s run finished.' % (info.program, info.version))
             else:
                 self.warnLog('Batch run failed (%s=%s).' % (barg, bfile))
         ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Count every falsy result (None as well as False) so the total matches the "Batch run failed" warnings.
         failx = len([result for result in batchobj if not result])
         self.printLog(
             '#BATCH', '%s batch runs complete: %s failed.' %
             (rje.iLen(batchfiles), rje.iStr(failx)))
         self.list['BatchRun'] = batchfiles
         return batchobj
     except:
         self.errorLog('%s.batchRun error' % self)
         return False
Пример #32
0
    def parseMITAB(self):   ### Parse MITAB file into pairwise PPI table.
        '''
        Parse MITAB file into pairwise PPI table.

        Reads the open 'MITAB' file line by line, using the column names in self.list['Headers'] and the field lists
        in self.list['TaxaField'], ['IDField'], ['MethodField'] and ['TypeField'] to extract taxa, protein IDs,
        evidence and interaction types. Each parsed interaction is added to the 'pairwise' table (plus the reversed
        entry if Symmetry is set and no complex is involved). Complex identifiers are collected during parsing and
        afterwards expanded into all-by-all pairwise interactions between their members, replacing the
        complex-containing entries.

        Returns the 'pairwise' database table. If an exception is raised it is logged and None is returned.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xref = self.obj['XRef']
            pdb = self.db('pairwise')
            pfields = ['Hub','Spoke','HubUni','SpokeUni','HubTaxID','SpokeTaxID','Evidence','IType']
            headers = {}    # Maps column header -> position in each delimited MITAB line
            for h in range(len(self.list['Headers'])): headers[self.list['Headers'][h]] = h
            dbsource = self.getStr('DBSource')
            ### ~ [2] Read through MITAB ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Counters: lines read; ppi added; ambiguous; taxa-filtered; failed; UniProt used for Hub; for Spoke
            mx = 0; ex = 0; fax = 0; ftx = 0; fx = 0; uhx = 0; usx = 0
            epos = self.endPos('MITAB')
            complexidlist = []
            badtaxa = ['-']
            baduni = []
            while 1:
                self.progLog('\r#MITAB','Parsing %s MITAB %s: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,self.fileProg('MITAB',epos),rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
                mline = self.readDelimit('MITAB'); mx += 1
                if not mline: break
                entry = {'#':pdb.entryNum()}
                for field in pfields: entry[field] = ''
                ## ~ [2a] Add iRefIndex complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                complexid = {}     # This will take the first complex ID
                if 'irigid' in self.list['Headers'] and 'numParticipants' in self.list['Headers']:
                    if int(mline[headers['numParticipants']]) > 2:
                        complexid['A'] = complexid['B'] = 'rigid:%s' % mline[headers['irigid']]
                        #self.bugPrint(mline)
                        #self.debug(complexid)
                ## ~ [2b] Parse and check taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                taxa = {'A':'','B':''}
                for tfield in self.list['TaxaField']:
                    # Interactor (A/B) is the last character of the field name, or precedes a trailing "(...)"
                    ab = tfield[-1:].upper()
                    if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',tfield.upper())[0]
                    try:
                        taxon = rje.matchExp('^taxid:(\d+)',mline[headers[tfield]].lower())[0]
                        if self.list['TaxID'] and taxon not in self.list['TaxID']: continue
                        taxa[ab] = taxon
                    except:
                        taxon = mline[headers[tfield]]
                        if taxon not in badtaxa:
                            badtaxa.append(taxon)
                            self.warnLog('No TaxID read from %s: "%s"' % (tfield,taxon),'no_tax',suppress=True)
                        if not self.list['TaxID']: taxa[ab] = '-'
                if not taxa['A'] and complexid: taxa['A'] = taxa['B']
                if not taxa['B'] and complexid: taxa['B'] = taxa['A']
                if not (taxa['A'] and taxa['B']): ftx += 1; continue
                ## ~ [2c] Parse protein IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ids = {'A':[],'B':[]}
                uni = {'A':'','B':''}
                for ifield in self.list['IDField']:
                    ab = ifield[-1:].upper()
                    if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',ifield.upper())[0]
                    # Split IDs on | then db:id vs self.list['MapDB']
                    for pid in string.split(mline[headers[ifield]],'|'):
                        try: (db,dbid) = string.split(pid,':',1)
                        except: continue
                        if db.lower() in ['uniprotkb'] and '(' in dbid: continue    # Only map uniprotkb accnum
                        dbid = string.split(dbid,'(')[0]
                        dbid = string.split(dbid,';')[0]
                        if db.lower() in ['uniprotkb']:
                            svid = dbid     # Keep full ID including any "-N" suffix (used if SpliceVar is set)
                            dbid = string.split(svid,'-')[0]
                        if ab not in complexid:     # First identifier for A/B
                            if db.lower() in self.list['Complex']: complexid[ab] = pid; ids[ab].append(pid)
                            else: complexid[ab] = ''
                        if not self.list['MapDB'] or db.lower() in self.list['MapDB']: ids[ab].append(dbid)
                        # Parse uniprot directly if possible
                        if db.lower() in ['uniprotkb'] and not uni[ab]:
                            if self.getBool('SpliceVar'): uni[ab] = svid
                            else: uni[ab] = dbid
                #self.bugPrint(ids)
                ## ~ [2d] Map parsed IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                amb = {'A':False,'B':False}
                if not ids['A'] or not ids['B']:
                    #self.bugPrint('%s\n=> ID Failure' % mline)
                    #self.bugPrint(ids['A']); self.bugPrint(ids['B'])
                    #self.bugPrint(entry)
                    fx += 1; continue
                for ida in ids['A']:
                    #self.debug('%s => %s (or %s)' % (ida,xref.xref(ida,unique=True),xref.xref(ida,unique=False)))
                    if not entry['Hub']: entry['Hub'] = xref.xref(ida,unique=True,usedict=True)
                    if entry['Hub'] == False: amb['A'] = True
                    #if not entry['HubUni']: entry['HubUni'] = xref.xref(ida,self.getStr('UniField'),unique=True,usedict=True)
                    if not entry['HubUni']: entry['HubUni'] = self.getUniXRef(ida)
                if self.getBool('AddUni') and not entry['HubUni']:
                    entry['HubUni'] = uni['A']
                    if uni['A'] and uni['A'] not in baduni: baduni.append(uni['A'])
                if not entry['Hub'] and entry['HubUni']:
                    entry['Hub'] = entry['HubUni']
                    #self.warnLog('UniprotKB "%s" used for Hub' % entry['HubUni'],'unihub',suppress=True)
                    uhx += 1
                if not entry['Hub'] and complexid['A']:
                    entry['Hub'] = complexid['A']
                else: complexid['A'] = ''
                if self.getBool('UniOnly') and not complexid['A'] and not entry['HubUni']: entry['Hub'] = ''
                for idb in ids['B']:
                    if not entry['Spoke']: entry['Spoke'] = xref.xref(idb,unique=True,usedict=True)
                    if entry['Spoke'] == False: amb['B'] = True
                    #if not entry['SpokeUni']: entry['SpokeUni'] = xref.xref(idb,self.getStr('UniField'),unique=True,usedict=True)
                    if not entry['SpokeUni']: entry['SpokeUni'] = self.getUniXRef(idb)
                if self.getBool('AddUni') and not entry['SpokeUni']: entry['SpokeUni'] = uni['B']
                if not entry['Spoke'] and entry['SpokeUni']:
                    entry['Spoke'] = entry['SpokeUni']
                    #self.warnLog('UniprotKB "%s" used for Spoke' % entry['SpokeUni'],'unihub',suppress=True)
                    usx += 1
                if not entry['Spoke'] and complexid['B']:
                    entry['Spoke'] = complexid['B']
                else: complexid['B'] = ''
                # NOTE(review): baduni is updated under AddUni for A (above) but under UniOnly for B (below);
                # the asymmetry is preserved here - confirm which condition is intended.
                if self.getBool('UniOnly') and not complexid['B'] and not entry['SpokeUni']:
                    entry['Spoke'] = ''
                    if uni['B'] and uni['B'] not in baduni: baduni.append(uni['B'])
                if complexid['A'] and complexid['B']:
                    if not (complexid['A'].startswith('rigid:') and complexid['B'].startswith('rigid:')):
                        self.printLog('\r#MITAB','',log=False)
                        self.warnLog('Cannot parse complex:complex PPI (%s & %s)' % (complexid['A'],complexid['B']),'complex-complex',suppress=True)
                    entry['Hub'] = entry['Spoke'] = ''
                #self.bugPrint(entry)
                #self.debug(complexid)
                if not (entry['Hub'] and entry['Spoke']):
                    # Counted as ambiguous if each side either mapped or had an ambiguous mapping; else failed
                    if (entry['Hub'] or amb['A']) and (entry['Spoke'] or amb['B']):
                        fax += 1; continue
                    #self.bugPrint(mline); self.debug(entry)
                    fx += 1; continue
                #if self.dev() and 'PCNA' not in [entry['Hub'],entry['Spoke']]: continue
                entry['HubTaxID'] = taxa['A']
                entry['SpokeTaxID'] = taxa['B']
                if complexid['A'] and complexid['A'] not in complexidlist: complexidlist.append(complexid['A'])
                if complexid['B'] and complexid['B'] not in complexidlist: complexidlist.append(complexid['B'])
                #if complexid['A'] or complexid['B']: self.debug(entry)
                ## ~ [2e] Parse evidence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                #self.bugPrint(mline)
                evidence = []
                for tfield in self.list['MethodField']:
                    #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                    for etype in string.split(mline[headers[tfield]],'|'):
                        ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                        if ematch: evidence.append('%s:%s' % (dbsource,ematch[0]))
                if not evidence: evidence.append('%s:unknown' % (self.getStr('DBSource')))
                evidence = rje.sortUnique(evidence)
                #self.debug(evidence)
                entry['Evidence'] = string.join(evidence,'|')
                ## ~ [2f] Parse interaction types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                itypes = []
                for tfield in self.list['TypeField']:
                    #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                    for etype in string.split(mline[headers[tfield]],'|'):
                        ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                        if ematch: itypes.append(ematch[0])
                if not itypes: itypes.append('unknown')
                itypes = rje.sortUnique(itypes)
                #self.debug(itypes)
                entry['IType'] = string.join(itypes,'|')
                pdb.addEntry(entry); ex += 1
                if self.dev() and entry['Hub'] in ['KLF3']:#,'WDR5']:
                    self.printLog('#DEV',string.join(mline,'\t'))
                    #self.bugPrint(uni); self.debug(entry)
                if self.getBool('Symmetry') and not complexid['A'] and not complexid['B']:
                    pdb.addEntry({'#':pdb.entryNum(),'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                  'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                  'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                  'Evidence':entry['Evidence'],'IType':entry['IType']})
            self.printLog('\r#MITAB','Parsing %s MITAB complete: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
            self.close('MITAB')
            if (uhx+usx): self.warnLog('UniprotKB IDs used for %s Hub and %s Spoke IDs.' % (rje.iStr(uhx),rje.iStr(usx)))
            if baduni:
                baduni.sort()
                accout = '%s.%s.unmapped.uniacc' % (self.baseFile(),dbsource)
                self.warnLog('%s unmapped UniprotKB IDs used: output to %s.' % (rje.iLen(baduni),accout))
                # Close the output handle explicitly rather than relying on garbage collection
                ACCOUT = open(accout,'w')
                try: ACCOUT.write(string.join(baduni,'\n'))
                finally: ACCOUT.close()

            ### ~ [3] Convert complexes to pairwise PPIs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not complexidlist: return pdb
            self.printLog('#CPLEX','%s complex IDs parsed to convert to pairwise PPI.' % rje.iLen(complexidlist))
            ## ~ [3a] Assemble complex memberships ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            complexes = {}; chentries = []; csentries = []
            cevidence = {}  # List of Evidence for each complex
            citypes = {}    # List of ITypes for each complex
            ctaxa = {}      # TaxID for each complex member
            ex = 0.0; etot = pdb.entryNum()
            for entry in pdb.entries():
                self.progLog('\r#CPLEX','Assembling complexes: %.1f%%' % (ex/etot)); ex += 100.0
                if entry['Hub'] in complexidlist:
                    cid = entry['Hub']
                    if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                    complexes[cid].append(entry['Spoke'])
                    ctaxa[entry['Spoke']] = entry['SpokeTaxID']
                    cevidence[cid].append(entry['Evidence'])
                    citypes[cid].append(entry['IType'])
                    chentries.append(entry)
                elif entry['Spoke'] in complexidlist:
                    cid = entry['Spoke']
                    if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                    complexes[cid].append(entry['Hub'])
                    ctaxa[entry['Hub']] = entry['HubTaxID']
                    cevidence[cid].append(entry['Evidence'])
                    citypes[cid].append(entry['IType'])
                    csentries.append(entry)
            self.printLog('\r#CPLEX','Assembled %s of %s complexes.' % (rje.iLen(complexes),rje.iLen(complexidlist)))
            #self.debug(complexes)
            ## ~ [3b] Update complexes dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cppi = {}   # {hub:{spoke:[complex IDs containing both]}}
            ex = 0.0; etot = len(complexes); rx = 0; px = 0; cmax = 0
            for cid in rje.sortKeys(complexes):
                self.progLog('\r#CPLEX','Reducing complexes: %.1f%%' % (ex/etot)); ex += 100.0
                if self.dev(): self.printLog('#DEV','Complex %s: %s' % (cid,complexes[cid]))
                if len(complexes[cid]) < 2:     # Cannot make a pair from a single member
                    complexes.pop(cid)
                    cevidence.pop(cid)
                    citypes.pop(cid)
                    rx += 1; continue
                complexes[cid].sort()
                #cevidence[cid] = string.join(rje.sortUnique(cevidence[cid]),'|')
                #citypes[cid] = string.join(rje.sortUnique(citypes[cid]),'|')
                cmax = max(cmax,len(complexes[cid]))
                #px += (len(complexes[cid]) * (len(complexes[cid])-1))
                members = complexes[cid][0:]
                while members:
                    hub = members.pop(0)
                    if self.dev() and hub == 'KLF3': self.debug(cid)
                    if hub not in cppi: cppi[hub] = {}
                    for spoke in members:
                        if spoke not in cppi[hub]:
                            cppi[hub][spoke] = []; px += 1
                        # Record every complex that links this pair (not only the first), so that evidence and
                        # interaction types from all of them are merged in [3c].
                        if cid not in cppi[hub][spoke]: cppi[hub][spoke].append(cid)
            self.printLog('\r#CPLEX','Reduced %s complexes to %s > 1 member: %s ppi to add.' % (rje.iStr(etot),rje.iLen(complexes),rje.iStr(px)))
            ## ~ [3c] Update pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cix = pdb.entryNum()
            for centry in chentries + csentries: pdb.dropEntry(centry)
            ex = 0.0; etot = len(cppi)
            for hub in rje.sortKeys(cppi):
                self.progLog('\r#CPLEX','Expanding complexes: %.1f%%' % (ex/etot)); ex += 100.0
                #hentry = {'Hub':hub,'HubUni':xref.xref(hub,self.getStr('UniField'),unique=True,usedict=True),'HubTaxID':ctaxa[hub]}
                hentry = {'Hub':hub,'HubUni':self.getUniXRef(hub),'HubTaxID':ctaxa[hub]}
                for spoke in rje.sortKeys(cppi[hub]):
                    evidence = []
                    itypes = []
                    ctypes = []
                    for cid in cppi[hub][spoke]:
                        evidence += cevidence[cid]
                        itypes += citypes[cid]
                        # Append the ID prefix as one item: "list += str" would add each character separately
                        ctypes.append(string.split(cid,':')[0])
                    ctype = string.join(rje.sortUnique(ctypes),'|')
                    evidence = string.join(rje.sortUnique(evidence),'|')
                    if not evidence: evidence = '%s:%s' % (dbsource,ctype)
                    itypes = string.join(rje.sortUnique(itypes),'|')
                    if not itypes: itypes = ctype
                    #newentry = {'#':cix,'Spoke':spoke,'SpokeUni':xref.xref(spoke,self.getStr('UniField'),unique=True,usedict=True),'SpokeTaxID':ctaxa[spoke]}
                    newentry = {'#':cix,'Spoke':spoke,'SpokeUni':self.getUniXRef(spoke),'SpokeTaxID':ctaxa[spoke]}
                    newentry['Evidence'] = evidence
                    newentry['IType'] = itypes
                    entry = pdb.addEntry(rje.combineDict(newentry,hentry,overwrite=False)); cix += 1
                    if self.dev() and entry['Hub'] in ['KLF3','WDR5']: self.debug('Complex: %s' % entry)
                    if self.getBool('Symmetry'):
                        pdb.addEntry({'#':cix,'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                      'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                      'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                      'Evidence':entry['Evidence'],'IType':entry['IType']})
                        cix += 1
            self.printLog('#CPLEX','%s complex IDs expanded to pairwise PPI => %s ppi (symmetry=%s).' % (rje.iLen(complexidlist),rje.iStr(pdb.entryNum()),self.getBool('Symmetry')))
            return pdb
        except: self.errorLog('%s.parseMITAB error' % self.prog())
Пример #33
0
 def _digest(self): ### Main digestion of sequences and population of results database
     '''
     Main digestion of sequences and population of results database.

     For each protease combination returned by self.protCombo(), every sequence in self.obj['SeqList'] is cut at
     the recognition sites listed in the module-level proteases dictionary. A first pass over all sequences
     records which fragments occur more than once and the maximum cysteine count; a second pass builds one
     'ProDigIS' table entry per sequence with peptide counts by length (and optionally by weight bin and cysteine
     count) plus Poisson-based values from the 'PepProb' table probabilities.

     No value is returned. Any exception is logged via self.errorLog().
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db('ProDigIS')
         prot_combo = self.protCombo()

         def markCuts(sequence,prot):
             '''Returns sequence with ":" marking each cut site of each "+"-separated protease in prot.'''
             for protease in string.split(prot,'+'):
                 for cut in proteases[protease]:
                     # cut holds a recognition site containing ":"; split on the site without ":" and rejoin with it
                     sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
             return sequence

         ## ~ [1a] ~ Peptide Probability Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # pdict = {PepSize:entry} or, with CysWeight, {PepSize:{CysCount:entry}}
         pdb = self.db('PepProb'); pdict = {}
         if pdb:
             if self.getBool('CysWeight'):
                 for plen in pdb.index('PepSize').keys(): pdict[plen] = {}
                 for entry in pdb.entries(): pdict[entry['PepSize']][entry['CysCount']] = entry
             else:
                 for entry in pdb.entries(): pdict[entry['PepSize']] = entry
         ### ~ [2] Process each protease combination in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.deBug(self.int)
         for prot in prot_combo:
             ## ~ [2a] ~ First pass: identify redundant fragments and maximum Cys count ~~~~~~~~ ##
             # Sets give constant-time membership tests; these collections are only ever used for "in" checks.
             allpep = set(); redundant = set(); maxcys = 0
             sx = 0.0; stot = self.obj['SeqList'].seqNum() 
             for seq in self.obj['SeqList'].seqs():
                 self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (prot,sx/stot)); sx += 100.0
                 for frag in string.split(markCuts(seq.getSequence(),prot),':'):
                     if frag in allpep: redundant.add(frag)
                     else: allpep.add(frag); maxcys = max(maxcys,frag.count('C'))
             self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (prot,rje.iStr(stot)))
             if self.getBool('CysCount'):
                 for c in range(maxcys+1): db.addField('Cys%d' % c)
             ## ~ [2b] ~ Second pass: generate a results entry for each sequence ~~~~~~~~~~~~~~~ ##
             sx = 0.0; stot = self.obj['SeqList'].seqNum() 
             for seq in self.obj['SeqList'].seqs():
                 self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (prot,sx/stot)); sx += 100.0
                 acc = seq.getStr('AccNum')
                 # Create new database entry to fill with data: integer keys count peptides by length and
                 # float keys (multiples of 100.0) count peptides by weight bin.
                 entry = {'AccNum':acc,'Protease':prot}
                 for i in range(1,self.getInt('MaxPepLen')+1):
                     entry[i] = 0
                     if self.getBool('PepMWt'): entry[i*100.0] = 0
                 # Mark cuts with ":" and split into non-empty fragments
                 frag = [pep for pep in string.split(markCuts(seq.getSequence(),prot),':') if pep]
                 self.deBug(frag)
                 entry['PepCount'] = len(frag)
                 if not self.getBool('NTerm'): frag = frag[1:]   # Drop the first fragment
                 if self.getInt('MinPepLen') > 0: 
                     frag = [pep for pep in frag if len(pep) >= self.getInt('MinPepLen')]
                 entry['MinPepLen'] = len(frag)
                 if self.getBool('NRPep'):
                     frag = [pep for pep in frag if pep not in redundant]
                     entry['NRPep'] = len(frag)
                 if self.getBool('CysCount'):
                     for c in range(maxcys+1): entry['Cys%d' % c] = 0
                     for pep in frag: entry['Cys%d' % pep.count('C')] += 1
                 if pdict: entry['LenExp'] = 0.0; entry['MWtExp'] = 0.0; entry['Len7Exp'] = 0.0
                 ## ~ [2c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 for pep in frag:
                     plen = min(len(pep),self.getInt('MaxPepLen'))   # Lengths above MaxPepLen share the top bin
                     self.deBug('"%s" -> %d' % (pep,plen))
                     entry[plen] += 1
                     if pdict:
                         if self.getBool('CysWeight'):
                             try: pprob = pdict[plen][pep.count('C')]['Prob']
                             except: pprob = 0.0
                         else: pprob = pdict[plen]['Prob']
                         entry['LenExp'] += pprob
                         if 7 <= plen: entry['Len7Exp'] += pprob
                     if self.getBool('PepMWt'):
                         pwt = 100.0 * min(int((rje_sequence.MWt(pep)+99)/100.0),self.getInt('MaxPepLen'))
                         entry[pwt] += 1
                         if pdict: entry['MWtExp'] += pprob
                 # NOTE(review): LenExp/MWtExp/Len7Exp are only initialised above when pdict is non-empty, so
                 # with no PepProb data the lookups below raise KeyError (logged by the outer except) - confirm
                 # whether PepProb is guaranteed to be loaded before this method is called.
                 entry['Len3'] = rje.logPoisson(3,entry['LenExp'],callobj=self)
                 if self.getBool('PepMWt'): entry['MWt3'] = rje.logPoisson(3,entry['MWtExp'],callobj=self)
                 entry['Len5'] = rje.logPoisson(5,entry['LenExp'],callobj=self)
                 if self.getBool('PepMWt'): entry['MWt5'] = rje.logPoisson(5,entry['MWtExp'],callobj=self)
                 entry['Len37'] = rje.logPoisson(3,entry['Len7Exp'],callobj=self)
                 db.addEntry(entry)
             self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (prot,rje.iStr(stot)))
     except: self.errorLog('%s._digest error' % self)
Пример #34
0
 def sgd2sp(self):  ### Reformats yeast sequence names and outputs new data for GOPHER
     '''
     Reformats yeast sequence names and outputs new data for GOPHER.

     UniProt entries are read from "<Basefile>.dat" when that file exists and Force is off; otherwise they are
     extracted for the accession numbers indexed in the 'XRef' table and saved to that file. Every sequence in
     self.obj['SeqList'] with SpecCode 'YEAST' is then renamed using its 'XRef' entries (matched on the 'EnsG'
     index). Where a cross-referenced UniProt sequence is found and is identical to the yeast sequence, the first
     word of the name becomes "<UniprotID>__<AccNum>" and the mapping is stored in self.dict['Rename'].
     Finally "<Basefile>.ygob.fas" and "<Basefile>.yeast.fas" are saved if not already present and the yeast
     accession list is stored in self.list['YeastSeq'].

     Any exception is logged and then re-raised.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = self.obj['SeqList']
         uniprot = rje_uniprot.UniProt(self.log,
                                       self.cmd_list + ['datout=None'])
         xdb = self.db('XRef')
         self.dict['Rename'] = {}
         ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         datfile = '%s.dat' % self.info['Basefile']
         reuse = os.path.exists(datfile) and not self.opt['Force']
         if not reuse:
             # No usable file: extract the cross-referenced accession numbers and save for next time
             uniprot.readUniProt(clear=True,
                                 acclist=rje.sortKeys(xdb.index('UniProt')),
                                 cleardata=False)
             uniprot.saveUniProt(datfile)
         else:
             uniprot.readUniProt(datfile, clear=True, cleardata=False)
         ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         acc2seq = {}  # {AccNum: UniProt sequence object}
         for uentry in uniprot.entries():
             uobj = uentry.obj['Sequence']
             acc2seq[uobj.info['AccNum']] = uobj
         useqx = rje.iStr(len(acc2seq))
         xrefx = rje.iStr(len(xdb.index('UniProt')))
         self.printLog(
             '\r#USEQ',
             '%s UniProt Sequences extracted (%s Ensembl AccNum)' %
             (useqx, xrefx))
         ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         yeastseqs = []  # List of YEAST sequence objects
         seqtotal = seqlist.seqNum()
         progress = 0.0
         for yseq in seqlist.seqs():
             self.progLog(
                 '\r#SEQ', 'Reformatting sequence names: %.2f%%' %
                 (progress / seqtotal))
             progress += 100.0
             if seq_is_yeast(yseq) if False else yseq.info['SpecCode'] == 'YEAST':
                 yeastseqs.append(yseq)
             else:
                 continue
             sgd = yseq.info['AccNum']
             newname = yseq.info['Name']
             try:
                 for xentry in xdb.indexEntries('EnsG', sgd):
                     acc = xentry['UniProt']
                     if not acc:
                         # No UniProt accession: annotate the name and try the next cross-reference
                         newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (
                             yseq.info['Name'], xentry['Gene'],
                             xentry['EnsG'], xentry['SGD'])
                         continue
                     newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (
                         yseq.info['Name'], xentry['Gene'], xentry['EnsG'],
                         xentry['SGD'], acc)
                     if acc not in acc2seq:
                         self.printLog(
                             '\r#UNIERR',
                             'Unable to find UniProt sequence %s (%s)' %
                             (acc, sgd))
                         continue
                     if acc2seq[acc].info['Sequence'] != yseq.info['Sequence']:
                         self.printLog(
                             '\r#SEQERR',
                             '%s sequence <> %s sequence' % (sgd, acc))
                         continue
                     # Identical sequence found: replace the first word of the name and stop looking
                     words = newname.split()
                     words[0] = '%s__%s' % (xentry['UniprotID'], acc)
                     newname = ' '.join(words)
                     self.dict['Rename'][sgd] = acc
                     break
             except:
                 self.errorLog('%s problem' % sgd)
             yseq.info['Name'] = newname
             yseq.extractDetails(gnspacc=True)
         self.printLog('\r#SEQ', 'Reformatting sequence names complete.')
         ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         ygobfas = '%s.ygob.fas' % self.info['Basefile']
         if not rje.exists(ygobfas):
             seqlist.saveFasta(seqfile=ygobfas)
         yeastfas = '%s.yeast.fas' % self.info['Basefile']
         if not rje.exists(yeastfas):
             seqlist.saveFasta(seqs=yeastseqs, seqfile=yeastfas)
         self.list['YeastSeq'] = seqlist.accList(yeastseqs)
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Пример #35
0
 def parsePileup(self,tname,filename,wtdb=None):  ### Extracts, filters and processes PileUp data
     '''
     Extracts, filters and processes PileUp data. Each parsed pileup line is written as one row of
     "<baseFile>.<tname>.tdt" and the counts of observed quality scores are saved to "<baseFile>.<tname>.QC.tdt".
     >> tname:str = Name for the table created by addEmptyTable(); also used in the output file names.
     >> filename:str = Pileup file to parse.
     >> wtdb [None] = If this evaluates to True, a WTFreq field is added and filled using self.dict['WTMajor'].
     << table = The table created during setup. Rows are written directly to file and are NOT added to it.
                None is returned if an error was caught (the error is logged via errorLog).
     '''
     import sys
     PILEUP = PILEOUT = QC = None    # File handles: always closed by the finally clause, even after an error
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         table = self.db().addEmptyTable(tname,['Locus','Pos','Seq','N','QN','Major','MajFreq'],keys=['Locus','Pos'])
         qc = []     # qc[i] = number of quality scores equal to i+1 (saved as the QC table in [3])
         if wtdb: table.addField('WTFreq')
         PILEUP = open(filename,'r'); px = 0; ex = 0
         PILEOUT = open('%s.%s.tdt' % (self.baseFile(),tname),'w')
         rje.writeDelimit(PILEOUT,outlist=table.fields(),delimit='\t')
         locus = None
         ### ~ [2] Process each entry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for line in PILEUP:
             # Split line up into data. Should be: locus, position, reference, no. reads, read data, qualscores
             data = rje.chomp(line).split()
             if not data: break      # A blank line ends the parsing
             self.progLog('\r#PARSE','Parsing %s: %s pos...' % (filename,rje.iStr(px)),rand=0.01); px += 1
             ## ~ [2a] Extract Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             entry = {'Locus':data[0],'Pos':int(data[1]),'Seq':data[2],'N':int(data[3]),'QN':0}
             locus = entry['Locus']  # Needed for the WTMajor lookup in [2d]
             rseq = data[4]
             reads = []
             delx = 0    # Number of deletion reads: these add to reads without being part of N
             while rseq:
                 try:
                     # '.' or ',' = read matches the reference base
                     if rseq[:1] in ['.',',']: reads.append(entry['Seq']); rseq = rseq[1:]
                     # '^' and the single character following it are skipped
                     elif rseq[:1] == '^': rseq = rseq[2:]
                     # '-' or '+' is followed by a length and then that many indel characters
                     elif rseq[:1] in ['-','+']:
                         ilen = int(rje.matchExp(r'^(\d+)',rseq[1:])[0])
                         indel = rseq[len('%s' % ilen)+1:][:ilen]
                         if rseq[:1] == '-':     # Deletions become their own "read", e.g. -2AT
                             delx += 1
                             reads.append(rseq[:len('%s' % ilen)+ilen+1].upper())
                         else:                   # Insertions are appended to the preceding read
                             reads[-1] += indel.upper()
                         rseq = rseq[len('%s' % ilen)+ilen+1:]
                     elif rseq[:1] in ['$']: rseq = rseq[1:]
                     else:   # Any other character is stored as the read itself ('*' is converted to '-' in [2b])
                         if rseq[0].upper() not in 'ATGCN*': sys.stdout.write(' ??? %s ???\n' % rseq[0].upper())
                         reads.append(rseq[0].upper()); rseq = rseq[1:]
                 except:
                     self.errorLog('!')
                     self.deBug(rseq)
                     raise ValueError
             if len(reads) != (entry['N'] + delx):
                 self.deBug('%s = %d' % (data[4],entry['N']))
                 self.deBug('%s = %d' % (reads,len(reads)))
                 self.errorLog('Read versus Read Count mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                 raise ValueError
             ## ~ [2b] Convert Quality Scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             qual = []
             for q in data[5]:
                 # Gaps do not have a quality score, so fill these in first
                 while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
                 # Then append actual qv
                 qv = ord(q) - 33
                 qual.append(qv)
                 # The QC bins start at quality 1. A score of 0 has no bin and must not be counted: indexing
                 # qc[qv-1] with qv=0 would wrongly increment the highest bin (or fail if qc is still empty).
                 if qv > 0:
                     qc += [0] * (qv - len(qc)); qc[qv-1] += 1
             while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
             while '*' in reads: reads[reads.index('*')] = '-'
             if len(reads) != len(qual):
                 self.deBug('%s = %d' % (reads,len(reads)))
                 self.deBug('%s = %d' % (qual,len(qual)))
                 self.deBug(data)
                 self.errorLog('Read versus Quality length mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                 raise ValueError
             ## ~ [2c] Filter low quality ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if entry['Pos'] in [190359]:    # Hard-coded debugging position
                 self.deBug(qual)
                 self.deBug(reads)
                 self.deBug(qc)
             # Remove (from back) any reads than do not meet QV cutoff
             for r in range(len(qual)-1,-1,-1):
                 if qual[r] < self.getInt('QCut'): qual.pop(r); reads.pop(r)
             entry['QN'] = len(reads)
             ## ~ [2d] Major Allele ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             alleles = {}    # Dictionary of {nt:count}
             # Setup major allele
             if reads: major = reads[0]
             else: major = '-'; alleles[major] = 0
             # Cycle through reads. Keep most abundant allele as major - or reference allele if tied.
             for read in reads:
                 if read in alleles: alleles[read] += 1
                 else: alleles[read] = 1
                 if alleles[read] > alleles[major] or (read == entry['Seq'] and alleles[read] == alleles[major]): major = read
             entry['Major'] = major
             # The non-major frequency is floored at MinFreq before being subtracted from 1.0
             if reads: entry['MajFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[major]) / float(len(reads)))
             else: entry['MajFreq'] = 0.0
             if wtdb:
                 try:
                     wtmajor = self.dict['WTMajor'][locus][entry['Pos']-1]
                     if wtmajor in alleles and reads: entry['WTFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[wtmajor]) / float(len(reads)))
                     else: entry['WTFreq'] = 0.0
                     if wtmajor != major: self.debug(entry)
                     elif locus == 'chrIV_S288C__BK006938.2' and entry['Pos'] == 271733: self.debug(entry)
                 except: self.warnLog('WTFreq Error (%s:Pos=%d) [Probably no WT read mapped]' % (locus,entry['Pos'])); entry['WTFreq'] = 0.0
             if entry['Pos'] in [190359]:    # Hard-coded debugging position
                 self.deBug(qual)
                 self.deBug(reads)
                 self.deBug(alleles)
                 self.deBug(entry)
                 self.deBug(line)
             # Entries are streamed to file in table field order rather than stored in the table
             outlist = []
             for field in table.fields(): outlist.append(entry[field])
             rje.writeDelimit(PILEOUT,outlist,delimit='\t'); ex += 1
         self.printLog('\r#PARSE','Parsed %s: %s entries from %s lines.' % (filename,rje.iStr(ex),rje.iStr(px)))
         PILEOUT.close()
         PILEUP.close()
         ### ~ [3] Save QC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         QC = open('%s.%s.QC.tdt' % (self.baseFile(),tname),'w')
         QC.write('Qual\tCount\n')
         for (q,count) in enumerate(qc):
             try: QC.write('%d\t%d\n' % (q+1,count))
             except: self.errorLog('!')
         QC.close()
         return table
     except: self.errorLog('%s.parsePileup(%s) error' % (self,filename)); return None
     finally:
         # Closing an already closed file is harmless, so this is safe on the success path too
         for handle in (PILEUP,PILEOUT,QC):
             if handle is not None: handle.close()
Пример #36
0
 def seqSubset2(self):  ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''
     Extracts sequence subset from MOUSE cDNA and Peptide libraries.
     [1] Loads the "<baseFile>.map.tdt" gene mapping table if present. Otherwise [2] maps the genes of the 'starts'
         table via the GeneMap object and the 'xref' table, saves unmapped genes to ingolia.bad.txt, generates the
         Ingolia.cdna.all.fa / Ingolia.pep.all.fa subsets and builds the 'map' table.
     [3] Maps each start site of the 'starts' table onto a transcript whose sequence matches the initiation context
         and outputs IngExact.cdna.all.fa, IngExact.pep.all.fa and "<baseFile>.mapped_exact.tdt".
     << None. Any error is caught and logged via errorLog.
     '''
     ENST = ENSP = None      # Output handles: always closed by the finally clause, even after an error
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         mapfile = '%s.map.tdt' % self.baseFile()
         if os.path.exists(mapfile):
             mdb = self.db().addTable(mapfile, mainkeys=['Ingolia'], name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             # Loaded through self.db() (as for the map table above) so that self.db('xref') can find it below
             self.db().addTable(xfile, mainkeys=['Gene'], name='xref')
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log, self.cmd_list)
             self.obj['Map'].loadData(['sourcedata=%s' % xfile, 'aliases=%s' % afile])
             # Upper case list of gene identifiers from the starts table (keys split on whitespace)
             ing_genes = ' '.join(self.db('starts').index('Gene').keys()).upper().split()
             genemap = self.obj['Map']
             ing_map = {}
             for gene in ing_genes:
                 ing_map[gene] = genemap.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog('#MUSG', '%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:    # Iterate over a copy: failures are removed from ing_mgi itself
                 if gene not in xdb.data():
                     self.printLog('#MAP', 'Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene)
                     ing_mgi.remove(gene)
             self.printLog('#BAD', 'Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
             BAD = open('ingolia.bad.txt', 'w')
             try: BAD.write(' '.join(bad_genes))
             finally: BAD.close()
             ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi), 'EnsEMBL', sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog('#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes), rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError
             self.deBug(ing_musg[:10])
             for stype in ['cdna', 'pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 seqout = 'Ingolia.%s.all.fa' % stype
                 # (Re)generate the subset when forced or when the OUTPUT is missing. The original test was on the
                 # input file, which skipped generation whenever the source library was actually available.
                 if self.getBool('Force') or not os.path.exists(seqout):
                     seqcmd = self.cmd_list + [
                         'seqin=%s' % seqfile,
                         'seqout=%s' % seqout, 'autofilter=T', 'autoload=T',
                         'seqmode=file',
                         'gooddesc=%s' % ','.join(ing_musg)
                     ]
                     rje_seqlist.SeqList(self.log, seqcmd)
             # Key field must match the 'Ingolia' field name (and the mainkeys used when reloading the saved table)
             mdb = self.db().addEmptyTable('map', ['Ingolia', 'Gene', 'EnsEMBL'], ['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia': gene, 'Gene': ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         seqfile = 'Ingolia.cdna.all.fa'
         seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'autofilter=F', 'autoload=T', 'seqmode=file']
         iseq = rje_seqlist.SeqList(self.log, seqcmd)
         if 'ENST' not in mdb.fields():
             # Add comma-separated transcript identifiers to each mapped gene, then save the map table
             mdb.addField('ENST', evalue='')
             while iseq.nextSeq():
                 (iname, icdna) = iseq.getSeq()
                 musg = rje.matchExp(r'gene:(\S+)', iname)[0]
                 for entry in mdb.indexEntries('EnsEMBL', musg):
                     if entry['ENST']: entry['ENST'] += ',%s' % iname.split()[0]
                     else: entry['ENST'] = iname.split()[0]
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]': 'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST')
         sdb.addField('ENSP')
         sdb.addField('ENSI')
         ENST = open('IngExact.cdna.all.fa', 'w')
         ENSP = open('IngExact.pep.all.fa', 'w')
         ex = 0.0                # Progress counter: increases by 100 per entry to give a percentage
         etot = sdb.entryNum()
         sx = 0                  # Number of start sites output
         fx = 0                  # Number of start sites that failed
         minpep = 20             # Minimum length for an output peptide
         for entry in sdb.entries():
             self.progLog('\r#ING', 'Mapping Ignolia Harrington Starts: %.2f%%' % (ex / etot))
             ex += 100.0
             entry[icon] = entry[icon].upper()
             gene = entry['Gene'].upper()
             mentry = mdb.data(gene)
             entry['ENST'] = entry['ENSI'] = ''
             cdnaseq = peptseq = ''
             if not mentry or not mentry['ENST']:
                 fx += 1
                 continue
             mtype = 'fail'
             # Keep the transcript matching the initiation context that gives the longest peptide
             for trans in mentry['ENST'].split(','):
                 (tname, tseq) = iseq.getDictSeq(trans, format='tuple')
                 self.deBug('%s vs %s' % (tseq[entry[icod] - 3:][:7], entry[icon]))
                 if tseq[entry[icod] - 3:][:7] == entry[icon]:
                     # Translate from the start position and truncate at the first stop codon
                     ipept = rje_sequence.dna2prot(tseq[entry[icod]:]).split('*')[0]
                     self.deBug(ipept)
                     if len(ipept) > len(peptseq):
                         entry['ENST'] = trans
                         cdnaseq = tseq
                         peptseq = ipept
                         mtype = 'exact'
             if not entry['ENST']:
                 self.printLog('\r#ING', 'Unable to find Harrington start for %s %s (%s)' % (gene, entry[icod], entry[icon]), screen=False)
                 fx += 1
                 continue
             elif len(peptseq) < minpep:
                 self.printLog('\r#ING', 'Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene, entry[icod], entry[icon]), screen=False)
                 fx += 1
                 continue
             # Identifier number is the 1-based position of the entry in the starts table
             ingid = rje.preZero(int(ex / 100), etot)
             entry['ENSI'] = 'ENSINGT%s' % ingid
             entry['ENSP'] = 'ENSINGP%s' % ingid
             ENST.write(
                 '>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n'
                 % (ingid, mtype, entry['ENST'], mentry['EnsEMBL'],
                    entry['Gene'], mentry['Gene'], cdnaseq))
             ENSP.write(
                 '>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n'
                 % (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], ingid,
                    entry['Gene'], mentry['Gene'], peptseq))
             sx += 1
         sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         ENST.close()
         ENSP.close()
         self.printLog('\r#ING', 'Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx), rje.iStr(fx)))
         return
     except:
         self.errorLog('%s.method error' % self)
     finally:
         # Closing an already closed file is harmless, so this is safe on the success path too
         for handle in (ENST, ENSP):
             if handle is not None: handle.close()
Пример #37
0
 def mapToTaxID(self,taxa,nodeonly=False,rankonly=False,log=True,warn=True):  ### Maps taxa onto TaxID. If taxa is a list, will process each element.
     '''
     Maps taxa onto TaxID. If taxa is a list, will process each element. Returns a list.
     >> taxa:str or list = Numeric TaxID, species code (upper case, no spaces) or taxon name - or a list of these.
     >> nodeonly:bool [False] = For a numeric TaxID, return only that TaxID rather than it plus all its descendants
                                in self.dict['TaxMap'].
     >> rankonly:bool [False] = Only return TaxID that are found in self.list['RankID'].
     >> log:bool [True] = Whether to log progress and summary for list input.
     >> warn:bool [True] = Whether to log warnings for missing or ambiguous taxa. Passed on to all recursive calls.
     << taxid:list = List of mapped TaxID (sorted and non-redundant for list input). Empty if nothing mapped.
     NOTE: the input list is no longer sorted in place; the caller's list is left untouched.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not taxa: return []
         taxid = []
         ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if isinstance(taxa,list):
             ttot = len(taxa)
             if ttot > 1:
                 tx = 0.0
                 for t in taxa:
                     if log: self.progLog('\r#TAXID','Mapping to TaxID: %.1f%%' % (tx/ttot)); tx += 100.0
                     taxid += self.mapToTaxID(t,nodeonly,rankonly,log=False,warn=warn)
                 taxid = rje.sortUnique(taxid)
                 if log: self.printLog('\r#TAXID','Mapped %s taxa to %s TaxID' % (rje.iStr(ttot),rje.iLen(taxid)))
             else:
                 t = taxa[0]
                 if log: self.progLog('\r#TAXID','Mapping %s to TaxID...' % t)
                 taxid = rje.sortUnique(self.mapToTaxID(t,nodeonly,rankonly,log=False,warn=warn))
                 if log: self.printLog('\r#TAXID','Mapped %s to %s TaxID' % (t,rje.iLen(taxid)))
             return taxid
         ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxmap = self.dict['TaxMap']; rankid = self.list['RankID']
         taxa = '%s' % taxa
         ## ~ [2a] Taxa ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if rje.matchExp(r'^(\d+)$', taxa):
             if nodeonly:
                 if taxa in rankid or not rankonly: return [taxa]
                 else: return []
             if taxa not in taxmap:
                 if warn: self.warnLog('Cannot find TaxID %s!' % taxa,'Missing_TaxID',suppress=True)
                 return []
             # Breadth-first walk from the TaxID through everything listed under it in taxmap
             parents = [taxa]
             while parents:
                 taxa = parents.pop(0)
                 if not rankonly or taxa in rankid: taxid.append(taxa)
                 parents += taxmap[taxa]
             return taxid
         # NOTE(review): taxa is inserted unescaped into the shell commands and regular expressions used below,
         # so names containing quotes or regex/shell metacharacters will misbehave - confirm inputs are trusted.
         ## ~ [2b] Species Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if taxa == taxa.upper().replace(' ',''):
             greplines = os.popen('grep "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
             for entry in greplines:
                 # Lines that do not match the expected format are deliberately ignored
                 try: taxid.append(rje.matchExp(r'^%s\s+\S+\s+(\d+):' % taxa,entry)[0])
                 except: pass
             if not taxid and warn: self.warnLog('Cannot find Species Code "%s"!' % taxa,'Missing_SpCode',suppress=True)
             if len(taxid) > 1 and warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), '|'.join(taxid)))
             return self.mapToTaxID(taxid,nodeonly,rankonly,log=False,warn=warn)
         ### ~ [3] Species name etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxa = taxa.replace('_',' ')
         ## ~ [3a] Grep from Uniprot ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         greplines = os.popen('grep -B 2 -i "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
         gtaxid = None; comid = []; synid = []
         for entry in greplines:
             # Remember the most recent TaxID line; name lines that follow are assigned to it
             try: gtaxid = rje.matchExp(r'^\S+\s+\S+\s+(\d+):',entry)[0]
             except: pass
             if rje.matchExp(r's=(%s)\s*$' % taxa.lower(),entry.lower()): synid.append(gtaxid)
             elif rje.matchExp(r'c=(%s)\s*$' % taxa.lower(),entry.lower()): comid.append(gtaxid)
             elif rje.matchExp(r'=(%s)\s*$' % taxa.lower(),entry.lower()): taxid.append(gtaxid)
         # Other "=name" matches take priority over "c=" matches, which take priority over "s=" matches
         if not taxid: taxid = comid
         if not taxid: taxid = synid
         if not taxid and warn: self.warnLog('Cannot find Taxon name "%s" in Uniprot!' % taxa,'Missing Taxon',suppress=True)
         if len(taxid) > 1:
             if warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), '|'.join(taxid)))
         if taxid: return self.mapToTaxID(taxid,nodeonly,rankonly,log=False,warn=warn)
         ## ~ [3b] Grep from NCBI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         greplines = os.popen('grep -i -e "\t%s\t" %s' % (taxa, self.getStr('NameMap'))).readlines()
         for entry in greplines:
             try:
                 gtaxid = entry.split('\t|\t')
                 if gtaxid[1].lower() == taxa.lower(): taxid.append(gtaxid[0])
                 elif gtaxid[2] and gtaxid[2].lower() == taxa.lower(): taxid.append(gtaxid[0])
             except: pass    # Lines with too few fields are deliberately ignored
         if len(taxid) > 1 and warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), '|'.join(taxid)))
         return self.mapToTaxID(taxid,nodeonly,rankonly,log=False,warn=warn)
     except: self.errorLog('%s.mapToTaxID() error' % (self)); raise
Пример #38
0
 def setup(self):  ### Main class setup method.
     '''
     Main class setup method. Sets up the database object and accuracy list, resolves the SMRTUnits setting
     (converting a Gb/Mb SMRTReads value into a number of reads) and tidies the XnList into sorted integers > 1.
     << True if setup was successful; False if an error was caught (the error is logged).
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log, self.cmd_list)
         self.db().basefile(self.basefile())
         self.list['Accuracy'] = [0, 1.0 - self.getNum('ErrPerBase')]
         ## ~ [1a] SMRTReads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         while self.getStrLC('SMRTUnits') not in ['reads', 'gb', 'mb']:
             # Report the rejected value (previously the "%s" placeholder was never filled in)
             txt = 'SMRTUnits "%s" not recognised' % self.getStr('SMRTUnits')
             # Guess the intended units from the magnitude of SMRTReads
             if self.getNum('SMRTReads') < 10: smrtunits = 'Gb'
             elif self.getNum('SMRTReads') > 10000: smrtunits = 'reads'
             else: smrtunits = 'Mb'
             if self.i() < 0 or rje.yesNo('%s: switch to (%s) %s?' % (txt, self.getNum('SMRTReads'), smrtunits)):
                 self.setStr({'SMRTUnits': smrtunits})
             elif self.i() > 0:
                 self.setStr({'SMRTUnits': rje.choice('SMRTUnits (reads/Gb/Mb)?')})
             self.printLog('#UNITS', '%s => %s' % (txt, self.getStr('SMRTUnits')))
         if self.getStrLC('SMRTUnits') in ['gb', 'mb']:
             # Convert a total in Gb/Mb into a number of reads using the AvRead setting
             smrttotal = self.getNum('SMRTReads') * {'gb': 1e9, 'mb': 1e6}[self.getStrLC('SMRTUnits')]
             txt = '%s %s @ %.3f kb/read' % (self.getNum('SMRTReads'),
                                             self.getStr('SMRTUnits'),
                                             self.getNum('AvRead') / 1000.0)
             self.setNum({'SMRTReads': smrttotal / self.getNum('AvRead')})
             txt += ' => %s reads' % rje.iStr(int(self.getNum('SMRTReads')))
             self.printLog('#READS', txt)
         ## ~ [1b] XnList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         xnlist = []
         for xn in self.list['XnList']:
             if xn == '': continue
             try:
                 ixn = int(xn)
                 # Report any value changed by the integer conversion
                 if xn not in [ixn, '%d' % ixn]:
                     self.printLog('#XN', '"%s" -> %dX' % (xn, ixn))
                 if ixn == 0:
                     self.printLog('#XN', 'No point in 0X output: use 1-%Coverage.')
                 elif ixn == 1:
                     self.printLog('#XN', 'No point in 1X output: use %Coverage.')
                 else:
                     xnlist.append(ixn)
             except:
                 self.errorLog('Could not process %s as part of XnList. (Integers only.)' % xn)
         xnlist.sort()
         if xnlist:
             # Joined directly (e.g. "5X, 10X.") rather than by splitting the list repr, which doubled the spaces
             self.printLog('#XN', 'XnList: %sX.' % 'X, '.join(['%d' % ixn for ixn in xnlist]))
         self.list['XnList'] = xnlist
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self.prog())
         return False  # Setup failed
Пример #39
0
 def difference(self,table1,table2): ### Generates differences as new table
     '''
     Generates differences as new table. The table whose name ends in the larger number is treated as the new
     export. Each of its tracks is given a Status of New, Changed or Unchanged; tracks found only in the old
     table are added with Status Deleted. The difference table is saved to "<difference table name>.tdt".
     >> table1:Table = iTunes database table to compare
     >> table2:Table = iTunes database table to compare
     << None. Errors are caught and logged via errorLog.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         dfields = ['Name','Artist','Composer','Album','Album_Artist','Genre','Time','Disc Number','Disc Count','Track Number','Track Count','Year','Date Added','Plays','Last Played','Skips','Last Skipped','My Rating','Location','Tracks','Score']
         db = self.db()
         tabindex = '#Artist#|#Album#|#Track Number#|#Name#'
         try:
             # The last "."-separated element of each table name must be an integer: larger = newer
             age1 = int(table1.name().split('.')[-1])
             age2 = int(table2.name().split('.')[-1])
             table1.index(tabindex,make=True)
             table2.index(tabindex,make=True)
             if age1 < age2: oldtable = table1; newtable = table2
             else: newtable = table1; oldtable = table2
             diftable = db.copyTable(newtable,'%s-%s' % (oldtable.name(),newtable.name().split('.')[-1]))
             diftable.keepFields(dfields+[tabindex])
             diftable.addField('Status')
         except:
             # Nothing can be compared without the tables above: stop here rather than fall through to a
             # guaranteed NameError and a second, misleading error message.
             self.errorLog('Cannot generate differences for %s and %s' % (table1,table2))
             return
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#NEW','%s tracks in new iTunes export.' % rje.iStr(newtable.entryNum()))
         self.printLog('#OLD','%s tracks in old iTunes export.' % rje.iStr(oldtable.entryNum()))
         oldfiles = oldtable.datakeys()[0:]     # Old table keys in their original order (used for output in [3])
         unmatched = set(oldfiles)               # Old keys not yet paired with a new entry (fast membership tests)
         for entry in diftable.entries():
             ## ~ [2a] Find pair of entries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if entry['Location'] in unmatched: oldentry = oldtable.data(entry['Location'])
             elif entry[tabindex] in oldtable.index(tabindex):
                 candidates = oldtable.indexEntries(tabindex,entry[tabindex])
                 oldentry = candidates[0]
                 if len(candidates) != 1:
                     # Prefer a duplicate that has not already been paired with another new entry
                     self.printLog('#DUP','Duplicate entries for %s' % entry[tabindex])
                     for ientry in candidates:
                         if ientry['Location'] in unmatched: oldentry = ientry; break
             else: oldentry = None
             ## ~ [2b] Generate Differences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if not oldentry:
                 entry['Status'] = 'New'
                 continue
             unmatched.discard(oldentry['Location'])
             changed = False
             for field in ['Plays','Skips','My Rating']:
                 if entry[field] != oldentry[field]: changed = True
                 try: entry[field] -= oldentry[field]
                 except: pass    # Keep new value - probably empty in old entry
             if changed: entry['Status'] = 'Changed'
             else: entry['Status'] = 'Unchanged'
         ### ~ [3] Add missing old entries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         reportdel = rje.yesNo('Report deleted %s tracks?' % diftable.name())
         for old in oldfiles:
             if old not in unmatched: continue
             entry = diftable.addEntry(oldtable.data(old))
             entry['Status'] = 'Deleted'
             if reportdel: self.printLog('#DEL','%s: %s [%s]' % (entry['Artist'],entry['Name'],entry['Album']))
         ### ~ [4] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for status in rje.sortKeys(diftable.index('Status')):
             self.printLog('#STAT','%s: %d tracks' % (status.upper(),len(diftable.index('Status')[status])))
         self.printLog('#TRACK','%s tracks in total' % rje.iStr(diftable.entryNum()))
         self.deBug('?')
         # Remove the temporary index field again from the input tables as well as the output
         for table in [table1,table2,diftable]: table.dropField(tabindex)
         diftable.saveToFile('%s.tdt' % diftable.name())
     except: self.errorLog('%s.difference() error' % self)
Пример #40
0
 def summaryScores(self,rankdb=None,sumstr='taxasum',minsum='MinSum'):   ### Generates summary scores from rank table.
     '''
     Generates summary scores from rank table. For each of genus, family, order, class and phylum, the taxa of
     every rank table entry are counted (an entry listing several "|"-separated taxa is shared equally between
     them) and bootstrap-weighted. Taxa with a count below the minsum setting are pooled into "Other".
     >> rankdb:Table [None] = Table to summarise. Uses self.db('taxamap') if not given.
     >> sumstr:str ['taxasum'] = Name of the summary table to create.
     >> minsum:str ['MinSum'] = Name of the numeric setting (read with getNum) holding the minimum count.
     << None. The summary table is ranked by count and bootwt and saved to file. Errors are caught and logged.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         if not rankdb: rankdb = self.db('taxamap')
         sumdb = db.addEmptyTable(sumstr,['rank','taxon','count','bootwt','meanboot','perc','wtperc'],['rank','taxon'])
         ranks = ['genus','family','order','class','phylum']
         ### ~ [2] Normalise to reduced levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for rank in ranks:
             self.printLog('\r#RANK','Normalising %s data.' % rank)
             taxsum = {}; ranksum = 0.0  # Summed counts for taxa and rank total
             taxwt = {}; wtsum = 0.0     # Bootstrap-weighted summed counts for taxa and rank total
             bootsum = {}; bootx = {}    # Sum and count of bootstrap values for mean boot numbers
             for entry in rankdb.entries():
                 taxa = entry[rank].split('|')
                 for taxon in taxa:
                     if taxon in self.list['TaxFilter']: continue
                     if taxon not in taxsum:
                         taxsum[taxon] = 0.0; taxwt[taxon] = 0.0
                         bootsum[taxon] = 0.0; bootx[taxon] = 0
                     # Filtered taxa are skipped but still count towards the share denominator len(taxa)
                     taxsum[taxon] += 1.0 / len(taxa)
                     ranksum += 1.0 / len(taxa)
                     taxweight = entry['boot']
                     bootsum[taxon] += entry['boot']; bootx[taxon] += 1
                     taxwt[taxon] += taxweight / len(taxa)
                     wtsum += taxweight / len(taxa)
             otherx = 0      # Number of taxa pooled into "Other"
             for taxon in rje.sortKeys(taxsum):
                 if taxon == 'Other': continue
                 if taxsum[taxon] < self.getNum(minsum):
                     if 'Other' not in taxsum:
                         taxsum['Other'] = 0.0
                         taxwt['Other'] = 0.0
                         bootsum['Other'] = 0.0
                         bootx['Other'] = 0.0
                     taxsum['Other'] += taxsum.pop(taxon)
                     taxwt['Other'] += taxwt.pop(taxon)
                     bootsum['Other'] += bootsum.pop(taxon)
                     bootx['Other'] += bootx.pop(taxon)
                     otherx += 1
             self.printLog('#MINSUM','%s %s taxa converted to "Other" (count < minsum=%s)' % (rje.iStr(otherx),rank,self.getNum(minsum)))
             for taxon in taxsum:
                 # If every counted weight is zero there is no weighted total to divide by: report 0% rather
                 # than raise ZeroDivisionError and lose the whole summary.
                 if wtsum: wtperc = 100.0*taxwt[taxon]/wtsum
                 else: wtperc = 0.0
                 sumdb.addEntry({'rank':rank,'taxon':taxon,'count':rje.dp(taxsum[taxon],1),
                                 'perc':rje.sf(100.0*taxsum[taxon]/ranksum),
                                 'bootwt':rje.dp(taxwt[taxon],1),'meanboot':rje.dp(bootsum[taxon]/bootx[taxon],3),
                                 'wtperc':rje.sf(wtperc)})
         ## ~ [2a] Rank taxa by counts such that highest is Rank 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         sumdb.rankFieldByIndex('rank','count',rev=True,absolute=True,lowest=True)
         sumdb.rankFieldByIndex('rank','bootwt',rev=True,absolute=True,lowest=True)
         ## ~ [2b] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         sumdb.saveToFile()
     except: self.errorLog('%s.summaryScores error' % self.prog())
Пример #41
0
 def tidyMotifNames(self,dbtable):    ### Tidy the motif names in given dbtable
     '''
     Tidy the motif names in given dbtable, replacing each 'motif' value with the core name returned by the
     SLiMList object. Table keys are remade if any name changed.
     >> dbtable:Table = Database table to update. Left untouched if it has no 'motif' field.
     << renamed:int = Number of entries whose motif name was changed.
     '''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         corenamer = self.obj['SLiMList']
         if 'motif' not in dbtable.fields():
             return 0
         ### ~ [1] ~ Rename ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         renamed = 0
         for row in dbtable.entries():
             tidied = corenamer.slimCoreName(row['motif'])
             if tidied == row['motif']:
                 continue
             row['motif'] = tidied
             renamed += 1
         self.printLog('#MOTIF','%s motif names corrected for SLiMList splitting.' % rje.iStr(renamed))
         # Keys only need remaking when at least one motif name was altered
         if renamed:
             dbtable.remakeKeys()
         return renamed
     except:
         self.errorLog('Problem during %s tidyMotifNames.' % self.prog())
         raise
Пример #42
0
 def difference(self, table1,
                table2):  ### Generates differences as new table
     '''
     Generates differences between two iTunes export tables as a new table.

     The integer after the last '.' of each table name decides which table is
     the old export and which is the new one (smaller number = old; on a tie
     table1 is treated as new). A copy of the new table, named
     '<oldname>-<newsuffix>', is reduced to the comparison fields, given a
     'Status' field (New/Changed/Unchanged/Deleted) and saved to '<name>.tdt'.
     For matched tracks, 'Plays', 'Skips' and 'My Rating' are replaced with
     (new - old) wherever that subtraction works. Nothing is returned; errors
     are reported through errorLog().
     >> table1:Table = iTunes database table to compare
     >> table2:Table = iTunes database table to compare
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         dfields = [
             'Name', 'Artist', 'Composer', 'Album', 'Album_Artist', 'Genre',
             'Time', 'Disc Number', 'Disc Count', 'Track Number',
             'Track Count', 'Year', 'Date Added', 'Plays', 'Last Played',
             'Skips', 'Last Skipped', 'My Rating', 'Location', 'Tracks',
             'Score'
         ]
         db = self.db()
         tabindex = '#Artist#|#Album#|#Track Number#|#Name#'
         try:
             age1 = string.atoi(string.split(table1.name(), '.')[-1])
             age2 = string.atoi(string.split(table2.name(), '.')[-1])
             table1.index(tabindex, make=True)
             table2.index(tabindex, make=True)
             if age1 < age2:
                 oldtable = table1
                 newtable = table2
             else:
                 newtable = table1
                 oldtable = table2
             diftable = db.copyTable(
                 newtable, '%s-%s' %
                 (oldtable.name(), string.split(newtable.name(), '.')[-1]))
             diftable.keepFields(dfields + [tabindex])
             diftable.addField('Status')
         except:
             self.errorLog('Cannot generate differences for %s and %s' %
                           (table1, table2))
             # Stop here: oldtable/newtable/diftable may be unbound, so carrying
             # on would only raise a second, misleading error further down.
             return
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog(
             '#NEW', '%s tracks in new iTunes export.' %
             rje.iStr(newtable.entryNum()))
         self.printLog(
             '#OLD', '%s tracks in old iTunes export.' %
             rje.iStr(oldtable.entryNum()))
         oldfiles = oldtable.datakeys()[0:]  # Ordered copy, used for the final "Deleted" pass
         unmatched = set(oldfiles)  # Old keys not yet paired with a new entry (O(1) lookups)
         for entry in diftable.entries():
             ## ~ [2a] Find pair of entries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if entry['Location'] in unmatched:
                 oldentry = oldtable.data(entry['Location'])
             elif entry[tabindex] in oldtable.index(tabindex):
                 candidates = oldtable.indexEntries(tabindex, entry[tabindex])
                 oldentry = candidates[0]
                 if len(candidates) != 1:
                     self.printLog(
                         '#DUP',
                         'Duplicate entries for %s' % entry[tabindex])
                     # Prefer the first duplicate that has not been paired yet
                     for ientry in candidates:
                         if ientry['Location'] in unmatched:
                             oldentry = ientry
                             break
             else:
                 oldentry = None
             ## ~ [2b] Generate Differences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if not oldentry:
                 entry['Status'] = 'New'
                 continue
             unmatched.discard(oldentry['Location'])
             changed = False
             for field in ['Plays', 'Skips', 'My Rating']:
                 if entry[field] != oldentry[field]: changed = True
                 try:
                     entry[field] -= oldentry[field]
                 except:
                     pass  # Keep new value - probably empty in old entry
             if changed: entry['Status'] = 'Changed'
             else: entry['Status'] = 'Unchanged'
         ### ~ [3] Add missing old entries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         reportdel = rje.yesNo('Report deleted %s tracks?' %
                               diftable.name())
         for old in oldfiles:
             if old not in unmatched: continue  # Paired with a new entry above
             entry = diftable.addEntry(oldtable.data(old))
             entry['Status'] = 'Deleted'
             if reportdel:
                 self.printLog(
                     '#DEL', '%s: %s [%s]' %
                     (entry['Artist'], entry['Name'], entry['Album']))
         ### ~ [4] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for status in rje.sortKeys(diftable.index('Status')):
             self.printLog(
                 '#STAT', '%s: %d tracks' %
                 (status.upper(), len(diftable.index('Status')[status])))
         self.printLog('#TRACK',
                       '%s tracks in total' % rje.iStr(diftable.entryNum()))
         self.deBug('?')
         # The combined index field was only needed for matching; drop it everywhere
         for table in [table1, table2, diftable]:
             table.dropField(tabindex)
         diftable.saveToFile('%s.tdt' % diftable.name())
     except:
         self.errorLog('%s.difference() error' % self)
Пример #43
0
 def codonUsageEntropyBias(self):   ### Calculate bias in Codon Usage using Entropy-based measure
     '''
     Calculate bias in Codon Usage using Entropy-based measure.

     For every entry of the 'Codons' table, and the 'Expected' table entry with the same 'Seq' key, the
     rje.entropyDict() value over each amino acid's codons is summed into 'Bias' (observed) and 'ExpBias'
     (expected). 'WtBias' and 'ExpWtBias' are the same sums with each amino acid's term multiplied by that
     amino acid's share of the observed codon counts. Results go into a new 'Bias' table, which is saved.
     Nothing is returned; errors are reported through errorLog().
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         aacode = self.db('Code').index('AA')    # AA -> list of its codons
         cdb = self.db('Codons'); edb = self.db('Expected')
         ## ~ [1a] Setup bias table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         bdb = self.db().addEmptyTable('Bias',['Seq','Len','Bias','ExpBias','WtBias','ExpWtBias'],['Seq'])
         ### ~ [2] Calculate Frequencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         x = 0.0; etot = cdb.entryNum()
         for codentry in cdb.entries():
             self.progLog('\r#BIAS','Calculating Bias: %.2f%%' % (x/etot)); x += 100.0
             expentry = edb.data(codentry['Seq'])
             entry = {'Seq':codentry['Seq'],'Len':codentry['Len'],'Bias':0.0,'ExpBias':0.0,'WtBias':0.0,'ExpWtBias':0.0}
             # Observed codon total per amino acid, converted in place by rje.dictFreq()
             aafreq = {}
             for aa in aacode:
                 aafreq[aa] = 0.0
                 for code in aacode[aa]: aafreq[aa] += codentry[code]
             rje.dictFreq(aafreq,total=False)
             for aa in aacode:
                 # Each entropy feeds both the plain and the weighted sum, so compute it once
                 obsent = rje.entropyDict(codentry,aacode[aa])
                 expent = rje.entropyDict(expentry,aacode[aa])
                 entry['Bias'] += obsent
                 entry['ExpBias'] += expent
                 entry['WtBias'] += (aafreq[aa] * obsent)
                 entry['ExpWtBias'] += (aafreq[aa] * expent)
             bdb.addEntry(entry)
         self.printLog('\r#BIAS','Codon Usage entropy bias calculated for %s entries' % rje.iStr(etot))
         bdb.saveToFile()
     except: self.errorLog('%s.codonUsageEntropyBias error' % self)
Пример #44
0
    def combineSNPs(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
        '''
        Combines the 'fdr' table with the SNP tables listed in self.list['SNPTables'].

        SNP positions missing from the 'fdr' table are pulled in from '<basefile>.pdiff.tdt' (with p.FDR set
        to '-'). Each SNP table is then joined on Locus and Pos, contributing one field, named after the
        table, that is filled from that table's 'SNP' field. The combined table is renamed 'SNPs' and saved.
        << returns the combined table; False if there are no SNP tables or no extra positions; None on error
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not self.list['SNPTables']: self.printLog('\r#SNP','No SNP tables to add.'); return False
            fdb = self.db().addTable(name='fdr',expect=True,mainkeys=['Locus','Pos'])
            fdb.remakeKeys()   #!# Delete once tuple thing OK
            fdbkeys = fdb.dataKeys()
            self.debug(fdbkeys[:100])
            snps = []
            known = set(fdbkeys)    # Positions already in fdr table or already queued for adding
            snppos = set()          # Positions still to be pulled from the PDiff file
            for snptable in self.list['SNPTables']:
                snpdb = self.db().addTable(snptable,name=rje.baseFile(snptable,True),expect=True,mainkeys=['Locus','Pos'])
                snps.append(snpdb)
                snpdb.addField('SNP',evalue="YES")
                self.debug(snpdb.dataKeys()[:100])
                snpdb.remakeKeys()   #!# Delete once tuple thing OK
                self.debug(snpdb.dataKeys()[:100])
                px = 0; ptot = snpdb.entryNum(); sx = 0
                for pos in snpdb.dataKeys(): # This should be a (Locus,Pos) tuple
                    self.progLog('\r#SNP','Scanning %s for extra SNP positions: %.2f%%' % (snpdb.name(),px/ptot)); px += 100.0
                    if pos not in known: known.add(pos); snppos.add(pos); sx += 1
                self.printLog('\r#SNP','Scanned %s for extra SNP positions: %s to add.' % (snpdb.name(),rje.iStr(sx)))
            ## ~ [0a] Add missing data from other tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if snppos:
                SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r'); px = 0; ptot = len(snppos); ix = 0
                try:    # Make sure the file handle is released even if parsing fails
                    fline = SAMSIG.readline(); headers = rje.readDelimit(fline)
                    fline = SAMSIG.readline()
                    self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
                    while fline:
                        data = rje.readDelimit(fline); ix += 1
                        poskey = (data[0],data[1])  # First two columns are matched against the (Locus,Pos) keys
                        if poskey in snppos:
                            entry = {'p.FDR':'-'}
                            for i, value in enumerate(data): entry[headers[i]] = value
                            fdb.addEntry(entry); px += 1
                            snppos.remove(poskey)
                        self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
                        if not snppos: break    # Everything found: no need to read the rest of the file
                        fline = SAMSIG.readline()
                finally: SAMSIG.close()
                self.printLog('\r#SNP','%s/%s SNP positions added from PDiff file.' % (rje.iStr(px),rje.iStr(ptot)))
            else: self.printLog('\r#SNP','No SNP positions to add.'); return False

            ### ~ [1] Join Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            temp = fdb
            temp.makeField('#Locus#|#Pos#')
            for snptable in snps:
                snptable.makeField('#Locus#|#Pos#')
                newtemp = self.db().joinTables(name='newtemp',join=[(temp,'#Locus#|#Pos#'),(snptable,'#Locus#|#Pos#',['SNP'])],newkey=['Locus','Pos'],keeptable=True)
                self.printLog('#SNP','Added SNPs from %s' % snptable.name())
                self.db().deleteTable(temp)
                temp = newtemp
                # The joined 'SNP' field is renamed after its source table so the next join can add its own
                temp.renameField('SNP',snptable.name())
                temp.setStr({'Name':'temp'})
            temp.dropField('#Locus#|#Pos#')
            self.db().list['Tables'].append(temp)
            temp.setStr({'Name':'SNPs'})
            temp.saveToFile()
            return temp
        except: self.errorLog('%s.combineSNPs() error' % (self)); return None
Пример #45
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method.

     Creates the Database object (self.obj['DB']), loads the sequence lists into self.dict['SeqList'] and
     makes sure a 'data' table is available: either loaded from '<basefile>.data.tdt' or, if that file is
     missing or 'Force' is set, compiled by joining the individual '<basefile>.*.tdt' part tables. When
     compiled, the table is saved to '<basefile>.data.tdt' and a reduced-field copy to '<basefile>.cutdata.tdt'.
     << True if setup ran to completion; False if any exception was raised (logged via errorLog()).
        NOTE: the early exit taken when sequences load directly from the command-line options is a bare
        `return`, i.e. it returns None rather than True.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
         seqcmd = self.cmd_list + ['autoload=T','seqmode=file','seqindex=T']
         dfile = '%s.data.tdt' % self.basefile()
         ### ~ [2] Load Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # First attempt: no seqin is added here, so any sequences come from the existing command list
         self.dict['SeqList']['full'] = rje_seqlist.SeqList(self.log,seqcmd + ['seqmode=list'])
         self.debug(self.dict['SeqList']['full'].seqNum())
         # If that loaded anything, stop here: nothing below is executed and None is returned
         if self.dict['SeqList']['full'].seqNum(): return
         self.dict['SeqList']['full'] = rje_seqlist.SeqList(self.log,seqcmd + ['seqin=%s.full.fas' % (self.basefile()),'seqmode=list'])
         for stype in ['CDS','gene','prot']:
             seq = self.dict['SeqList'][stype] = rje_seqlist.SeqList(self.log,seqcmd + ['seqin=%s.%s.fas' % (self.basefile(),stype)])
             # SeqDict: last '_'-separated element of the first word of each sequence name -> sequence
             seq.dict['SeqDict'] = {}
             for s in seq.list['Seq']:
                 (name,sequence) = seq.getSeq(s)
                 seq.dict['SeqDict'][string.split(string.split(name)[0],'_')[-1]] = s
         ### ~ [3] Database Compilation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Reuse a previously compiled data table unless a rebuild is forced
         if rje.exists(dfile) and not self.getBool('Force'): db.addTable(dfile,name='data',mainkeys=['tag'])
         else:
             ## ~ [3a] ~ Load part tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             fdb = db.addTable('%s.function.tdt' % self.basefile(),name='function',mainkeys=['tag'])
             fdb.dropField('description')
             edb = db.addTable('%s.expression.tdt' % self.basefile(),name='expression',mainkeys=['key'])
             nx = 0
             edb.fillBlanks(blank='0',fillempty=True)
             # Replace 'na' values in the expression table with '0.0', counting the replacements
             for ekey in rje.sortKeys(edb.data()):
                 entry = edb.data(ekey)
                 for field in edb.fields():
                     if entry[field] == 'na': entry[field] = '0.0'; nx += 1
             self.printLog('#TDT','Updated %s entries for expression table' % rje.iStr(nx))
             kdb = db.addTable('%s.proteinkey.tdt' % self.basefile(),name='proteinkey',mainkeys=['key'])
             xdb = db.addTable('%s.dbxref.tdt' % self.basefile(),name='dbxref',mainkeys=['tag'])
             xdb.dropField('gene')   # Pull from genbank instead
             #pdb = db.addTable('%s.cysweight.tdt' % self.basefile(),name='cysweight',mainkeys=['AccNum'])
             pdb = db.addTable('%s.protein.tdt' % self.basefile(),name='prodigis',mainkeys=['AccNum'])
             pdb.addField('NRPep5','NRPep',0); pdb.addField('NRPep7','NRPep5',0)
             # Sum the numerically named fields '5'..'50' (where present) into NRPep5; those from '7' up also into NRPep7
             for x in range(5,51):
                 xfield = '%d' % x
                 if xfield not in pdb.fields(): continue
                 for entry in pdb.entries():
                     entry['NRPep5'] += int(entry[xfield])
                     if x >= 7: entry['NRPep7'] += int(entry[xfield])
             # Iterate over a copy of the field list because fields are dropped inside the loop
             for field in pdb.fields()[0:]:
                 if field not in ['AccNum','File','ProtMWt','PepCount','LenExp','Len3','Len5','Len7Exp','Len37','NRPep','NRPep5','NRPep7','Cys0']: pdb.dropField(field)
             #pdb.renameField('AccNum','uniprot')
             #pdb.newKey(['uniprot'])
             # Re-key the protein table on 'tag' so it can be joined with the other 'tag'-keyed tables
             pdb.renameField('AccNum','tag')
             pdb.newKey(['tag'])
             mdb = db.addTable('%s.PNASmaintable.tdt' % self.basefile(),name='main',mainkeys=['tag'])
             tdb = db.addTable('%s.tmhmm.tdt' % self.basefile(),name='TMHMM',mainkeys=['acc_num'])
             ## ~ [3b] ~ Load and process features table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             gdb = db.addTable('%s.Feature.tdt' % self.basefile(),name='feature',mainkeys=['locus','feature','position'])
             # NOTE(review): inverse=True presumably keeps only the 'CDS' features - confirm against rje_db
             gdb.dropEntriesDirect('feature',['CDS'],inverse=True)
             gdb.list['Fields'] += ['tag','start','end','gene','product']
             for entry in gdb.entries():
                 pos = rje.matchExp('(\d+)\.\.(\d+)',entry['position'])
                 # Positions starting with 'comp' have start and end swapped
                 if entry['position'][:4] == 'comp': entry['start'] = pos[1]; entry['end'] = pos[0]
                 else: entry['start'] = pos[0]; entry['end'] = pos[1]
                 # Pull locus_tag, gene and product out of the details text; fall back to '-' or '' if absent
                 try: entry['tag'] = rje.matchExp('locus_tag="(\S+)"',entry['details'])[0]
                 except: entry['tag'] = '-'
                 try: entry['gene'] = rje.matchExp('gene="(\S+)"',entry['details'])[0]
                 except: entry['gene'] = ''
                 try: entry['product'] = string.split(string.split(entry['details'],'/product="')[1],'"')[0]
                 except: entry['product'] = ''
             # Entries whose locus tag could not be parsed (tag '-') are dropped before re-keying on 'tag'
             gdb.dropEntriesDirect('tag',['-'])
             gdb.newKey(['tag'])
             for field in ['locus','feature','position','details']: gdb.dropField(field)
             ## ~ [3c] ~ Codon Bias Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             cfile = '%s.CDS.Bias.tdt' % self.basefile()
             # Run the codon analysis only if its bias table is missing or a rebuild is forced
             if not rje.exists(cfile) or self.getBool('Force'):
                 rje_codons.Codons(self.log,self.cmd_list+['seqin=%s.CDS.fas' % self.basefile(),'backups=F']).run()
             bdb = db.addTable(cfile,name='Bias',mainkeys=['Seq'])
             bdb.renameField('Len','AALen')
             ndb = db.addTable('%s.CDS.NT.tdt' % self.basefile(),name='NT',mainkeys=['Seq'])
             ndb.renameField('Len','NTLen')
             # Rename every field containing 'U' to its 'T' equivalent
             for field in ndb.fields():
                 if field != string.replace(field,'U','T'): ndb.renameField(field,string.replace(field,'U','T'))
             ## ~ [3d] ~ Join tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             # Expression and proteinkey tables are joined on 'key' first; the result joins the rest via 'tag'
             temp = db.joinTables(name='temp',join=[(edb,'key'),(kdb,'key')],newkey=['key'],cleanup=True,keeptable=True)
             #pfields = pdb.fields()[0:]
             #pfields.remove('uniprot')
             #temp2 = db.joinTables(name='temp2',join=[(xdb,'uniprot'),(pdb,'uniprot',pfields)],newkey=['tag'],cleanup=True,keeptable=True)
             #data = db.joinTables(name='data',join=[(temp2,'tag'),(fdb,'tag'),(gdb,'tag'),(bdb,'Seq'),(ndb,'Seq'),(temp,'tag'),(mdb,'tag')],newkey=['tag'],cleanup=True,keeptable=True)
             data = db.joinTables(name='data',join=[(pdb,'tag'),(xdb,'tag'),(fdb,'tag'),(tdb,'acc_num'),(gdb,'tag'),(bdb,'Seq'),(ndb,'Seq'),(temp,'tag'),(mdb,'tag')],newkey=['tag'],cleanup=True,keeptable=True)
             data.dropField('Seq')
             ## ~ [3e] ~ Fill out data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             data.fillBlanks(blank='0.0',fields=['eb','rb'],fillempty=True)
             #for entry in data.entries():
             #    if entry['tag'] not in self.dict['SeqList']['CDS'].dict['SeqDict']: entry['function'] = 'Non-CDS'
             data.fillBlanks(blank='Unassigned',fields=['function'],fillempty=True)
             data.fillBlanks()
             data.fillBlanks(blank='no mapping',fields=['description'],fillempty=True)
             data.saveToFile(dfile)
             # Temporarily swap in a reduced field list to save the cut-down table, then restore the full list
             allfields = data.list['Fields'][0:]
             data.list['Fields'] = ["tag","File","PepCount","LenExp","Len3","Len5","Len7Exp","Len37","NRPep",'NRPep5','NRPep7',"Cys0",
                                    "pi","mass","function","new_function","tm","start","end","AALen","Bias",
                                    "WtBias","AbsBias",'NTLen','C','A','G','T','C|3','A|3','G|3','T|3',
                                    'eb_1.1','eb_1.2','eb_2.1','eb_2.2','rb_1.1','rb_1.2','rb_2.1','rb_2.2','eb','rb']
             data.saveToFile('%s.cutdata.tdt' % self.basefile())
             data.list['Fields'] = allfields
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Пример #46
0
 def seqSubset2(self):    ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''
     Extracts sequence subset from MOUSE cDNA and Peptide libraries.

     [1] Loads the Ingolia -> MGI -> EnsEMBL gene map from '<basefile>.map.tdt', or builds it from the MGI
         mapping files and writes the 'Ingolia.cdna.all.fa' / 'Ingolia.pep.all.fa' subsets of the mouse libraries.
     [2] Adds an 'ENST' field to the map (comma-separated transcript names per EnsEMBL gene) if it lacks one.
     [3] For each entry of the 'starts' table, finds the transcript whose sequence around the initiation
         codon position matches the recorded context and yields the longest peptide. Transcript and peptide
         are written to 'IngExact.cdna.all.fa' / 'IngExact.pep.all.fa', and the updated table is saved to
         '<basefile>.mapped_exact.tdt'.
     Nothing is returned; errors are reported through errorLog().
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         mapfile = '%s.map.tdt' % self.baseFile()
         if os.path.exists(mapfile):
             mdb = self.db().addTable(mapfile,mainkeys=['Ingolia'],name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             # Must go through self.db(): there is no local `db` in this method
             self.db().addTable(xfile,mainkeys=['Gene'],name='xref')
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log,self.cmd_list)
             #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
             self.obj['Map'].loadData(['sourcedata=%s' % xfile,'aliases=%s' % afile])
             ing_genes = string.split(string.join(self.db('starts').index('Gene').keys()).upper())
             genemap = self.obj['Map']
             ing_map = {}
             for gene in ing_genes: ing_map[gene] = genemap.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes),rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:    # Iterate over a copy: unmapped genes are removed from ing_mgi
                 if gene not in xdb.data():
                     self.printLog('#MAP','Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene); ing_mgi.remove(gene)
             self.printLog('#BAD','Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
             BAD = open('ingolia.bad.txt','w')
             try: BAD.write(string.join(bad_genes))
             finally: BAD.close()
             ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi),'EnsEMBL',sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes),rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError
             self.deBug(ing_musg[:10])
             for stype in ['cdna','pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 seqout = 'Ingolia.%s.all.fa' % stype
                 # Regenerate when forced or when the subset *output* is missing. Testing for a missing input
                 # library would skip the extraction exactly when it could succeed.
                 if self.getBool('Force') or not os.path.exists(seqout):
                     seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'seqout=%s' % seqout,'autofilter=T','autoload=T','seqmode=file','gooddesc=%s' % string.join(ing_musg,',')]
                     rje_seqlist.SeqList(self.log,seqcmd)
             # Key must be an actual field of the table and match the key used when reloading the map file
             mdb = self.db().addEmptyTable('map',['Ingolia','Gene','EnsEMBL'],['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia':gene,'Gene':ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         seqfile = 'Ingolia.cdna.all.fa'
         seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autofilter=F','autoload=T','seqmode=file']
         iseq = rje_seqlist.SeqList(self.log,seqcmd)
         if 'ENST' not in mdb.fields():
             # Collect, per EnsEMBL gene, the comma-separated names of its transcripts in the cDNA subset
             mdb.addField('ENST',evalue='')
             while iseq.nextSeq():
                 iname = iseq.getSeq()[0]
                 musg = rje.matchExp(r'gene:(\S+)',iname)[0]
                 for entry in mdb.indexEntries('EnsEMBL',musg):
                     if entry['ENST']: entry['ENST'] += ',%s' % string.split(iname)[0]
                     else: entry['ENST'] = string.split(iname)[0]
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]':'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST'); sdb.addField('ENSP'); sdb.addField('ENSI')
         ENST = ENSP = None
         try:    # Guarantee both output handles are closed, whatever happens during mapping
             ENST = open('IngExact.cdna.all.fa','w')
             ENSP = open('IngExact.pep.all.fa','w')
             ex = 0.0; etot = sdb.entryNum(); sx = 0; fx = 0
             minpep = 20     # Minimum peptide length accepted for output
             for entry in sdb.entries():
                 self.progLog('\r#ING','Mapping Ignolia Harrington Starts: %.2f%%' % (ex/etot)); ex += 100.0
                 entry[icon] = entry[icon].upper()
                 gene = entry['Gene'].upper()
                 mentry = mdb.data(gene)
                 entry['ENST'] = entry['ENSI'] = ''
                 cdnaseq = peptseq = ''
                 if not mentry or not mentry['ENST']: fx += 1; continue
                 mtype = 'fail'
                 # Keep the transcript with matching start context that gives the longest peptide
                 for trans in string.split(mentry['ENST'],','):
                     (tname,tseq) = iseq.getDictSeq(trans,format='tuple')
                     self.deBug('%s vs %s' % (tseq[entry[icod]-3:][:7],entry[icon]))
                     if tseq[entry[icod]-3:][:7] == entry[icon]:
                         ipept = string.split(rje_sequence.dna2prot(tseq[entry[icod]:]),'*')[0]
                         self.deBug(ipept)
                         if len(ipept) > len(peptseq):
                             entry['ENST'] = trans
                             cdnaseq = tseq
                             peptseq = ipept
                             mtype = 'exact'
                 if not entry['ENST']:
                     self.printLog('\r#ING','Unable to find Harrington start for %s %s (%s)' % (gene,entry[icod],entry[icon]),screen=False)
                     fx += 1; continue
                 elif len(peptseq) < minpep:
                     self.printLog('\r#ING','Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene,entry[icod],entry[icon]),screen=False)
                     fx += 1; continue
                 # ex was already advanced above, so ex/100 is the 1-based number of this entry
                 ingid = rje.preZero(int(ex/100),etot)
                 entry['ENSI'] = 'ENSINGT%s' % ingid
                 entry['ENSP'] = 'ENSINGP%s' % ingid
                 ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],entry['Gene'],mentry['Gene'],cdnaseq))
                 ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],ingid,entry['Gene'],mentry['Gene'],peptseq))
                 sx += 1
             sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         finally:
             for handle in [ENST,ENSP]:
                 if handle is not None: handle.close()
         self.printLog('\r#ING','Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx),rje.iStr(fx)))
         return
     except: self.errorLog('%s.seqSubset2 error' % self)
Пример #47
0
    def taxaMap(self):  ### Maps species codes onto different taxonomic ranks.
        '''
        Maps species codes onto different taxonomic ranks.

        Reads the 'spcode' table (one entry per protein, with a '|'-separated 'spcode' field) and builds:
        - 'taxa' table: one entry per species code with its taxid, name and taxon at each rank in `ranks`;
        - 'taxamap' table: one entry per protein with the taxa of its species codes at each rank.
        Both tables are saved to file. Nothing is returned; errors are reported through errorLog().
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.db()
            tax = self.obj['Taxonomy']
            ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            specdb = self.db('spcode')
            #descdb = self.db('protdesc')
            ranks = ['genus', 'family', 'order', 'class', 'phylum']
            rankmap = {}  # SPCODE to Taxon dictionary
            # Output fields: protein, the rank fields, then every spcode table field after the first
            rankfields = ['protein'] + ranks + specdb.fields()[1:]
            #if descdb: rankfields.append('desc')
            if self.getStrLC('ProtDesc'):
                rankfields.append('desc')
                px = 0
                # Proteins with a description but no spcode entry are added with spcode 'None'
                for prot in self.dict['ProtDesc']:
                    # Skip blank keys and what look like header labels
                    if prot.lower() in ['', 'protein', 'gene']: continue
                    pentry = {
                        'protein': prot,
                        'spcode': 'None',
                        'boot': self.getNum('NoneBoot')
                    }
                    pkey = specdb.makeKey(pentry)
                    if pkey not in specdb.dataKeys():
                        specdb.addEntry(pentry)
                        px += 1
                self.printLog(
                    '#PROT', 'Added %s proteins from %s without trees.' %
                    (rje.iStr(px), self.getStr('ProtDesc')))
            rankdb = db.addEmptyTable('taxamap', rankfields, ['protein'])
            # Seed each rank's map so the three special codes map to themselves
            for rank in ranks:
                rankmap[rank] = {
                    'None': 'None',
                    'Unmapped': 'Unmapped',
                    'Uncertain': 'Uncertain'
                }
            taxdb = db.addEmptyTable('taxa',
                                     ['spcode', 'taxid', 'name'] + ranks,
                                     ['spcode'])

            sx = 0.0
            stot = specdb.entryNum()
            for entry in specdb.entries():
                self.progLog('\r#SPEC',
                             'Processing species: %.2f%%' % (sx / stot))
                sx += 100.0
                #if descdb:
                #try: entry['desc'] = descdb.data(descdb.makeKey(entry))['description']
                # Description defaults to '' when there is no ProtDesc dictionary or no entry for this protein
                try:
                    entry['desc'] = self.dict['ProtDesc'][entry['protein']]
                except:
                    entry['desc'] = ''
                for spcode in string.split(entry['spcode'], '|'):
                    # Each species code is only mapped once; the special codes are pre-seeded above
                    if spcode in rankmap['genus']: continue
                    tentry = {'spcode': spcode}
                    try:
                        taxid = tax.mapToTaxID(spcode,
                                               nodeonly=True,
                                               warn=False)[0]
                        rank = tax.dict['Rank'][taxid]
                        tentry['taxid'] = taxid
                        tentry['name'] = tax.getSpecies(taxid)
                    except:
                        self.warnLog(
                            'Unable to map species code "%s" to TaxID -> "Unmapped"'
                            % spcode)
                        # Placeholder values: tentry gets no 'taxid' or 'name' in this case
                        taxid = 'Unmapped'
                        rank = 'genus'
                    # Loop through different ranks
                    # taxid/rank carry over between iterations, so the walk up the tree continues where it left off
                    for ri in range(len(ranks)):
                        nextrank = ranks[ri]
                        # Climb parents until the node's rank is nextrank or a later entry of ranks, or no parent is left
                        while rank not in ranks[ri:] and taxid in tax.dict[
                                'Parent']:
                            taxid = tax.dict['Parent'][taxid]
                            rank = tax.dict['Rank'][taxid]
                            #self.debug('%s: %s' % (tax.dict['Rank'][taxid],tax.getSpecies(taxid)))
                        if taxid in tax.dict['Parent']:
                            taxon = tax.getSpecies(taxid)
                        else:
                            taxon = 'Unmapped'
                        # No node at exactly this rank: flag as Uncertain (Monophyly mode) or tag the taxon
                        # with the first three letters of the rank being filled
                        if rank != nextrank:
                            if self.getBool('Monophyly'): taxon = 'Uncertain'
                            else: taxon = '%s %s.' % (taxon, nextrank[:3])
                        rankmap[nextrank][spcode] = taxon
                        tentry[nextrank] = taxon
                    taxdb.addEntry(tentry)
                # Combine the taxa of all of this protein's species codes at each rank
                rentry = {}
                for nextrank in ranks:
                    taxa = []
                    unmapped = ''
                    for spcode in string.split(entry['spcode'], '|'):
                        ranktax = rankmap[nextrank][spcode]
                        if 'unmapped' in ranktax.lower(
                        ) and ranktax not in taxa:
                            if unmapped:
                                self.warnLog('Two Unmapped %s taxa: %s & %s' %
                                             (nextrank, unmapped, ranktax))
                            unmapped = ranktax  #i# Should only be one
                        if ranktax not in taxa: taxa.append(ranktax)
                    # 'None' and the (last seen) unmapped taxon are only kept if they are the sole taxon
                    if len(taxa) > 1 and 'None' in taxa:
                        self.warnLog('None in: %s' %
                                     string.join(rje.sortUnique(taxa), '|'))
                        taxa.remove('None')
                    if len(taxa) > 1 and unmapped: taxa.remove(unmapped)
                    # Several taxa left: 'Uncertain' in Monophyly mode, otherwise a sorted '|'-joined list
                    if len(taxa) > 1 and self.getBool('Monophyly'):
                        rentry[nextrank] = 'Uncertain'
                    else:
                        rentry[nextrank] = string.join(rje.sortUnique(taxa),
                                                       '|')
                # NOTE(review): assumes rje.combineDict() merges the spcode entry's fields into rentry - confirm
                rankdb.addEntry(rje.combineDict(rentry, entry))
            self.printLog(
                '\r#SPEC',
                '%s proteins with species codes processed.' % rje.iStr(stot))
            rankdb.saveToFile()
            taxdb.saveToFile()
        except:
            self.errorLog('%s.taxaMap error' % self.prog())
Пример #48
0
 def _positiveAndNegativePeptides(self): ### Populates PosPep and NegPep Lists
     '''
     Populates PosPep and NegPep Lists.
     Digests the Positives sequences with the PepCut protease and classifies each unique fragment as positive (present
     in the loaded Peptides list) or negative. Fragments seen more than once in a digest of self.obj['SeqList'] are
     flagged as redundant (NR=N). Saves the Peptides table plus *.positives.fas and *.negatives.fas files.
     << pdb:Peptides table on success; False if the Peptides or Positives file is missing; None if an error occurred.
     '''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pfile = '%s.peptides.tdt' % self.basefile()
         #i# Reloading a previously saved peptides table is currently disabled: the table is always regenerated.
         if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
         self.list['Peptides'] = peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
         seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
         pdb = self.db().addEmptyTable('Peptides',['Peptide','NR','Pos','Len','MWt','C','HPW','DENQ','M','Hyd'],['Peptide'])
         protease = self.getStr('PepCut')
         def digest(sequence):   ### Marks each protease cut site with ':' and returns the resulting fragments
             for cut in proteases[protease]:
                 sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
             return string.split(sequence,':')
         ### ~ [1] ~ Digest Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['PosPep'] = poslist = []; self.list['NegPep'] = neglist = []
         posseen = set()     # Mirrors poslist for constant-time membership tests
         sx = 0.0; stot = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
             frag = [pep for pep in digest(seq.getSequence()) if pep]    # Empty fragments are discarded
             if not self.getBool('NTerm'): frag = frag[1:]               # Skip the first (N-terminal) fragment
             for pep in frag:
                 if pep in posseen: continue
                 posseen.add(pep); poslist.append(pep)
         self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
         ## ~ [1b] ~ Peptide Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.list['Redundant'] = redundant = []     # Keeps one item per repeat occurrence, as before
         allpep = set()                              # Fragments seen so far (only used for membership tests)
         sx = 0.0; stot = self.obj['SeqList'].seqNum()
         for seq in self.obj['SeqList'].seqs():
             self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (protease,sx/stot)); sx += 100.0
             for frag in digest(seq.getSequence()):
                 if frag in allpep: redundant.append(frag)
                 else: allpep.add(frag)
         self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (protease,rje.iStr(stot)))
         ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         pepset = set(peplist); redset = set(redundant)
         kept = []       # Fragments that remain positive, in their original order
         px = 0.0; ptot = len(poslist)
         for pep in poslist:
             self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
             entry = {'Peptide':pep,'MWt':rje_sequence.MWt(pep),'Hyd':rje_sequence.eisenbergHydropathy(pep,returnlist=False),
                      'Len':len(pep),'NR':'Y','Pos':'Y'}
             if pep in pepset: kept.append(pep)
             else: neglist.append(pep); entry['Pos'] = 'N'
             if pep in redset: entry['NR'] = 'N'
             for aacomb in ['C','HPW','DENQ','M']:   # Combined counts of the amino acids in each group
                 entry[aacomb] = sum([pep.count(a) for a in aacomb])
             pdb.addEntry(entry)
         poslist[:] = kept   # Updated in place so that self.list['PosPep'] stays the same list object
         self.printLog('\r#PEP','Processing positive peptides (%s) complete: %s Pos; %s Neg.' % (protease,rje.iLen(poslist),rje.iLen(neglist)))
         ### ~ [2] ~ Save Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pdb.saveToFile(pfile)
         for (ptype,peps) in [('positives',poslist),('negatives',neglist)]:
             fasfile = '%s.%s.fas' % (self.basefile(),ptype)
             FAS = open(fasfile,'w')
             try: FAS.writelines(['>%s\n%s\n' % (pep,pep) for pep in peps])
             finally: FAS.close()    # Close the handle even if writing fails
             self.printLog('#FAS','%s peptides output to %s' % (rje.iLen(peps),fasfile))
         return pdb
     except: self.errorLog('Problem during %s._positiveAndNegativePeptides().' % self); return None  # Setup failed
Пример #49
0
 def mapTaxa(self,taxin,taxout=['spcode'],nodeonly=False,rankonly=False,savetaxout=True):    ### Takes a list of Taxa and returns mapped Taxa data
     '''
     Takes a list of Taxa and returns mapped Taxa data.
     >> taxin:str or list of taxon identifiers to map from.
     >> taxout:str or list of taxa output formats
     >> nodeonly:bool = whether to limit TaxID mapping to the precise matching nodes (else include children)
     >> rankonly:bool = whether to limit TaxID to those matching self.list['RankTypes'] taxon types.
     >> savetaxout:bool [True] = Whether to save the TaxOut list to a text file
     << taxoutlist:list of mapped taxa if taxout is a string, OR
     << taxoutdict:dict of mapped taxa if taxout is a list
     NOTE(review): the mapping itself reads self.list['TaxIn'] and self.list['TaxOut']; taxin is not used and taxout
     only selects the return type (and the empty-input shortcut). Confirm against callers before changing this.
     If no recognised output format is processed, a string taxout now returns an empty list rather than failing.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tlist = True
         #!# NOTE(review): sort() doubles as the "is this a list?" probe and sorts the caller's list in place. It is
         #!# kept because, when the caller passes self.list['TaxOut'], it also fixes the processing order used below.
         try: taxout.sort()
         except Exception: tlist = False
         if not taxout:
             if tlist: return {}
             return []
         ### ~ [2] ~ Map to TaxID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxid = self.mapToTaxID(self.list['TaxIn'],nodeonly,rankonly)
         if self.list['RestrictID']:
             tx = len(taxid)
             taxid = rje.listIntersect(taxid,self.list['RestrictID'])
             self.printLog('#TAXID','%s of %s TaxID in %s Restricted IDs.' % (rje.iLen(taxid),rje.iStr(tx),rje.iLen(self.list['RestrictID'])))
         ### ~ [3] ~ Map TaxID and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxdict = {}; taxoutdict = {}
         taxoutlist = []     # Returned for a string taxout; stays empty if no recognised format is processed
         for outfmt in self.list['TaxOut']:
             outfmt = outfmt.lower()
             if outfmt == 'taxid':
                 taxoutlist = taxid
             elif outfmt in ['spcode','name','common']:
                 if not taxdict: taxdict = self.taxDict(taxid)   # Only built once, when first needed
                 taxoutlist = []
                 for t in taxid:
                     try: taxoutlist.append(taxdict[t][outfmt])
                     except Exception: self.warnLog('No "%s" data for TaxID %s' % (outfmt, t),'Missing_%s' % outfmt,suppress=True)
                 taxoutlist.sort()
             else: self.errorLog('TaxOut format "%s" not recognised' % outfmt,printerror=False); continue
             taxoutdict[outfmt] = taxoutlist
             if savetaxout:
                 if not taxoutlist: self.printLog('#OUT','No %s IDs to output' % outfmt); continue
                 tfile = '%s.%s.txt' % (self.baseFile(),outfmt)
                 rje.backup(self,tfile)
                 TFILE = open(tfile,'w')
                 try: TFILE.write(string.join(taxoutlist,'\n'))
                 finally: TFILE.close()  # Close the handle even if the write fails
                 self.printLog('#OUT','%s %s IDs output to %s.' % (rje.iLen(taxoutlist), outfmt, tfile))
         if tlist: return taxoutdict
         return taxoutlist   # The list for the last recognised format processed
     except: self.errorLog('Problem during %s mapTaxa.' % self); raise
Пример #50
0
 def iTRAQSamples(self): ### Uses self.dict['Samples'] and self.db('itraq') to summarise hit data
     '''
     Uses self.dict['Samples'] and self.db('itraq') to summarise hit data.
     Makes two copies of the 'itraq' table ('itraq_summary' using its 'ratio' field and 'itraq_geomean' using its
     'geomean' field) and applies the same processing to each: entries whose isotope tags are not both keys of
     self.dict['Samples'] are dropped, the table is reshaped wide on 'itraq', and per sample pair the geometric mean,
     minimum, maximum and direction ('UP'/'DOWN') of the ratios are calculated before the table is saved.
     Any exception is reported through self.errorLog and not re-raised; nothing is returned.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db(); idb = self.db('itraq')
         #i# Two working copies of the 'itraq' table: one per value type to be summarised
         mdb = db.copyTable(idb,'itraq_summary')
         gdb = db.copyTable(idb,'itraq_geomean')
         ### ~ [1] Reformat Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         #i# After this line both copies hold their value of interest in a field named 'ratio'
         mdb.dropField('geomean'); gdb.dropField('ratio'); gdb.renameField('geomean','ratio')
         for sdb in [mdb,gdb]:
             sdb.dropField('summary');
             #i# '---' and 'NN' ratios are removed before numeric conversion (presumably missing-data markers - TODO confirm)
             sdb.dropEntriesDirect('ratio','---')
             sdb.dropEntriesDirect('ratio','NN')
             sdb.dataFormat({'ratio':'num','n':'int'})
             ## ~ [1a] Drop tags with Samples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             (ex,etot) = (0.0,sdb.entryNum())
             for entry in sdb.entries():
                 self.progLog('\r#ITRAQ','Drop isotags without Sample info: %.2f%%' % (ex/etot)); ex += 100.0
                 #i# entry['itraq'] is split on '/' into two tags; both must be keys of self.dict['Samples'] to keep the entry
                 tags = string.split(entry['itraq'],'/')
                 if tags[0] not in self.dict['Samples'] or tags[1] not in self.dict['Samples']: sdb.dropEntry(entry)
             self.printLog('\r#ITRAQ','Dropped all isotags without Sample info: %s of %s entries remain' % (rje.iStr(sdb.entryNum()),rje.iStr(etot)))
             ## ~ [1b] Reshape, rename, invert and remove redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             #i# NOTE(review): the code below expects reshapeWide to produce fields named 'ratio|tag1/tag2' and
             #i# 'n|tag1/tag2' - confirm against the table implementation
             sdb.reshapeWide('itraq',['ratio','n'])
             samples = rje.sortUnique(self.dict['Samples'].values())
             ratios = []
             self.printLog('#SAMP',string.join(samples,', '))
             #i# One summary field (plus _Min, _Max and _Dirn) per sample pair in sorted order, including each sample
             #i# versus itself; every entry starts with an empty list that collects the individual ratios
             for s1 in samples:
                 for s2 in samples[samples.index(s1):]:
                     newfield = '%s/%s' % (s1,s2)
                     sdb.addField(newfield)
                     sdb.addField('%s_Min' % newfield)
                     sdb.addField('%s_Max' % newfield)
                     sdb.addField('%s_Dirn' % newfield)
                     ratios.append(newfield)
                     for entry in sdb.entries(): entry[newfield] = []
             for field in sdb.fields():
                 if '|' in field:
                     (score,tags) = string.split(field,'|')
                     tag = string.split(tags,'/')
                     #i# Tags are compared as integers so that each pair is stored with the lower tag first
                     if int(tag[0]) > int(tag[1]):   ### Invert
                         newfield = '%s|%s/%s' % (score,tag[1],tag[0])
                         #i# NOTE(review): if the lower-first field already exists it is that existing field (newfield),
                         #i# not the current one, that gets dropped before skipping - confirm this is intended
                         if newfield in sdb.fields(): sdb.dropField(newfield); continue
                         sdb.renameField(field,newfield)
                         if score == 'ratio':
                             #i# Swapping the tag order means the ratio itself must be inverted (empty/zero values are left alone)
                             for entry in sdb.entries():
                                 if entry[newfield]: entry[newfield] = 1.0 / entry[newfield]
                         tag = (tag[1],tag[0])
                         field = newfield
                     s1 = self.dict['Samples'][tag[0]]
                     s2 = self.dict['Samples'][tag[1]]
                     #i# Field is renamed to carry the sample name in front of each tag, e.g. 'score|<s1><tag1>/<s2><tag2>'
                     newname = '%s|%s%s/%s%s' % (score,s1,tag[0],s2,tag[1])
                     sdb.renameField(field,newname)
                     #i# Only ratio fields feed the per-sample-pair lists; 'n' fields are just renamed
                     if score == 'n': continue
                     newfield = '%s/%s' % (s1,s2)
                     invfield = '%s/%s' % (s2,s1)
                     #i# Add to the s1/s2 list if that field exists, otherwise add the inverse ratio to the s2/s1 list
                     for entry in sdb.entries():
                         if entry[newname] and newfield in sdb.fields(): entry[newfield].append(entry[newname])
                         elif entry[newname]: entry[invfield].append(1.0/entry[newname])
             ## ~ [1c] Calculate Geometric mean ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             (ex,etot) = (0.0,sdb.entryNum())
             for entry in sdb.entries():
                 self.progLog('\r#GEO','Calculating Geometric means: %.2f%%' % (ex/etot)); ex += 100.0
                 for ratio in ratios:
                     if entry[ratio]:
                         #i# Min/Max are taken from the collected list before it is replaced by its geometric mean
                         entry['%s_Min' % ratio] = min(entry[ratio])
                         entry['%s_Max' % ratio] = max(entry[ratio])
                         #i# NOTE(review): if geoMean fails, entry[ratio] is still the list when compared below
                         try: entry[ratio] = rje.geoMean(entry[ratio])
                         except: self.deBug(entry)
                         #i# Direction is only set when the mean and the extreme value agree; otherwise _Dirn is not changed here
                         if entry[ratio] > 1 and entry['%s_Min' % ratio] > 1: entry['%s_Dirn' % ratio] = 'UP'
                         elif entry[ratio] < 1 and entry['%s_Max' % ratio] < 1: entry['%s_Dirn' % ratio] = 'DOWN'
                     #i# No ratios collected for this sample pair: blank all four summary fields
                     else: entry['%s_Dirn' % ratio] = entry['%s_Min' % ratio] = entry['%s_Max' % ratio] = entry[ratio] = ''
             self.printLog('\r#GEO','Geometric mean calculations complete')
             sdb.saveToFile()
     except: self.errorLog('iTRAQSamples error')
Пример #51
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method.
     Creates the Database object, sets a default basefile from the pileup names and QCut if none is set, parses the WT
     pileup if needed, loads the per-locus reference sequence and major alleles from *.WT.tdt into self.dict['RefSeq']
     and self.dict['WTMajor'], and parses the Mutant pileup if needed.
     << True if setup was successful; None if *.fdr.tdt already exists and force is off; False if an error occurred.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log,self.cmd_list+['tuplekeys=T'])
         if self.baseFile().lower() in ['','none']: self.baseFile('%s.vs.%s.Q%d' % (rje.baseFile(self.getStr('MutPileup'),True),rje.baseFile(self.getStr('WTPileup'),True),self.getInt('QCut')))
         if not self.force() and os.path.exists('%s.fdr.tdt' % self.baseFile()): return
         ### ~ [2] Look for/process WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.force() or not os.path.exists('%s.WT.tdt' % self.baseFile()): self.parsePileup('WT',self.getStr('WTPileup'))
         ### ~ [3] Generate Reference sequences and Major Alleles (by locus) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         #i# Reference sequence is collected as chunks per locus (with a running length) and joined once at the end:
         #i# repeated string concatenation on a dictionary value is quadratic for long references.
         refparts = {}; reflen = {}; rx = 0
         majors = {}
         locus = None
         wx = 0
         WTDATA = open('%s.WT.tdt' % self.baseFile(),'r')
         try:
             for line in WTDATA:
                 self.progLog('\r#WT','Reading WT data: Reference seq length = %s nt' % (rje.iStr(rx)),rand=0.01)
                 data = rje.readDelimit(line); wx += 1
                 if data[0] == 'Locus': continue     # Header line
                 if data[0] != locus:                # Any change of locus (re)starts that locus from scratch
                     locus = data[0]; refparts[locus] = []; reflen[locus] = 0; majors[locus] = []
                 pos = int(data[1])
                 gap = (pos - 1) - reflen[locus]     # Positions missing from the data are padded with '?'
                 if gap > 0: refparts[locus].append('?' * gap); reflen[locus] += gap; rx += gap
                 gap = (pos - 1) - len(majors[locus])    # ...and with '-' in the major allele list
                 if gap > 0: majors[locus].extend(['-'] * gap)
                 refparts[locus].append(data[2]); reflen[locus] += len(data[2]); rx += len(data[2])
                 majors[locus].append(data[5])
         finally: WTDATA.close()     # Close the handle even if a line fails to parse
         refseq = {}
         for rlocus in refparts: refseq[rlocus] = ''.join(refparts[rlocus])
         self.printLog('\r#WT','%s lines read from WT data: Reference seq length = %s nt' % (rje.iStr(wx),rje.iStr(rx)))
         for locus in rje.sortKeys(majors):
             if len(majors[locus]) != len(refseq[locus]): self.errorLog('%s WTMajor versus RefSeq length mismatch!' % locus,printerror=False); raise ValueError
         self.dict['WTMajor'] = majors
         self.dict['RefSeq'] = refseq
         ### ~ [4] Look for/process Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.force() or not os.path.exists('%s.Mut.tdt' % self.baseFile()): self.parsePileup('Mut',self.getStr('MutPileup'),True)
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed