예제 #1
0
 def mapTaxa(self,taxin,taxout=['spcode'],nodeonly=False,rankonly=False,savetaxout=True):    ### Takes a list of Taxa and returns mapped Taxa data
     '''
     Takes a list of Taxa and returns mapped Taxa data.
     >> taxin:str or list of taxon identifiers to map from (passed to self.mapToTaxID()).
     >> taxout:str or list of taxa output formats: 'taxid', 'spcode', 'name' or 'common' (case-insensitive).
     >> nodeonly:bool = whether to limit TaxID mapping to the precise matching nodes (else include children)
     >> rankonly:bool = whether to limit TaxID to those matching self.list['RankTypes'] taxon types.
     >> savetaxout:bool [True] = Whether to save each non-empty TaxOut list to a BASEFILE.FORMAT.txt text file
     << taxoutlist:list of mapped taxa if taxout is a string, OR
     << taxoutdict:dict of {format:list of mapped taxa} if taxout is a list
     An empty taxout returns an empty list/dict without any mapping. Unrecognised formats are logged and skipped.
     Unexpected errors are logged and then re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Lists have a sort() method and strings do not. The attribute is only tested for, never called, so
         # neither the caller's list nor the shared default list is reordered in place.
         tlist = hasattr(taxout,'sort')
         if not taxout:
             if tlist: return {}
             return []
         if tlist: outformats = sorted(taxout)   # Sorted copy: formats are processed in alphabetical order
         else: outformats = [taxout]             # Single format given as a string
         ### ~ [2] ~ Map to TaxID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxid = self.mapToTaxID(taxin,nodeonly,rankonly)
         if self.list['RestrictID']:
             tx = len(taxid)
             taxid = rje.listIntersect(taxid,self.list['RestrictID'])
             self.printLog('#TAXID','%s of %s TaxID in %s Restricted IDs.' % (rje.iLen(taxid),rje.iStr(tx),rje.iLen(self.list['RestrictID'])))
         ### ~ [3] ~ Map TaxID and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxdict = {}; taxoutdict = {}
         taxoutlist = []     # Defined up front so that string mode returns [] if the format is not recognised
         for outformat in outformats:
             outformat = outformat.lower()
             if outformat == 'taxid':
                 taxoutlist = taxid
             elif outformat in ['spcode','name','common']:
                 if not taxdict: taxdict = self.taxDict(taxid)   # Only generated once, on first need
                 taxoutlist = []
                 for t in taxid:
                     try: taxoutlist.append(taxdict[t][outformat])
                     except: self.warnLog('No "%s" data for TaxID %s' % (outformat, t),'Missing_%s' % outformat,suppress=True)
                 taxoutlist.sort()
             else: self.errorLog('TaxOut format "%s" not recognised' % outformat,printerror=False); continue
             taxoutdict[outformat] = taxoutlist
             if savetaxout:
                 if not taxoutlist: self.printLog('#OUT','No %s IDs to output' % outformat); continue
                 tfile = '%s.%s.txt' % (self.baseFile(),outformat)
                 rje.backup(self,tfile)
                 with open(tfile,'w') as tout: tout.write('\n'.join(taxoutlist))   # Handle closed on exit
                 self.printLog('#OUT','%s %s IDs output to %s.' % (rje.iLen(taxoutlist), outformat, tfile))
         if tlist: return taxoutdict
         return taxoutlist
     except: self.errorLog('Problem during %s mapTaxa.' % self); raise
예제 #2
0
 def batchSummarise(self):   ### Batch run seqlist summarise on batchrun=LIST files and output table of results
     '''
     Batch run seqlist summarise on batchrun=LIST files and output table of results.
     Each file in self.list['BatchRun'] is summarised with rje_seqlist.SeqList.summarise() and the returned data
     is added as one entry of the 'summarise' table, which is saved at the end.
     << True if the table was saved; False if any error (including an empty BatchRun list) was caught and logged.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batchfiles = self.list['BatchRun']
         if not batchfiles: raise ValueError('Need to provide batchrun=LIST files for summarise mode.')
         db = rje_db.Database(self.log,self.cmd_list)
         self.printLog('#BASE',db.baseFile())
         # Unless self.force() is set, try addTable() first (expect=False: presumably returns nothing if absent)
         if self.force(): sumdb = None
         else: sumdb = db.addTable(mainkeys=['File'],name='summarise',expect=False)
         if not sumdb: sumdb = db.addEmptyTable('summarise',['File'],['File'])
         # Preferred field order: these are added (when present) before any other summarise() keys
         stdfields = 'SeqNum, TotLength, MinLength, MaxLength, MeanLength, MedLength, N50Length, L50Count, GapLength, GapPC, GCPC'.split(', ')
         ### ~ [2] Run Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#BATCH','Batch summarising %s input files' % rje.iLen(batchfiles))
         for seqfile in batchfiles:
             seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autoload=T','summarise=F']
             seqdata = rje_seqlist.SeqList(self.log,seqcmd).summarise()
             if not seqdata:
                 self.errorLog('Summarise failed for %s' % seqfile,printerror=False)
                 continue
             ## ~ [2a] Reformat numbers for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if 'GC' in seqdata:
                 seqdata.pop('GC')       # Raw GC value is dropped; only the GCPC value is kept
                 seqdata['GCPC'] = '%.2f' % seqdata['GCPC']
             if 'GapLength' in seqdata:
                 gappc = 100.0*seqdata['GapLength']/seqdata['TotLength']
                 seqdata['GapPC'] = '%.2f' % gappc
             seqdata['MeanLength'] = '%.1f' % seqdata['MeanLength']
             ## ~ [2b] Update table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for field in stdfields + list(seqdata.keys()):
                 if field in seqdata and field not in sumdb.fields(): sumdb.addField(field)
             sumdb.addEntry(seqdata)
         ### ~ [3] Output Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sumdb.saveToFile()
         return True
     except: self.errorLog('%s.batchSummarise error' % self); return False
예제 #3
0
 def classify(self):  ### Generate summary tables for each protein class
     '''
     Generate summary tables for each protein class.
     Each file in self.list['Classify'] defines one class, named after the file (rje.baseFile() with the path
     stripped). The first whitespace-delimited word of every non-blank line is read as a protein identifier.
     A copy of the 'taxamap' table is filtered with that list and passed to self.summaryScores().
     Files yielding no proteins, or no matching table entries, are skipped with a warning.
     Errors are logged with self.errorLog() and not re-raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         rankdb = self.db('taxamap')
         for cfile in self.list['Classify']:
             pclass = rje.baseFile(cfile, strip_path=True)
             clist = []
             with open(cfile, 'r') as classfile:  # Handle is closed even if reading fails
                 for fline in classfile:
                     words = rje.chomp(fline).split(None, 1)
                     # Blank/whitespace-only lines give no words: skip them rather than raise IndexError
                     if words: clist.append(words[0])
             self.printLog(
                 '#CLASS', '%s "%s" class proteins read from %s' %
                 (rje.iLen(clist), pclass, cfile))
             if not clist:
                 self.warnLog('No proteins read from %s' % (cfile))
                 continue
             classdb = db.copyTable(rankdb, pclass)
             # inverse=True: presumably keeps only entries whose 'protein' value is in clist - confirm in rje_db
             classdb.dropEntriesDirect('protein', clist, inverse=True)
             if not classdb.entries():
                 self.warnLog('No "%s" proteins found in TaxaMap table' %
                              (pclass))
                 continue
             self.summaryScores(classdb, pclass, 'MinClass')
     except:
         self.errorLog('%s.classify() error' % self.prog())
예제 #4
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method.
     Creates self.obj['DB'] with 'ProDigIS' and 'Source' tables, loads every file in self.list['SeqFiles'] into
     self.obj['SeqList'] (recording file and molecular weight per AccNum in the 'Source' table) and, if
     self._peptideProbabilities() returns True, adds the expectation/count fields to the 'ProDigIS' table.
     << None on normal completion; False if an error was caught and logged.
     '''
     try:### ~ [1] Setup Database ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
         pdb = self.db().addEmptyTable('ProDigIS',['AccNum','Protease','PepCount'],['AccNum','Protease'])
         if self.getInt('MinPepLen') > 0: pdb.addField('MinPepLen')
         if self.getBool('NRPep'): pdb.addField('NRPep')
         source = self.getStr('Source')
         if rje.exists(source):
             fdb = self.db().addTable(source,mainkeys=['AccNum'],name='Source')
             for newfield in ['File','ProtMWt']: fdb.addField(newfield)
         else:
             fdb = self.db().addEmptyTable('Source',['AccNum','File','ProtMWt'],['AccNum'])
         # One field per peptide length (int field names), plus one per length*100.0 (float names) if PepMWt
         peplens = range(1,self.getInt('MaxPepLen')+1)
         for plen in peplens: pdb.addField(plen)
         if self.getBool('PepMWt'):
             for plen in peplens: pdb.addField(plen*100.0)
         ### ~ [2] Load Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autoload=F'])
         fullseq = []
         self.obj['SeqList'].seq = fullseq       # Same list object: extended in place below
         for seqfile in self.list['SeqFiles']:
             fbase = rje.baseFile(seqfile,True)
             seqcmd = ['autofilter=T','gnspacc=T','seqnr=F'] + self.cmd_list + ['seqin=%s' % seqfile,'autoload=T']
             seqlist = rje_seq.SeqList(self.log,seqcmd)
             fullseq.extend(seqlist.seqs())
             for seq in seqlist.seqs():
                 accnum = seq.getStr('AccNum')
                 try:    # Update existing Source entry; any failure falls through to adding a new entry
                     entry = fdb.data()[accnum]
                     if 'File' in entry and entry['File']:
                         self.errorLog('%s found in %s AND %s!' % (accnum,entry['File'],fbase),printerror=False)
                     entry['File'] = fbase
                     entry['ProtMWt'] = seq.MWt()
                 except:
                     entry = {'AccNum':accnum,'File':fbase,'ProtMWt':seq.MWt()}
                     fdb.addEntry(entry)
                 self.deBug(fdb.dict['Data'][accnum])
         self.printLog('#SEQ','%s sequences to analyse in total' % rje.iLen(fullseq))
         fdb.fillBlanks()
         ### ~ [3] Setup Peptide Probabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self._peptideProbabilities():
             # (field, second addField() argument) pairs in the order they must be added.
             # NOTE(review): the second argument is presumably the field to insert after - confirm in rje_db.
             newfields = [('LenExp','PepCount')]
             if self.getBool('PepMWt'):
                 newfields += [('MWtExp','LenExp'),('Len7Exp','MWtExp'),('Len37','Len7Exp'),
                               ('Len5','MWtExp'),('MWt5','Len5'),('Len3','MWtExp'),('MWt3','Len3')]
             else:
                 newfields += [('Len7Exp','LenExp'),('Len37','Len7Exp'),('Len5','LenExp'),('Len3','LenExp')]
             for (field,anchor) in newfields: pdb.addField(field,anchor)
         return
         ### ~ [4] Temp GABLAM Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # NOTE: everything below is unreachable because of the unconditional return above.
         gdb = self.db().addTable('Chlam_Pos.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GABLAM')
         ndb = self.db().addTable('Chlam_Neg.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GNeg')
         self.db().mergeTables(gdb,ndb,overwrite=True,matchfields=True)
         gdb.renameField('Qry','AccNum')
         tmp = self.db().joinTables(name='blast',join=[('Source','AccNum'),('GABLAM','AccNum')],newkey=['AccNum','File'],keeptable=False)
         tmp.saveToFile()
         tmp.compress(['File'],default='mean')
         tmp.dropFields(['AccNum'])
         tmp.info['Name'] = 'blastsum'
         tmp.saveToFile()
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
예제 #5
0
 def batchSummarise(self):  ### Batch run seqlist summarise on batchrun=LIST files and output table of results
     '''
     Batch run seqlist summarise on batchrun=LIST files and output table of results.
     Builds (or reuses, unless self.force()) a 'summarise' table keyed on 'File', adds one entry per
     self.list['BatchRun'] file from rje_seqlist.SeqList.summarise(), then saves the table.
     << True on success; False if an error was caught and logged (e.g. no batchrun files given).
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.list['BatchRun']:
             raise ValueError('Need to provide batchrun=LIST files for summarise mode.')
         db = rje_db.Database(self.log, self.cmd_list)
         self.printLog('#BASE', db.baseFile())
         sdb = None
         if not self.force():
             sdb = db.addTable(mainkeys=['File'], name='summarise', expect=False)
         if not sdb:
             sdb = db.addEmptyTable('summarise', ['File'], ['File'])
         # Standard fields, listed in the order they should appear in the table
         stdfields = 'SeqNum, TotLength, MinLength, MaxLength, MeanLength, MedLength, N50Length, L50Count, GapLength, GapPC, GCPC'.split(', ')
         ### ~ [2] Run Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         batchx = rje.iLen(self.list['BatchRun'])
         self.printLog('#BATCH', 'Batch summarising %s input files' % batchx)
         for infile in self.list['BatchRun']:
             summariser = rje_seqlist.SeqList(
                 self.log,
                 self.cmd_list + ['seqin=%s' % infile, 'autoload=T', 'summarise=F'])
             seqdata = summariser.summarise()
             if seqdata:
                 if 'GC' in seqdata:
                     del seqdata['GC']  # Only the GCPC value is output
                     seqdata['GCPC'] = '%.2f' % seqdata['GCPC']
                 if 'GapLength' in seqdata:
                     gapfrac = 100.0 * seqdata['GapLength'] / seqdata['TotLength']
                     seqdata['GapPC'] = '%.2f' % gapfrac
                 seqdata['MeanLength'] = '%.1f' % seqdata['MeanLength']
                 # Standard fields present in the data come first, then every key returned by summarise()
                 newfields = [field for field in stdfields if field in seqdata]
                 newfields += seqdata.keys()
                 for field in newfields:
                     if field not in sdb.fields():
                         sdb.addField(field)
                 sdb.addEntry(seqdata)
             else:
                 self.errorLog('Summarise failed for %s' % infile, printerror=False)
         ### ~ [3] Output Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb.saveToFile()
         return True
     except:
         self.errorLog('%s.batchSummarise error' % self)
         return False
예제 #6
0
 def run(self):  ### Main run method
     '''
     Main run method: calls self.setup() then self.forking() and reports on the fork list.
     Errors during setup/forking are logged and not re-raised.
     << True if self.list['Forked'] is empty afterwards; False (with a warning) if any entries remain.
     '''
     try:### ~ [1] ~ Setup and Forking ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         jobx = len(self.list['Forked'])     # Job count is taken before setup() and forking() are called
         self.setup()
         self.forking()
         self.printLog('#FORK','Forking of %s jobs completed.' % (rje.iStr(jobx)))
     except:  self.errorLog('Forker.run() Error')
     ### ~ [2] ~ Check for unprocessed jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     remaining = self.list['Forked']
     if not remaining: return True
     self.warnLog('%s fork jobs remain unforked.' % rje.iLen(remaining))
     return False
예제 #7
0
 def setup(self):    ### Main class setup method. Makes sumfile if necessary.
     '''
     Main class setup method. Makes sumfile if necessary.
     Unless an existing summary file is accepted by the user, each file in self.list['ResFiles'] is read with
     budapest.Budapest.readMascot(), its hit accessions are written to BASEFILE.MFILEBASE.protacc, and one row
     per peptide is written to the summary file (self.info['SumFile']). Accessions matching gi|NNN are renamed
     using self.dict['Acc2Seq'] where possible, else given a gi_UNK__NNN name.
     << Result of self.printLog() if existing results are reused; None after generating the summary file;
        False if an error was caught and logged.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
         if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
         # NOTE(review): basefile() (lower case) here vs baseFile() elsewhere - confirm both methods exist.
         if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
         self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
         if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
             if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                 return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
         mapgi = False       # Only used by the disabled gi mapping in [2a]
         giregex = r'gi\|(\d+)'      # Raw string: same pattern, without relying on unrecognised escapes
         ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for mfile in self.list['ResFiles']:
             bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
             bud.info['Name'] = mfile
             bud.readMascot()
             self.dict['Searches'][mfile] = bud.dict['Hits']
             protacclist = rje.sortKeys(bud.dict['Hits'])
             for protacc in protacclist:
                 if rje.matchExp(giregex,protacc): mapgi = True
             accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
             self.debug(accfile)
             with open(accfile,'w') as accout: accout.write('\n'.join(protacclist))    # Handle closed on exit
             self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
         ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #if mapgi:
         #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
         #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
         ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
         self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
         ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sumhead = 'search,prot_hit_num,prot_acc,prot_desc,pep_seq'.split(',')
         rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
         for mfile in rje.sortKeys(self.dict['Searches']):
             hits = self.dict['Searches'][mfile]
             for protacc in rje.sortKeys(hits):
                 protname = hits[protacc]['prot_acc']
                 protdesc = hits[protacc]['prot_desc']
                 gimatch = rje.matchExp(giregex,protacc)     # Matched once and reused
                 if gimatch:
                     gi = gimatch[0]
                     try:
                         protname = self.dict['Acc2Seq'][gi].shortName()
                         protdesc = self.dict['Acc2Seq'][gi].info['Description']
                     except: protname = 'gi_UNK__%s' % gi    # No usable sequence for this gi number
                 for pep in hits[protacc]['Peptides']:
                     data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                             'pep_seq':pep,'prot_hit_num':hits[protacc]['prot_hit_num']}
                     rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
예제 #8
0
 def run(self):  ### Main run method
     '''
     Main run method.
     Runs self.setup() followed by self.forking(), logging (not raising) any error, then checks whether
     self.list['Forked'] still holds entries.
     << True if no entries remain in self.list['Forked']; False (after a warning) otherwise.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         forkx = len(self.list['Forked'])  # Number of listed jobs before anything is run
         self.setup()
         ### ~ [2] ~ Forking ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.forking()
         donemsg = 'Forking of %s jobs completed.' % (rje.iStr(forkx))
         self.printLog('#FORK', donemsg)
     except:
         self.errorLog('Forker.run() Error')
     ### ~ [3] ~ Report ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     complete = not self.list['Forked']
     if not complete:
         self.warnLog('%s fork jobs remain unforked.' % rje.iLen(self.list['Forked']))
     return complete
예제 #9
0
 def batchRun(self,returnobj=False):     ### Execute batch mode runs
     '''
     Execute batch mode runs.
     Calls self.run() once per file in self.list['BatchRun'], each time with '<batcharg>=<file>' plus a
     basefile/log setting appended to the original command list and a fresh log from rje.setLog().
     self.list['BatchRun'] is emptied during the runs; it and self.log are restored afterwards, also on error.
     >> returnobj:bool [False] = Whether to keep the objects returned by self.run() in the returned list.
     << list with one element per batch file: the run() result (replaced by True unless returnobj) for a
        successful run, or the falsy run() result for a failed run. False if an error was caught and logged.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         barg = self.getStrLC('BatchArg')
         if not barg: raise ValueError('Cannot use batchrun=FILELIST if batcharg=None.')
         batchfiles = self.list['BatchRun'][0:]
         blog = self.getStr('BatchLog')
         if not blog.startswith('.'): blog = '.%s' % blog
         if not blog.endswith('.log'): blog = '%s.log' % blog
         rawcmd = self.cmd_list[0:]
         rawlog = self.log
         batchobj = []
         self.list['BatchRun'] = []  # Avoid recursive running!
         ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         try:
             bx = 0
             for bfile in batchfiles:
                 bx += 1
                 self.printLog('#BATCH','Batch running %s of %s: %s=%s' % (rje.iStr(bx),rje.iLen(batchfiles),barg,bfile))
                 ## Setup parameters
                 bbase = rje.baseFile(bfile,strip_path=True)
                 bcmd = ['%s=%s' % (barg,bfile)]
                 if self.getBool('BatchBase'):
                     if blog == '.log': bcmd += ['basefile=%s' % bbase]
                     else: bcmd += ['basefile=%s%s' % (bbase,rje.baseFile(blog))]
                 elif self.getStrLC('BatchLog'): bcmd += ['log=%s%s' % (bbase,blog)]
                 else: bcmd += ['newlog=F']
                 ## Setup Seqsuite object
                 # NOTE(review): self.cmd_list is not reset to rawcmd after the loop - confirm this is intended.
                 self.cmd_list = rawcmd + bcmd
                 self.log = rje.setLog(self.log.obj['Info'],self,self.cmd_list)     # Sets up Log object for controlling log file output
                 ## Run
                 try: batchobj.append(self.run())
                 finally: self.log = rawlog      # Always return to the main log, even if run() raised
                 ## Finish and Tidy
                 runobj = batchobj[-1]
                 if runobj:
                     if not returnobj: batchobj[-1] = True
                     info = runobj.log.obj['Info']
                     self.printLog('#RUN','%s V%s run finished.' % (info.program,info.version))
                 else: self.warnLog('Batch run failed (%s=%s).' % (barg,bfile))
         finally:
             self.list['BatchRun'] = batchfiles  # Restore the batch list whether or not the runs completed
         ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Count every falsy result (None as well as False), matching the runs that triggered the warning above
         failx = len([result for result in batchobj if not result])
         self.printLog('#BATCH','%s batch runs complete: %s failed.' % (rje.iLen(batchfiles),rje.iStr(failx)))
         return batchobj
     except: self.errorLog('%s.batchRun error' % self); return False
예제 #10
0
 def topTerms(self,slimx=20,parents=False,total='Total',countkey='counts'):  ### Selects top terms for GO slim set
     '''
     Selects top terms for GO slim set.
     For each GO domain ('cc', 'bp', 'mf'), terms are added in batches of equal count, from the highest count
     down, until at least slimx terms are held or no counts remain. Unless parents=True, any term that is a
     parent (per self.parents()) of another held term is removed after each batch.
     >> slimx:int [20] = Desired min. number of terms for each GO domain.
     >> parents:bool [False] = Whether parents and children both allowed in list
     >> total:str ['Total'] = Sample containing Total counts for assessment
     >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
     - self.go(id)[countkey] = {Sample:count}
     << returns a list of GO IDs that meet criteria ([] if an error was caught and logged)
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         #x#self.opt['DeBug'] = True
         terms = []                          # List of terms
         dom = {'cc':{},'bp':{},'mf':{}}     # Dictionary of {domain:{count:[IDs]}}
         for goid in self.go():
             count = self.go(goid)[countkey][total]
             gotype = self.go(goid)['type']
             if count not in dom[gotype]: dom[gotype][count] = [goid]
             else: dom[gotype][count].append(goid)
         ### ~ [2] ~ Generate Top Terms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.deBug(dom)
         for gotype in dom:
             dterms = []                     # Terms for this domain only
             dkeys = rje.sortKeys(dom[gotype])   # Counts, low to high
             dkeys.reverse()                 # Counts, high to low
             (dx,dtot) = (0.0,len(dkeys))
             mincount = 0                    # Lowest count used for this domain; stays 0 if nothing is selected
             while dkeys and len(dterms) < slimx: # Keep looping
                 self.deBug('%s: %s' % (gotype,dterms))
                 self.progLog('#TOP','Generating top %d %s terms: %.1f%%' % (slimx,gotype,dx/dtot))
                 dx += 100.0
                 mincount = dkeys.pop(0)     # Remove from list
                 dterms += dom[gotype][mincount]     # Add terms to term list
                 if parents: continue        # Don't care if parents and children all mixed up
                 for goid in dterms[0:]:     # Iterate over a copy as dterms is edited in the loop
                     if goid not in dterms: continue             # Previously-removed parent
                     for par in self.parents(goid):              # Check all parents
                         if par in dterms: dterms.remove(par)    # Remove parent term
             self.printLog('\r#TOP','Identified %s top %s terms: >= %s genes' % (rje.iLen(dterms),gotype,rje.iStr(mincount)))
             terms += dterms                 # Found a stable list of terms
         self.deBug(terms)
         return terms
     except: self.errorLog('Major problem with GO.topTerms()')
     return []
예제 #11
0
 def classify(self): ### Generate summary tables for each protein class
     '''
     Generate summary tables for each protein class.
     Reads one class per file in self.list['Classify']: the class name comes from rje.baseFile() (path stripped)
     and the protein list is the first whitespace-delimited word of each non-blank line. For each class, a copy
     of the 'taxamap' table is filtered on its 'protein' field and given to self.summaryScores().
     Classes with no proteins, or no matching table entries, are skipped with a warning.
     Errors are logged with self.errorLog() and not re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         rankdb = self.db('taxamap')
         for cfile in self.list['Classify']:
             pclass = rje.baseFile(cfile,strip_path=True)
             with open(cfile,'r') as cin: flines = cin.readlines()   # Read then close the file straight away
             clist = []
             for fline in flines:
                 fsplit = rje.chomp(fline).split(None,1)
                 if not fsplit: continue     # Blank line: nothing to index, so skip (was an IndexError)
                 clist.append(fsplit[0])
             self.printLog('#CLASS','%s "%s" class proteins read from %s' % (rje.iLen(clist),pclass,cfile))
             if not clist:
                 self.warnLog('No proteins read from %s' % (cfile))
                 continue
             classdb = db.copyTable(rankdb,pclass)
             # inverse=True: presumably drops entries whose 'protein' is NOT in clist - confirm in rje_db
             classdb.dropEntriesDirect('protein',clist,inverse=True)
             if not classdb.entries():
                 self.warnLog('No "%s" proteins found in TaxaMap table' % (pclass))
                 continue
             self.summaryScores(classdb,pclass,'MinClass')
     except: self.errorLog('%s.classify() error' % self.prog())
예제 #12
0
 def seqSubset2(self):    ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''
     Extracts sequence subset from MOUSE cDNA and Peptide libraries.
     [1]/[2] Loads BASEFILE.map.tdt if present; otherwise builds the 'map' table (Ingolia gene -> MGI gene ->
     EnsEMBL gene) from hard-coded MGI files and the 'starts' table, and makes Ingolia.*.all.fa subsets.
     Transcript IDs from Ingolia.cdna.all.fa are then added to the map (field 'ENST') and the map is saved.
     [3] For each 'starts' entry, finds the transcript whose sequence around the initiation codon matches the
     entry's context and gives the longest translation; entries with a peptide of at least 20 aa are written
     to IngExact.cdna.all.fa and IngExact.pep.all.fa. The table is saved as BASEFILE.mapped_exact.tdt.
     << None. Errors are logged with self.errorLog() and not re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if os.path.exists('%s.map.tdt' % self.baseFile()):
             mdb = self.db().addTable('%s.map.tdt' % self.baseFile(),mainkeys=['Ingolia'],name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             self.db().addTable(xfile,mainkeys=['Gene'],name='xref')    # Was "db.addTable": db is not defined here
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log,self.cmd_list)
             #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
             self.obj['Map'].loadData(['sourcedata=%s' % xfile,'aliases=%s' % afile])
             ing_genes = string.split(string.join(self.db('starts').index('Gene').keys()).upper())
             genemap = self.obj['Map']
             ing_map = {}
             for gene in ing_genes: ing_map[gene] = genemap.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes),rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:    # Iterate over a copy as ing_mgi is edited in the loop
                 if gene not in xdb.data():
                     self.printLog('#MAP','Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene); ing_mgi.remove(gene)
             self.printLog('#BAD','Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
             with open('ingolia.bad.txt','w') as badout: badout.write(string.join(bad_genes))
             ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi),'EnsEMBL',sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog('#MUSG','%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes),rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError
             self.deBug(ing_musg[:10])
             for stype in ['cdna','pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 # NOTE(review): this tests for the INPUT file; possibly meant to test for seqout - confirm.
                 if self.getBool('Force') or not os.path.exists(seqfile):
                     seqout = 'Ingolia.%s.all.fa' % stype
                     # NOTE(review): 'autload=T' looks like a typo for 'autoload=T' - left unchanged, confirm.
                     seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'seqout=%s' % seqout,'autofilter=T','autload=T','seqmode=file','gooddesc=%s' % string.join(ing_musg,',')]
                     rje_seqlist.SeqList(self.log,seqcmd)
             # Key field must match the 'Ingolia' field name (was misspelt 'Ignolia'), as used when loading above
             mdb = self.db().addEmptyTable('map',['Ingolia','Gene','EnsEMBL'],['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia':gene,'Gene':ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         seqfile = 'Ingolia.cdna.all.fa'
         # NOTE(review): 'autload=T' looks like a typo for 'autoload=T' - left unchanged, confirm.
         seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autofilter=F','autload=T','seqmode=file']
         iseq = rje_seqlist.SeqList(self.log,seqcmd)
         if 'ENST' not in mdb.fields():
             mdb.addField('ENST',evalue='')
             while iseq.nextSeq():
                 (iname,icdna) = iseq.getSeq()
                 musg = rje.matchExp(r'gene:(\S+)',iname)[0]
                 for entry in mdb.indexEntries('EnsEMBL',musg):
                     if entry['ENST']: entry['ENST'] += ',%s' % string.split(iname)[0]
                     else: entry['ENST'] = string.split(iname)[0]
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]':'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST'); sdb.addField('ENSP'); sdb.addField('ENSI');
         ex = 0.0; etot = sdb.entryNum(); sx = 0; fx = 0
         minpep = 20     # Minimum translated peptide length to output
         ENST = open('IngExact.cdna.all.fa','w')
         ENSP = open('IngExact.pep.all.fa','w')
         try:
             for entry in sdb.entries():
                 self.progLog('\r#ING','Mapping Ignolia Harrington Starts: %.2f%%' % (ex/etot)); ex += 100.0
                 #self.deBug(entry)
                 entry[icon] = entry[icon].upper()
                 gene = entry['Gene'].upper()
                 mentry = mdb.data(gene)
                 entry['ENST'] = entry['ENSI'] = ''
                 cdnaseq = peptseq = ''
                 if not mentry or not mentry['ENST']: fx += 1; continue
                 #self.deBug(mentry)
                 mtype = 'fail'
                 for trans in string.split(mentry['ENST'],','):
                     (tname,tseq) = iseq.getDictSeq(trans,format='tuple')
                     self.deBug('%s vs %s' % (tseq[entry[icod]-3:][:7],entry[icon]))
                     if tseq[entry[icod]-3:][:7] == entry[icon]:     # 7 nt window from 3 nt before the codon position
                         ipept = string.split(rje_sequence.dna2prot(tseq[entry[icod]:]),'*')[0]
                         self.deBug(ipept)
                         if len(ipept) > len(peptseq):   # Keep the transcript giving the longest peptide
                             entry['ENST'] = trans
                             cdnaseq = tseq
                             peptseq = ipept
                             mtype = 'exact'
                 if not entry['ENST']:
                     self.printLog('\r#ING','Unable to find Harrington start for %s %s (%s)' % (gene,entry[icod],entry[icon]),screen=False)
                     fx += 1; continue
                 elif len(peptseq) < minpep:
                     self.printLog('\r#ING','Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene,entry[icod],entry[icon]),screen=False)
                     fx += 1; continue
                 ingid = rje.preZero(int(ex/100),etot)   # ex was already incremented, so this is the 1-based entry number
                 entry['ENSI'] = 'ENSINGT%s' % ingid
                 entry['ENSP'] = 'ENSINGP%s' % ingid
                 ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],entry['Gene'],mentry['Gene'],cdnaseq))
                 ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (ingid,mtype,entry['ENST'],mentry['EnsEMBL'],ingid,entry['Gene'],mentry['Gene'],peptseq))
                 sx += 1
         finally:
             ENST.close(); ENSP.close()      # Close output files even if mapping fails part-way
         sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         self.printLog('\r#ING','Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx),rje.iStr(fx)))
         return
     except: self.errorLog('%s.seqSubset2 error' % self)
예제 #13
0
 def powerGO(self,numbers,sig=0.01,samples='all',total='Total',countkey='counts',ignore=[]):  ### Special GO power calculation for GO slim set
     '''
     Special GO power calculation for GO slim set.

     Works through every possible total occurrence level (1..N) for a term and records the levels at which the
     subsample counts could reach the desired significance. GO terms whose own `total` count is one of those
     levels are then returned.
     >> numbers:dictionary of {Sample:Count}
     >> sig:float [0.01] = Desired significance level to achieve. Currently uncorrected. Add Bonf/FDR with time.
     >> samples:str ['all'] = Whether sig must be achievable for 'any' or 'all' samples. (Only 'any' is tested for
        explicitly: every other value behaves as 'all'.)
     >> total:str ['Total'] = Sample containing Total counts to compare against
     >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
     - self.go(id)[countkey] = {Sample:count}
     >> ignore:list of Samples to ignore from calculation. Only read, never modified.
     << returns a list of GO IDs that meet criteria, or an empty list if an error was caught and logged.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         N = numbers[total]        # Total count for calculating expectations/probabilities
         skip = list(ignore) + [total]    # Built once rather than per sample; list() also accepts tuples/sets
         nlist = [numbers[sample] for sample in numbers if sample not in skip]   # Counts for subsamples to be assessed
         nlist = rje.sortUnique(nlist,xreplace=False,num=True)
         anysample = samples == 'any'
         ### ~ [2] ~ Generate Power Range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         powered = set()             # Acceptable Total counts for subset (set: looked up once per GO term below)
         nx = 0.0
         for i in range(1,N+1):      # Look at all possible levels of occurrence
             self.progLog('#POW','Calculating GO term power: %.1f%%' % (nx/N))
             nx += 100.0
             ok = 0                  # Number of subsample sizes that can reach significance at this level
             p = float(i) / N        # Probability of each gene having this term
             for n in nlist:         # Look at each subset
                 k1 = min(i,n)       # Want to look at largest possible count for sample-term pairing
                 k2 = max(0,n-(N-i)) # Also want to look at the likelihood of under-representation
                 # NOTE(review): presumably rje.binomial(k,n,p) is the probability of k+ successes - confirm in rje.
                 if rje.binomial(k1,n,p,callobj=self) <= sig: ok += 1
                 elif (1 - rje.binomial(k2+1,n,p,callobj=self)) <= sig: ok += 1
                 if ok and anysample: break     # One powered sample is enough in 'any' mode
             # NB. With no samples to assess, ok == len(nlist) == 0 and every level is accepted.
             if (ok and anysample) or ok == len(nlist): powered.add(i)
         self.printLog('\r#POW','Calculation of GO term power complete.',log=False)
         self.deBug(nlist)
         ### ~ [3] ~ Generate GO Slim ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         terms = []
         (ix,itot) = (0.0,len(self.go()))
         for goid in rje.sortKeys(self.go()):
             self.progLog('#POW','Assessing terms for power: %.1f%% (%s terms)' % (ix/itot,rje.iLen(terms)))
             ix += 100.0
             if self.go(goid)[countkey][total] in powered: terms.append(goid)
         self.printLog('\r#POW','Assessed terms for statistical power, p <= %s: %s GO terms' % (sig,rje.iLen(terms)))
         #!# Add correction terms #!#
         self.deBug(terms)
         return terms
     except: self.errorLog('Major problem with GO.powerGO()')
     return []
예제 #14
0
 def batchRun(self, returnobj=False):  ### Execute batch mode runs
     '''
     Execute batch mode runs.

     Calls self.run() once per file in self.list['BatchRun'], each time with the original commands plus
     `BatchArg=FILE` and a basefile/log option chosen from the BatchBase/BatchLog settings. A fresh log object is
     used for each run. The object's own log, command list and BatchRun list are put back afterwards, including
     when a run raises.
     >> returnobj:bool [False] = whether to keep the objects returned by self.run() in the returned list. If False,
        successful runs are recorded as True.
     << list with one element per batch file (object/True for success, the falsy run() result for failure), or
        False if an error was caught and logged.
     '''
     try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         barg = self.getStrLC('BatchArg')
         if not barg:
             raise ValueError(
                 'Cannot use batchrun=FILELIST if batcharg=None.')
         batchfiles = self.list['BatchRun'][0:]
         blog = self.getStr('BatchLog')
         # Normalise the log suffix to the form ".X.log" (or just ".log" if no BatchLog text is set)
         if not blog.startswith('.'): blog = '.%s' % blog
         if not blog.endswith('.log'): blog = '%s.log' % blog
         rawcmd = self.cmd_list[0:]
         rawlog = self.log
         batchobj = []
         self.list['BatchRun'] = []  # Avoid recursive running!
         try:
             ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             bx = 0
             for bfile in batchfiles:
                 bx += 1
                 self.printLog(
                     '#BATCH', 'Batch running %s of %s: %s=%s' %
                     (rje.iStr(bx), rje.iLen(batchfiles), barg, bfile))
                 ## Setup parameters
                 bbase = rje.baseFile(bfile, strip_path=True)
                 bcmd = ['%s=%s' % (barg, bfile)]
                 if self.getBool('BatchBase'):
                     if blog == '.log': bcmd += ['basefile=%s' % bbase]
                     else:
                         bcmd += ['basefile=%s%s' % (bbase, rje.baseFile(blog))]
                 elif self.getStrLC('BatchLog'):
                     bcmd += ['log=%s%s' % (bbase, blog)]
                 else:
                     bcmd += ['newlog=F']
                 ## Setup Seqsuite object
                 self.cmd_list = rawcmd + bcmd
                 self.log = rje.setLog(
                     self.log.obj['Info'], self, self.cmd_list
                 )  # Sets up Log object for controlling log file output
                 ## Run
                 try:
                     batchobj.append(self.run())
                 finally:
                     # Always return to the main log, so later messages (and any error report) go to the right file
                     self.log = rawlog
                 ## Finish and Tidy
                 runobj = batchobj[-1]
                 if runobj:
                     if not returnobj: batchobj[-1] = True
                     info = runobj.log.obj['Info']
                     self.printLog(
                         '#RUN',
                         '%s V%s run finished.' % (info.program, info.version))
                 else:
                     self.warnLog('Batch run failed (%s=%s).' % (barg, bfile))
         finally:
             # Do not leave the last batch file's commands, or an emptied BatchRun list, on the object
             self.cmd_list = rawcmd
             self.list['BatchRun'] = batchfiles
         ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Tally every falsy result: the same test that triggers the failure warning above
         failx = len([result for result in batchobj if not result])
         self.printLog(
             '#BATCH', '%s batch runs complete: %s failed.' %
             (rje.iLen(batchfiles), rje.iStr(failx)))
         return batchobj
     except:
         self.errorLog('%s.batchRun error' % self)
         return False
예제 #15
0
 def setup(self,force=False,parents=True):    ### Main class setup method.
     '''
     Main class setup method.

     Reads the TaxMap file into self.dict['TaxMap'] {parent:[children]}, self.dict['Rank'] {TaxID:rank},
     self.list['RankID'] and (optionally) self.dict['Parent'] {child:parent}. Unless MemSaver is set, species
     codes and names are also loaded into self.dict['TaxDict'] from the SpecFile and NameMap files. Finally
     registers the species code table.
     >> force:bool [False] = whether to run setup again even if the 'Setup' flag is already set.
     >> parents:bool [True] = whether to populate self.dict['Parent'].
     << True if setup completed (or had already been completed); False if an error was caught and logged.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getBool('Setup') and not force: self.printLog('#SETUP','Taxonomy setup already complete.'); return True
         if not self.setupSourceData(): raise IOError('Taxonomy source data setup failed.')
         if not self.getStrLC('Basefile'):
             if self.getBool('BatchMode'): self.setBaseFile('batch')
             elif self.list['TaxIn']: self.setBaseFile(rje.baseFile(self.list['TaxIn'][0],strip_path=True))
         # Lower-case the output formats and split any space-separated elements
         self.list['TaxOut'] = ' '.join(self.list['TaxOut']).lower().split()
         if 'all' in self.list['TaxOut']: self.list['TaxOut'] = ['taxid','spcode','name','common']
         self.list['RankID'] = []
         ### ~ [2] TaxMap Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # NOTE(review): self.dict['Rank'] and self.dict['Parent'] are not reset here, so a forced re-run would
         # presumably warn about every child already having a parent - confirm whether that is intended.
         taxmap = self.dict['TaxMap'] = {}
         tx = 0; px = 0; fx = 0      # TaxID read; parents with 1+ children; lines that failed to parse
         mapfile = self.getStr('TaxMap')
         with open(mapfile,'r') as TAXMAP:   # Streamed line by line and closed on exit
             for tline in TAXMAP:
                 self.progLog('\r#TAXID','Reading %s: %s TaxID' % (mapfile,rje.iStr(tx)))
                 try: (child,parent,taxtype) = tline.split('\t|\t')[:3]
                 except ValueError: fx += 1; self.debug(tline); continue     # Fewer than three fields
                 self.dict['Rank'][child] = taxtype
                 if parent not in taxmap: taxmap[parent] = []
                 if not taxmap[parent]: px += 1      # First child seen for this parent
                 if taxtype in self.list['RankTypes']: self.list['RankID'].append(child)
                 if child not in taxmap: taxmap[child] = []
                 # NB. A self-parented entry (child == parent) is still listed among its own children here.
                 taxmap[parent].append(child); tx += 1
                 if child in self.dict['Parent']: self.warnLog('Child TaxID "%s" already has parent!' % child)
                 if parents and child != parent: self.dict['Parent'][child] = parent
         self.printLog('\r#TAXID','%s TaxID (%s parent taxa) read from %s; %s failed.' % (rje.iStr(tx),rje.iStr(px),mapfile,rje.iStr(fx)))
         self.printLog('#SPEC','%s TaxID mapped to %s RankTypes' % (rje.iLen(self.list['RankID']),'/'.join(self.list['RankTypes'])))
         if self.test():
             # Consistency check: recount the parents that have children and compare with the running tally
             pcheck = 0
             for tax in taxmap:
                 if taxmap[tax]: pcheck += 1
             self.printLog('#TEST','%s parent taxa with listed children' % rje.iStr(pcheck))
             if px != pcheck: raise ValueError('Parent taxa count mismatch: %s counted vs %s with children.' % (px,pcheck))
         ### ~ [3] NameMap Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.getBool('MemSaver'):
             taxdict = self.dict['TaxDict']
             ## ~ [3a] SpecFile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             tx = 0; cx = 0; taxid = None    # taxid = TaxID of the most recent species entry line
             specfile = self.getStr('SpecFile')
             with open(specfile,'r') as SPECFILE:
                 for tline in SPECFILE:
                     self.progLog('\r#SPEC','Reading %s species data: %s TaxID' % (specfile,rje.iStr(tx)))
                     nmatch = rje.matchExp(r'^(\S+)\s+\S+\s+(\d+):\s+N=(\S.+)\s*$',tline)
                     if nmatch:      # Entry line: species code, TaxID and scientific name
                         taxid = nmatch[1]; tx += 1
                         taxdict[taxid] = {'spcode': nmatch[0], 'name': nmatch[2]}
                         continue
                     if not taxid: continue
                     cmatch = rje.matchExp(r'C=(\S.+)\s*$',tline)    # Common name line for the current entry
                     if cmatch: taxdict[taxid]['common'] = cmatch[0]; cx += 1
             self.printLog('\r#SPEC','%s species codes/names and %s common names read from %s.' % (rje.iStr(tx),rje.iStr(cx),specfile))
             ## ~ [3b] NCBI names.dmp ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             tx = 0
             namefile = self.getStr('NameMap')
             with open(namefile,'r') as NAMEMAP:
                 for tline in NAMEMAP:
                     self.progLog('\r#SPEC','Reading %s species names: %s TaxID' % (namefile,rje.iStr(tx)))
                     tdata = tline.split('\t|\t')
                     # Skip short/malformed lines rather than aborting the whole setup
                     if len(tdata) < 4 or not tdata[3].startswith('scientific name'): continue
                     taxid = tdata[0]
                     # Only add names for TaxID that were not already loaded from the SpecFile
                     if taxid not in taxdict: taxdict[taxid] = {'name': tdata[1]}; tx += 1
             self.printLog('\r#SPEC','%s extra species names read from %s.' % (rje.iStr(tx),namefile))
         ### ~ [4] Species code table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         spfile = '%sspcode.%s.tdt' % (self.getStr('SourcePath'),self.getStr('SourceDate'))
         self.db().addTable(spfile,['Species'],name='SpCode',expect=False)
         ### ~ [5] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.setBool({'Setup':True})
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
예제 #16
0
    def inSilicoHybrid(
        self
    ):  ### Filter and combine subreads from parent and output to fasta file.
        '''
        Filter and combine subreads from parent and output to fasta file.

        This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each
        parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing. Each
        parent needs a `*.fofn` file of subread file names, `*.unique.tdt` unique subreads table and `*.smrt.tdt` SMRT cell
        identifier table.)

        A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking the
        unique subreads from each parent by length. First, the longest subreads from each parent are compared and the shortest
        selected to be the first subread of the diploid. (The shortest is taken to minimise length differences between the
        two parents.) Next, the longest subread from the next parent that is no longer than the previous subread is added.
        This cycles, picking a read from the parent with fewest cumulative bases each cycle. The longest subread that is
        no longer than the previous subread is selected. This continues until one parent runs out of subreads. Additional
        subreads will be added from the other parent if they reduce the difference in cumulative output for each parent.

        Final output will be a `*.subreads.fasta` file in which each parent has a similar total sequence content and for
        which the subread length distributions should also be similar. This is to overcome biases in resulting diploid
        assemblies, where one parent has higher quality data than the other.

        NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has much
        higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to reads with a
        minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may need to be
        relaxed.

        IMPLEMENTATION NOTES: each selection simply takes the longest remaining subread (passing the RQ filter) from the
        chosen parent; lengths are not explicitly compared with the previously selected read. The output file is named
        `BASEFILE.L<lenfilter>RQ<rqfilter decimals>.fasta`. Returns True on success, or False if an error was caught and
        logged.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            from collections import deque  # Reads are consumed from the front: O(1) per read, unlike list.pop(0)
            ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent1: %s' % self.getStr('Parent1'))
            base1 = rje.baseFile(self.getStr('Parent1'))
            parent1 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent1'),
                 'basefile=%s' % base1])
            parent1.setup()
            udb1 = parent1.udb()
            cdb = parent1.db('smrt', add=True, mainkeys=['Name'])
            cdb.dataFormat({'SMRT': 'int'})
            cx = cdb.entryNum()  # Number of Parent1 SMRT entries = offset for Parent2 SMRT IDs
            ## ~ [0b] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent2: %s' % self.getStr('Parent2'))
            base2 = rje.baseFile(self.getStr('Parent2'))
            parent2 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent2'),
                 'basefile=%s' % base2])
            parent2.setup()
            udb2 = parent2.udb()
            cdb2 = parent2.db('smrt', add=True, mainkeys=['Name'])
            cdb2.dataFormat({'SMRT': 'int'})
            # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1
            for entry in cdb2.entries() + udb2.entries():
                entry['SMRT'] = entry['SMRT'] + cx
            cdb = parent1.db().mergeTables(cdb, cdb2)
            ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #'
            )
            minlen = self.getInt('LenFilter')
            minrq = self.getNum('RQFilter')
            rqstr = '%s' % minrq
            # rqstr[2:] drops the first two characters (the "0." of a value such as 0.84)
            filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(), minlen,
                                             rqstr[2:])
            ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqbatch = []  # List of SeqList objects
            batchfiles = parent1.list['Batch'] + parent2.list['Batch']
            self.printLog(
                '#BATCH',
                '%s sequence files to process.' % rje.iLen(batchfiles))
            for seqfile in batchfiles:
                seqcmd = self.cmd_list + [
                    'seqmode=file', 'autoload=T', 'summarise=F',
                    'seqin=%s' % seqfile, 'autofilter=F'
                ]
                seqbatch.append(rje_seqlist.SeqList(self.log, seqcmd))
            self.printLog(
                '#BATCH',
                '%s sequence files to summarise.' % rje.iLen(seqbatch))
            if not seqbatch:
                raise IOError(
                    'No batch input fasta files found! Make sure parentN=FILE settings given *.fofn.'
                )
            ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Unique subreads for each parent, longest first
            elists = [
                deque(udb1.sortedEntries('Len', reverse=True)),
                deque(udb2.sortedEntries('Len', reverse=True))
            ]
            plen = [0, 0]  # Summed lengths for each parent
            pseq = [0, 0]  # Total sequence number for each parent
            prq = [0, 0]  # Total sequence RQ for each parent (convert to mean)
            if not elists[0] or not elists[1]:
                raise ValueError(
                    'No Unique ZMW subreads for one or both parents!')
            lastlen = max(elists[0][0]['Len'],
                          elists[1][0]['Len'])  # Length of last selected read
            # Strip leading low quality reads so that the first read of each parent passes the RQ filter
            for elist in elists:
                while elist and elist[0]['RQ'] < minrq:
                    elist.popleft()
            if not elists[0] or not elists[1]:
                raise ValueError(
                    'No Unique ZMW subreads for one or both parents!')
            nextp = 0  # Index of next parent to use
            if elists[0][0]['Len'] < elists[1][0]['Len']: nextp = 1

            ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            zmwlist = []  # List of (smrt,zmw,pos) meeting filtering criteria
            ux = 0.0
            utot = len(elists[0]) + len(elists[1])
            while lastlen:
                self.progLog('\r#DIP',
                             'Diploidising subreads: %.2f%%' % (ux / utot))
                elist = elists[nextp]
                while elist and elist[0]['RQ'] < minrq:
                    elist.popleft()
                    ux += 100.0
                if elist and elist[0]['Len'] < minlen:
                    # Reads are sorted by length, so everything left for this parent is too short
                    ux += 100.0 * len(elist)
                    elist = []
                if not elist:
                    nextp = 1 - nextp  # This parent is exhausted: top up from the other one below
                    break  # Finish
                entry = elist.popleft()
                ux += 100.0
                zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
                # Next read comes from whichever parent has the fewest cumulative bases
                if plen[1 - nextp] <= plen[nextp]: nextp = 1 - nextp
                lastlen = entry['Len']
            ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Keep adding reads from the remaining parent while each one reduces the difference in total bases.
            # Each read is consumed exactly once, so this always terminates. The RQ filter is applied to every
            # read, not just the leading ones.
            elist = elists[nextp]
            while elist:
                self.progLog('\r#DIP',
                             'Diploidising subreads: %.2f%%' % (ux / utot))
                entry = elist.popleft()
                ux += 100.0
                if entry['RQ'] < minrq:
                    continue
                if entry['Len'] < minlen:
                    break  # Sorted by length: all remaining reads are too short
                pdiff = rje.modulus(plen[0] - plen[1])
                ediff = rje.modulus(plen[nextp] + entry['Len'] -
                                    plen[1 - nextp])
                if ediff >= pdiff:
                    break  # Finish! Adding this read would not improve the balance
                zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
            elists[nextp] = []
            self.printLog(
                '\r#DIP',
                'Diploidising subreads complete: %s subreads to output.' %
                rje.iLen(zmwlist))
            if not pseq[0] or not pseq[1]:
                # Mean RQ is undefined for an empty parent: fail with a clear message, not a division error
                raise ValueError(
                    'No subreads selected for one or both parents! Check lenfilter=X and rqfilter=X settings.'
                )
            self.printLog(
                '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' %
                (self.getStr('Parent1'), rje.iStr(pseq[0]), rje.iStr(plen[0]),
                 1.0 * plen[0] / self.getInt('GenomeSize'), prq[0] / pseq[0]))
            self.printLog(
                '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' %
                (self.getStr('Parent2'), rje.iStr(pseq[1]), rje.iStr(plen[1]),
                 1.0 * plen[1] / self.getInt('GenomeSize'), prq[1] / pseq[1]))
            ## ~ [1c] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            rje.backup(self, filtfile)
            zmwset = set(zmwlist)  # Constant-time lookup for each subread of each input file
            sx = 0.0
            sn = len(seqbatch)
            fx = 0
            # The file must be closed (flushed) before it is re-read for the summary in [2]
            with open(filtfile, 'w') as SEQOUT:
                for seqlist in seqbatch:
                    #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784
                    seqnum = seqlist.seqNum()
                    if not seqnum:
                        sx += 100.0
                        continue  # Nothing to extract from an empty file
                    si = 100.0 / seqnum
                    for seq in seqlist.seqs():
                        self.progLog('\r#OUT',
                                     'Extracting subreads: %.2f%%' % (sx / sn))
                        sx += si
                        (name, sequence) = seqlist.getSeq(seq)
                        # Name = SMRT/ZMW/Pos with an optional fourth (RQ=X) field that is not needed here
                        namedata = name.replace('/', ' ').split()
                        if len(namedata) not in (3, 4):
                            raise ValueError(
                                'Cannot parse SMRT/ZMW/Pos from subread name "%s"'
                                % name)
                        (smrt, zmw, pos) = namedata[:3]
                        if (cdb.data(smrt)['SMRT'], int(zmw),
                                pos) not in zmwset:
                            continue
                        SEQOUT.write('>%s\n%s\n' % (name, sequence))
                        fx += 1
            self.printLog(
                '\r#OUT',
                'Saved %s filtered subreads to %s.' % (rje.iStr(fx), filtfile))

            ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seqcmd = self.cmd_list + [
                'seqmode=file', 'autoload=T', 'summarise=T',
                'seqin=%s' % filtfile, 'autofilter=F'
            ]
            rje_seqlist.SeqList(self.log, seqcmd)

            return True
        except:
            self.errorLog('%s.run error' % self.prog())
            return False
예제 #17
0
 def taxaChildren(self,taxid):   ### Extracts TaxID children from TaxMap file and updates RankID and TaxMap dicts.
     '''
     Extracts TaxID children from TaxMap file and updates RankID and TaxMap dicts.

     If taxid is already a key of self.dict['TaxMap'], the stored list is returned as is. Otherwise the TaxMap file
     is searched with the external `grep` program (run directly, without a shell) and only rows whose child or
     parent column is taxid are used. Children are added to self.dict['TaxMap'][taxid]. Any of those rows whose rank
     is in self.list['RankTypes'] has its child TaxID appended to self.list['RankID'].
     >> taxid:str = TaxID for which to extract children.
     << list of child TaxID (the list stored in self.dict['TaxMap'][taxid]).
     Errors are logged and then re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # NB. Searching the file for each TaxID is very slow.
         # NOTE(review): the original remark here was garbled - presumably reading the whole file once is preferred.
         import subprocess
         self.debug(taxid)
         taxmap = self.dict['TaxMap']
         if taxid in taxmap: return taxmap[taxid]
         ### ~ [1] Parse from TaxMap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxmap[taxid] = []
         taxstr = '%s' % taxid
         # Arguments are passed as a list, so neither the TaxID nor the file path is interpreted by a shell
         grepcmd = ['grep','-e',r'\s%s\s' % taxstr,'--',self.getStr('TaxMap')]
         GREP = subprocess.Popen(grepcmd,stdout=subprocess.PIPE,universal_newlines=True)
         grepout = GREP.communicate()[0]     # Waits for grep to finish and closes the pipe
         for tline in grepout.splitlines(True):
             # Same field parsing as setup(): keeps multi-word ranks (e.g. "species group") intact
             tdata = tline.split('\t|\t')
             if len(tdata) < 3: continue
             (child,parent,taxtype) = tdata[:3]
             # grep can also hit the number in other columns: only rows that really involve taxid are used
             if taxstr not in (child,parent): continue
             if parent == taxstr: taxmap[taxid].append(child)
             if taxtype in self.list['RankTypes']: self.list['RankID'].append(child)
             self.progLog('\r#TAXID','Reading %s: %s TaxID' % (self.getStr('TaxMap'),rje.iLen(taxmap)))
         return taxmap[taxid]
     except: self.errorLog('%s.taxaChildren(%s) error' % (self,taxid)); raise
예제 #18
0
 def mapToTaxID(self,taxa,nodeonly=False,rankonly=False,log=True,warn=True):  ### Maps taxa onto TaxID. If taxa is a list, will process each element.
     '''
     Maps taxa onto TaxID. If taxa is a list, will process each element. Returns a list.

     A single taxon is interpreted as the first of the following that applies:
     - all digits = TaxID, returned with its descendants from self.dict['TaxMap'] unless nodeonly=True;
     - text equal to its own upper case form without spaces = species code, looked up in the SpecFile;
     - anything else = taxon name ('_' is read as a space), looked up in the SpecFile and then in the NameMap file.
     File lookups use the external `grep` program, run directly without a shell.
     >> taxa:str or list of taxon identifiers to map. Tuples and sets are also accepted. The input is not modified.
     >> nodeonly:bool [False] = whether to limit TaxID mapping to the precise matching nodes (else include children)
     >> rankonly:bool [False] = whether to limit TaxID to those found in self.list['RankID']
     >> log:bool [True] = whether to output progress and summary log messages when mapping a list
     >> warn:bool [True] = whether to log warnings for missing or ambiguous taxa (also applied to each list element)
     << list of TaxID. Empty if nothing could be mapped. Errors are logged and then re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         import subprocess
         if not taxa: return []
         taxid = []
         ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if isinstance(taxa,(list,tuple,set,frozenset)):
             # Process a sorted copy, so that the caller's list is left untouched
             try: taxlist = sorted(taxa)
             except TypeError: taxlist = list(taxa)   # Elements that cannot be compared are processed as given
             ttot = len(taxlist)
             if ttot > 1:
                 tx = 0.0
                 for t in taxlist:
                     if log: self.progLog('\r#TAXID','Mapping to TaxID: %.1f%%' % (tx/ttot)); tx += 100.0
                     taxid += self.mapToTaxID(t,nodeonly,rankonly,log=False,warn=warn)
                 taxid = rje.sortUnique(taxid)
                 if log: self.printLog('\r#TAXID','Mapped %s taxa to %s TaxID' % (rje.iStr(ttot),rje.iLen(taxid)))
             else:
                 t = taxlist[0]
                 if log: self.progLog('\r#TAXID','Mapping %s to TaxID...' % t)
                 taxid = rje.sortUnique(self.mapToTaxID(t,nodeonly,rankonly,log=False,warn=warn))
                 if log: self.printLog('\r#TAXID','Mapped %s to %s TaxID' % (t,rje.iLen(taxid)))
             return taxid
         ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxmap = self.dict['TaxMap']; rankid = self.list['RankID']
         taxa = '%s' % taxa
         def grepLines(grepargs):    # Runs grep without a shell and returns its output lines
             GREP = subprocess.Popen(['grep'] + grepargs,stdout=subprocess.PIPE,universal_newlines=True)
             return GREP.communicate()[0].splitlines(True)
         ## ~ [2a] Taxa ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if rje.matchExp(r'^(\d+)$', taxa):
             if nodeonly:
                 if taxa in rankid or not rankonly: return [taxa]
                 else: return []
             if taxa not in taxmap:
                 if warn: self.warnLog('Cannot find TaxID %s!' % taxa,'Missing_TaxID',suppress=True)
                 return []
             # Breadth-first walk through the children. Each TaxID is visited once, so an entry listed as its
             # own child (e.g. a self-parented root) cannot cause an endless loop.
             queue = [taxa]; seen = set(queue); qx = 0
             while qx < len(queue):
                 node = queue[qx]; qx += 1
                 if not rankonly or node in rankid: taxid.append(node)
                 for child in taxmap.get(node,[]):
                     if child in seen: continue
                     seen.add(child); queue.append(child)
             return taxid
         ## ~ [2b] Species Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if taxa == taxa.upper().replace(' ',''):
             greplines = grepLines(['-e',taxa,'--',self.getStr('SpecFile')])
             for entry in greplines:
                 # NOTE(review): taxa is used as a regular expression without escaping, here and by grep.
                 try: spmatch = rje.matchExp(r'^%s\s+\S+\s+(\d+):' % taxa,entry)
                 except Exception: spmatch = None    # A code that is not a valid pattern simply fails to match
                 if spmatch: taxid.append(spmatch[0])
             if not taxid and warn: self.warnLog('Cannot find Species Code "%s"!' % taxa,'Missing_SpCode',suppress=True)
             if len(taxid) > 1 and warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), '|'.join(taxid)))
             return self.mapToTaxID(taxid,nodeonly,rankonly,log=False,warn=warn) #taxid
         ### ~ [3] Species name etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxa = taxa.replace('_',' ')
         lowtaxa = taxa.lower()
         ## ~ [3a] Grep from Uniprot ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         greplines = grepLines(['-B','2','-i','-e',taxa,'--',self.getStr('SpecFile')])
         gtaxid = None; comid = []; synid = []   # Current entry TaxID; common name hits; synonym hits
         for entry in greplines:
             # grep separates non-adjacent groups of lines with "--": the previous entry no longer applies
             if entry.strip() == '--': gtaxid = None; continue
             hmatch = rje.matchExp(r'^\S+\s+\S+\s+(\d+):',entry)
             if hmatch: gtaxid = hmatch[0]
             if not gtaxid: continue     # Entry line was not within the two lines of context: cannot assign hit
             lowentry = entry.lower()
             if rje.matchExp(r's=(%s)\s*$' % lowtaxa,lowentry): synid.append(gtaxid)
             elif rje.matchExp(r'c=(%s)\s*$' % lowtaxa,lowentry): comid.append(gtaxid)
             elif rje.matchExp(r'=(%s)\s*$' % lowtaxa,lowentry): taxid.append(gtaxid)
         # Other name hits are used first, then common names (C=), then synonyms (S=)
         if not taxid: taxid = comid
         if not taxid: taxid = synid
         if not taxid and warn: self.warnLog('Cannot find Taxon name "%s" in Uniprot!' % taxa,'Missing Taxon',suppress=True)
         if len(taxid) > 1 and warn: self.warnLog('Taxon name "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), '|'.join(taxid)))
         if taxid: return self.mapToTaxID(taxid,nodeonly,rankonly,log=False,warn=warn) #taxid
         ## ~ [3b] Grep from NCBI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         greplines = grepLines(['-i','-e','\t%s\t' % taxa,'--',self.getStr('NameMap')])
         for entry in greplines:
             gdata = entry.split('\t|\t')
             if len(gdata) < 2: continue
             # Match against the second field, then the third field if it has content
             if gdata[1].lower() == lowtaxa: taxid.append(gdata[0])
             elif len(gdata) > 2 and gdata[2] and gdata[2].lower() == lowtaxa: taxid.append(gdata[0])
         if len(taxid) > 1 and warn: self.warnLog('Taxon name "%s" hits %d Taxa ID (%s)' % (taxa, len(taxid), '|'.join(taxid)))
         return self.mapToTaxID(taxid,nodeonly,rankonly,log=False,warn=warn) #taxid
     except: self.errorLog('%s.mapToTaxID() error' % (self)); raise
예제 #19
0
    def parseMITAB(self):   ### Parse MITAB file into pairwise PPI table.
        '''Parse MITAB file into pairwise PPI table.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xref = self.obj['XRef']
            pdb = self.db('pairwise')
            pfields = ['Hub','Spoke','HubUni','SpokeUni','HubTaxID','SpokeTaxID','Evidence','IType']
            headers = {}
            for h in range(len(self.list['Headers'])): headers[self.list['Headers'][h]] = h
            dbsource = self.getStr('DBSource')
            ### ~ [2] Read through MITAB ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mx = 0; ex = 0; fax = 0; ftx = 0; fx = 0; uhx = 0; usx = 0
            epos = self.endPos('MITAB')
            complexidlist = []
            badtaxa = ['-']
            baduni = []
            while 1:
                self.progLog('\r#MITAB','Parsing %s MITAB %s: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,self.fileProg('MITAB',epos),rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
                mline = self.readDelimit('MITAB'); mx += 1
                if not mline: break
                entry = {'#':pdb.entryNum()}
                for field in pfields: entry[field] = ''
                ## ~ [2a] Add iRefIndex complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                complexid = {}     # This will take the first complex ID
                if 'irigid' in self.list['Headers'] and 'numParticipants' in self.list['Headers']:
                    if int(mline[headers['numParticipants']]) > 2:
                        complexid['A'] = complexid['B'] = 'rigid:%s' % mline[headers['irigid']]
                        #self.bugPrint(mline)
                        #self.debug(complexid)
                ## ~ [2b] Parse and check taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                taxa = {'A':'','B':''}
                for tfield in self.list['TaxaField']:
                    ab = tfield[-1:].upper()
                    if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',tfield.upper())[0]
                    try:
                        taxon = rje.matchExp('^taxid:(\d+)',mline[headers[tfield]].lower())[0]
                        if self.list['TaxID'] and taxon not in self.list['TaxID']: continue
                        taxa[ab] = taxon
                    except:
                        taxon = mline[headers[tfield]]
                        if taxon not in badtaxa:
                            badtaxa.append(taxon)
                            self.warnLog('No TaxID read from %s: "%s"' % (tfield,taxon),'no_tax',suppress=True)
                        if not self.list['TaxID']: taxa[ab] = '-'
                if not taxa['A'] and complexid: taxa['A'] = taxa['B']
                if not taxa['B'] and complexid: taxa['B'] = taxa['A']
                if not (taxa['A'] and taxa['B']): ftx += 1; continue
                ## ~ [2c] Parse protein IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ids = {'A':[],'B':[]}
                uni = {'A':'','B':''}
                for ifield in self.list['IDField']:
                    ab = ifield[-1:].upper()
                    if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',ifield.upper())[0]
                    # Split IDs on | then db:id vs self.list['MapDB']
                    for pid in string.split(mline[headers[ifield]],'|'):
                        try: (db,dbid) = string.split(pid,':',1)
                        except: continue
                        if db.lower() in ['uniprotkb'] and '(' in dbid: continue    # Only map uniprotkb accnum
                        dbid = string.split(dbid,'(')[0]
                        dbid = string.split(dbid,';')[0]
                        if db.lower() in ['uniprotkb']:
                            svid = dbid
                            dbid = string.split(svid,'-')[0]
                        if ab not in complexid:     # First identifier for A/B
                            if db.lower() in self.list['Complex']: complexid[ab] = pid; ids[ab].append(pid)
                            else: complexid[ab] = ''
                        if not self.list['MapDB'] or db.lower() in self.list['MapDB']: ids[ab].append(dbid)
                        # Parse uniprot directly if possible
                        if db.lower() in ['uniprotkb'] and not uni[ab]:
                            if self.getBool('SpliceVar'): uni[ab] = svid
                            else: uni[ab] = dbid
                #self.bugPrint(ids)
                ## ~ [2d] Map parsed IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                amb = {'A':False,'B':False}
                if not ids['A'] or not ids['B']:
                    #self.bugPrint('%s\n=> ID Failure' % mline)
                    #self.bugPrint(ids['A']); self.bugPrint(ids['B'])
                    #self.bugPrint(entry)
                    fx += 1; continue
                for ida in ids['A']:
                    #self.debug('%s => %s (or %s)' % (ida,xref.xref(ida,unique=True),xref.xref(ida,unique=False)))
                    if not entry['Hub']: entry['Hub'] = xref.xref(ida,unique=True,usedict=True)
                    if entry['Hub'] == False: amb['A'] = True
                    #if not entry['HubUni']: entry['HubUni'] = xref.xref(ida,self.getStr('UniField'),unique=True,usedict=True)
                    if not entry['HubUni']: entry['HubUni'] = self.getUniXRef(ida)
                if self.getBool('AddUni') and not entry['HubUni']:
                    entry['HubUni'] = uni['A']
                    if uni['A'] and uni['A'] not in baduni: baduni.append(uni['A'])
                if not entry['Hub'] and entry['HubUni']:
                    entry['Hub'] = entry['HubUni']
                    #self.warnLog('UniprotKB "%s" used for Hub' % entry['HubUni'],'unihub',suppress=True)
                    uhx += 1
                if not entry['Hub'] and complexid['A']:
                    entry['Hub'] = complexid['A']
                else: complexid['A'] = ''
                if self.getBool('UniOnly') and not complexid['A'] and not entry['HubUni']: entry['Hub'] = ''
                for idb in ids['B']:
                    if not entry['Spoke']: entry['Spoke'] = xref.xref(idb,unique=True,usedict=True)
                    if entry['Spoke'] == False: amb['B'] = True
                    #if not entry['SpokeUni']: entry['SpokeUni'] = xref.xref(idb,self.getStr('UniField'),unique=True,usedict=True)
                    if not entry['SpokeUni']: entry['SpokeUni'] = self.getUniXRef(idb)
                if self.getBool('AddUni') and not entry['SpokeUni']: entry['SpokeUni'] = uni['B']
                if not entry['Spoke'] and entry['SpokeUni']:
                    entry['Spoke'] = entry['SpokeUni']
                    #self.warnLog('UniprotKB "%s" used for Spoke' % entry['SpokeUni'],'unihub',suppress=True)
                    usx += 1
                if not entry['Spoke'] and complexid['B']:
                    entry['Spoke'] = complexid['B']
                else: complexid['B'] = ''
                if self.getBool('UniOnly') and not complexid['B'] and not entry['SpokeUni']:
                    entry['Spoke'] = ''
                    if uni['B'] and uni['B'] not in baduni: baduni.append(uni['B'])
                if complexid['A'] and complexid['B']:
                    if not (complexid['A'].startswith('rigid:') and complexid['B'].startswith('rigid:')):
                        self.printLog('\r#MITAB','',log=False)
                        self.warnLog('Cannot parse complex:complex PPI (%s & %s)' % (complexid['A'],complexid['B']),'complex-complex',suppress=True)
                    entry['Hub'] = entry['Spoke'] = ''
                #self.bugPrint(entry)
                #self.debug(complexid)
                if not (entry['Hub'] and entry['Spoke']):
                    if (entry['Hub'] or amb['A']) and (entry['Spoke'] or amb['B']):
                        fax += 1; continue
                    #self.bugPrint(mline); self.debug(entry)
                    fx += 1; continue
                #if self.dev() and 'PCNA' not in [entry['Hub'],entry['Spoke']]: continue
                entry['HubTaxID'] = taxa['A']
                entry['SpokeTaxID'] = taxa['B']
                if complexid['A'] and complexid['A'] not in complexidlist: complexidlist.append(complexid['A'])
                if complexid['B'] and complexid['B'] not in complexidlist: complexidlist.append(complexid['B'])
                #if complexid['A'] or complexid['B']: self.debug(entry)
                ## ~ [2c] Parse evidence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                #self.bugPrint(mline)
                evidence = []
                for tfield in self.list['MethodField']:
                    #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                    for etype in string.split(mline[headers[tfield]],'|'):
                        ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                        if ematch: evidence.append('%s:%s' % (dbsource,ematch[0]))
                if not evidence: evidence.append('%s:unknown' % (self.getStr('DBSource')))
                evidence = rje.sortUnique(evidence)
                #self.debug(evidence)
                entry['Evidence'] = string.join(evidence,'|')
                ## ~ [2d] Parse interaction types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                itypes = []
                for tfield in self.list['TypeField']:
                    #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                    for etype in string.split(mline[headers[tfield]],'|'):
                        ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                        if ematch: itypes.append(ematch[0])
                if not itypes: itypes.append('unknown')
                itypes = rje.sortUnique(itypes)
                #self.debug(itypes)
                entry['IType'] = string.join(itypes,'|')
                pdb.addEntry(entry); ex += 1
                if self.dev() and entry['Hub'] in ['KLF3']:#,'WDR5']:
                    self.printLog('#DEV',string.join(mline,'\t'))
                    #self.bugPrint(uni); self.debug(entry)
                if self.getBool('Symmetry') and not complexid['A'] and not complexid['B']:
                    pdb.addEntry({'#':pdb.entryNum(),'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                  'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                  'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                  'Evidence':entry['Evidence'],'IType':entry['IType']})
            self.printLog('\r#MITAB','Parsing %s MITAB complete: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
            self.close('MITAB')
            if (uhx+usx): self.warnLog('UniprotKB IDs used for %s Hub and %s Spoke IDs.' % (rje.iStr(uhx),rje.iStr(usx)))
            if baduni:
                baduni.sort()
                accout = '%s.%s.unmapped.uniacc' % (self.baseFile(),dbsource)
                self.warnLog('%s unmapped UniprotKB IDs used: output to %s.' % (rje.iLen(baduni),accout))
                open(accout,'w').write(string.join(baduni,'\n'))

            ### ~ [3] Convert complexes to pairwise PPIs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not complexidlist: return pdb
            self.printLog('#CPLEX','%s complex IDs parsed to convert to pairwise PPI.' % rje.iLen(complexidlist))
            ## ~ [3a] Assemble complex memberships ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            complexes = {}; chentries = []; csentries = []
            cevidence = {}  # List of Evidence for each complex
            citypes = {}    # List of ITypes for each complex
            ctaxa = {}
            ex = 0.0; etot = pdb.entryNum()
            for entry in pdb.entries():
                self.progLog('\r#CPLEX','Assembling complexes: %.1f%%' % (ex/etot)); ex += 100.0
                if entry['Hub'] in complexidlist:
                    cid = entry['Hub']
                    if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                    complexes[cid].append(entry['Spoke'])
                    ctaxa[entry['Spoke']] = entry['SpokeTaxID']
                    cevidence[cid].append(entry['Evidence'])
                    citypes[cid].append(entry['IType'])
                    chentries.append(entry)
                elif entry['Spoke'] in complexidlist:
                    cid = entry['Spoke']
                    if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                    complexes[cid].append(entry['Hub'])
                    ctaxa[entry['Hub']] = entry['HubTaxID']
                    cevidence[cid].append(entry['Evidence'])
                    citypes[cid].append(entry['IType'])
                    csentries.append(entry)
            self.printLog('\r#CPLEX','Assembled %s of %s complexes.' % (rje.iLen(complexes),rje.iLen(complexidlist)))
            #self.debug(complexes)
            ## ~ [3b] Update complexes dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cppi = {}
            ex = 0.0; etot = len(complexes); rx = 0; px = 0; cmax = 0
            for cid in rje.sortKeys(complexes):
                self.progLog('\r#CPLEX','Reducing complexes: %.1f%%' % (ex/etot)); ex += 100.0
                if self.dev(): self.printLog('#DEV','Complex %s: %s' % (cid,complexes[cid]))
                if len(complexes[cid]) < 2:
                    complexes.pop(cid)
                    cevidence.pop(cid)
                    citypes.pop(cid)
                    rx += 1; continue
                complexes[cid].sort()
                #cevidence[cid] = string.join(rje.sortUnique(cevidence[cid]),'|')
                #citypes[cid] = string.join(rje.sortUnique(citypes[cid]),'|')
                cmax = max(cmax,len(complexes[cid]))
                #px += (len(complexes[cid]) * (len(complexes[cid])-1))
                members = complexes[cid][0:]
                while members:
                    hub = members.pop(0)
                    if self.dev() and hub == 'KLF3': self.debug(cid)
                    if hub not in cppi: cppi[hub] = {}
                    for spoke in members:
                        if spoke not in cppi[hub]:
                            cppi[hub][spoke] = []; px += 1
                            cppi[hub][spoke].append(cid)
            self.printLog('\r#CPLEX','Reduced %s complexes to %s > 1 member: %s ppi to add.' % (rje.iStr(etot),rje.iLen(complexes),rje.iStr(px)))
            ## ~ [3c] Update pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cix = pdb.entryNum()
            for centry in chentries + csentries: pdb.dropEntry(centry)
            ex = 0.0; etot = len(cppi)
            for hub in rje.sortKeys(cppi):
                self.progLog('\r#CPLEX','Expanding complexes: %.1f%%' % (ex/etot)); ex += 100.0
                #hentry = {'Hub':hub,'HubUni':xref.xref(hub,self.getStr('UniField'),unique=True,usedict=True),'HubTaxID':ctaxa[hub]}
                hentry = {'Hub':hub,'HubUni':self.getUniXRef(hub),'HubTaxID':ctaxa[hub]}
                for spoke in rje.sortKeys(cppi[hub]):
                    evidence = []
                    itypes = []
                    ctypes = []
                    for cid in cppi[hub][spoke]:
                        evidence += cevidence[cid]
                        itypes += citypes[cid]
                        ctypes += string.split(cid,':')[0]
                    ctype = string.join(rje.sortUnique(ctypes),'|')
                    evidence = string.join(rje.sortUnique(evidence),'|')
                    if not evidence: evidence = '%s:%s' % (dbsource,ctype)
                    itypes = string.join(rje.sortUnique(itypes),'|')
                    if not itypes: itypes = ctype
                    #newentry = {'#':cix,'Spoke':spoke,'SpokeUni':xref.xref(spoke,self.getStr('UniField'),unique=True,usedict=True),'SpokeTaxID':ctaxa[spoke]}
                    newentry = {'#':cix,'Spoke':spoke,'SpokeUni':self.getUniXRef(spoke),'SpokeTaxID':ctaxa[spoke]}
                    newentry['Evidence'] = evidence
                    newentry['IType'] = itypes
                    entry = pdb.addEntry(rje.combineDict(newentry,hentry,overwrite=False)); cix += 1
                    if self.dev() and entry['Hub'] in ['KLF3','WDR5']: self.debug('Complex: %s' % entry)
                    if self.getBool('Symmetry'):
                        pdb.addEntry({'#':cix,'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                      'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                      'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                      'Evidence':entry['Evidence'],'IType':entry['IType']})
                        cix += 1
            self.printLog('#CPLEX','%s complex IDs expanded to pairwise PPI => %s ppi (symmetry=%s).' % (rje.iLen(complexidlist),rje.iStr(pdb.entryNum()),self.getBool('Symmetry')))
            return pdb
        except: self.errorLog('%s.parseMITAB error' % self.prog())
예제 #20
0
 def _positiveAndNegativePeptides(self): ### Populates PosPep and NegPep Lists
     '''
     Populates PosPep and NegPep Lists.
     Digests the sequences read from the 'Positives' file with the 'PepCut' protease, flags each unique peptide as
     positive (present in the 'Peptides' file) or negative (absent), flags peptides that occur more than once in the
     digest of self.obj['SeqList'] as redundant, and stores one row per peptide in a 'Peptides' table.
     Side effects: sets self.list['Peptides'], ['PosPep'], ['NegPep'] and ['Redundant']; writes *.peptides.tdt,
     *.positives.fas and *.negatives.fas using self.basefile() as the prefix.
     << pdb:Table = the 'Peptides' table on success; False if either input file is missing; None if an error was logged.
     '''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         basefile = self.basefile()
         pfile = '%s.peptides.tdt' % basefile
         #if rje.exists(pfile) and not self.getBool('Force'):
         #    try:
         #        pdb = self.db().addTable(pfile,['Peptide'],name='Peptides')
         #        pdb.dataFormat(reformat={'Len':'int','MWt':'num','Cys':'int','Ser':'int','Hyd':'num'})
         #        self.list['Peptides'] = self.list['PosPep'] = pdb.index('Pos')['Y']
         #        self.list['NegPep'] = pdb.index('Positive')['Neg']
         #        return pdb
         #    except: pass
         if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
         self.list['Peptides'] = peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
         seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
         pdb = self.db().addEmptyTable('Peptides',['Peptide','NR','Pos','Len','MWt','C','HPW','DENQ','M','Hyd'],['Peptide'])
         protease = self.getStr('PepCut')

         def digest(sequence):
             '''Returns the list of fragments made by cutting sequence at every site listed for the chosen protease.'''
             # Each cut pattern carries a ':' at the cleavage point; re-joining on the pattern inserts that marker
             # into the sequence, so a final split on ':' yields the fragments (empty strings included).
             for cut in proteases[protease]: sequence = cut.join(sequence.split(cut.replace(':','')))
             return sequence.split(':')

         ### ~ [1] ~ Digest Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # The lists registered in self.list are only ever mutated in place, so the stored references stay valid.
         poslist = []; neglist = []
         self.list['PosPep'] = poslist; self.list['NegPep'] = neglist
         seenpos = set()     # Mirrors poslist for O(1) duplicate checks; poslist keeps first-seen order
         sx = 0.0; stot = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
             frag = [pep for pep in digest(seq.getSequence()) if pep]    # Drop empty fragments
             if not self.getBool('NTerm'): frag = frag[1:]               # Skip the first (N-terminal) fragment
             for pep in frag:
                 if pep not in seenpos: seenpos.add(pep); poslist.append(pep)
         self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
         ## ~ [1b] ~ Peptide Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         allpep = set()      # Every fragment seen so far in the full sequence list
         redundant = []      # One item per repeat observation of a fragment (may hold duplicates)
         self.list['Redundant'] = redundant
         sx = 0.0; stot = self.obj['SeqList'].seqNum()
         for seq in self.obj['SeqList'].seqs():
             self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (protease,sx/stot)); sx += 100.0
             for frag in digest(seq.getSequence()):
                 if frag in allpep: redundant.append(frag)
                 else: allpep.add(frag)
         self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (protease,rje.iStr(stot)))
         ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # NOTE(review): assumes loadFromFile() returned a list of peptide strings (one per line) - confirm.
         observed = set(peplist)
         repeated = set(redundant)
         keptpos = []
         px = 0.0; ptot = len(poslist)
         for pep in poslist:
             self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
             entry = {'Peptide':pep,'MWt':rje_sequence.MWt(pep),'Hyd':rje_sequence.eisenbergHydropathy(pep,returnlist=False),
                      'Len':len(pep),'NR':'Y','Pos':'Y'}
             if pep in observed: keptpos.append(pep)
             else: neglist.append(pep); entry['Pos'] = 'N'     # Digested but not in the Peptides file
             if pep in repeated: entry['NR'] = 'N'
             for aacomb in ['C','HPW','DENQ','M']:               # Count of residues belonging to each group
                 entry[aacomb] = sum([pep.count(a) for a in aacomb])
             pdb.addEntry(entry)
         poslist[:] = keptpos    # In-place update so self.list['PosPep'] only retains the positive peptides
         self.printLog('\r#PEP','Processing positive peptides (%s) complete: %s Pos; %s Neg.' % (protease,rje.iLen(poslist),rje.iLen(neglist)))
         ### ~ [2] ~ Save Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pdb.saveToFile(pfile)
         for (fastype,fpeplist) in [('positives',poslist),('negatives',neglist)]:
             fasfile = '%s.%s.fas' % (basefile,fastype)
             # Context manager guarantees the handle is closed even if a write fails part-way through
             with open(fasfile,'w') as FAS:
                 FAS.writelines(['>%s\n%s\n' % (pep,pep) for pep in fpeplist])
             self.printLog('#FAS','%s peptides output to %s' % (rje.iLen(fpeplist),fasfile))
         return pdb
     # Bare except kept deliberately: every failure is routed through errorLog, as elsewhere in this code.
     except: self.errorLog('Problem during %s._positiveAndNegativePeptides().' % self); return None  # Setup failed
예제 #21
0
    def runXML(self):  ### Generates experiment, run and submission XML files
        '''
        Generates experiment, run and submission XML files from the sequencing data files under the current directory.
        Top-level directories (optionally restricted to self.list['DirList']) are scanned for data files; for
        DataType 'pacbio' only '.h5' and '.xml' files are kept. Each kept path is read as parent/experiment/run/...:
        the experiment alias is the first two '.'-separated parts of the experiment directory name and each distinct
        experiment/run pair gets the alias '<expalias>-<N>', numbered in order of first appearance.
        Writes <basefile>.exp.xml, <basefile>.run.xml and <basefile>.xml (submission). Study/sample accessions, title
        and platform details are hard-coded in the XML templates below.
        << None. Errors are reported through self.errorLog().
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datatype = self.getStr('DataType').lower()
            exps = {}  # Experiment alias: run directory list
            runs = {}  # Run alias: file list
            run2run = {}  # Convert runs to run aliases

            ## ~ [1a] Get Files and Directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            dirlist = rje.listDir(self, subfolders=False, folders=True, files=False, summary=True)
            filelist = rje.listDir(self, folder=os.getcwd(), subfolders=True, folders=False, files=True, summary=True)

            # Convert every path to one relative to the current directory; anything outside it is an error
            current = os.getcwd()
            curlen = len(current) + 1
            for pathlist in [dirlist, filelist]:
                for i, path in enumerate(pathlist):
                    if not path.startswith(current): raise ValueError(path)
                    pathlist[i] = path[curlen:]

            ## ~ [1b] Clean up files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            dx = len(dirlist)
            self.debug(filelist)
            self.debug(dirlist)
            self.debug(self.list['DirList'])
            if self.list['DirList']:
                dirlist = rje.listIntersect(dirlist, self.list['DirList'])
            self.printLog('#DIR', 'Process %d of %d directories' % (len(dirlist), dx))
            keepext = []
            if datatype == 'pacbio': keepext = ['.h5', '.xml']

            def keepfile(filename):
                '''Returns whether filename sits below a selected top-level directory and has a wanted extension.'''
                parts = filename.split(os.sep)
                return len(parts) >= 2 and os.path.splitext(filename)[1] in keepext and parts[0] in dirlist

            filelist[:] = [filename for filename in filelist if keepfile(filename)]
            self.printLog('#FILES', '%s files kept from %s directories' % (rje.iLen(filelist), rje.iLen(dirlist)))
            self.debug(filelist[:10])
            self.debug(filelist[-10:])

            ### ~ [2] Parse runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if datatype == 'pacbio':
                for filename in filelist:
                    self.printLog('#FILE', filename)
                    filedata = filename.split(os.sep)   # [parent directory, experiment, run, ...]
                    experiment = filedata[1]
                    expalias = '.'.join(experiment.split('.')[:2])
                    run = os.sep.join(filedata[1:3])
                    if expalias not in exps: exps[expalias] = []
                    # Assign the alias once, when the run is first seen. Recomputing it from the current number of
                    # runs for every file would give a file of an earlier run the alias of the latest run whenever
                    # files from different runs of one experiment are interleaved.
                    if run not in exps[expalias]:
                        exps[expalias].append(run)
                        run2run[run] = '%s-%d' % (expalias, len(exps[expalias]))
                    runalias = run2run[run]
                    if runalias not in runs: runs[runalias] = []
                    runs[runalias].append(filename)
                    self.printLog('#PARSE', '%s - %s: (%d) %s' % (expalias, runalias, len(runs[runalias]), filename))

            ### ~ [3] Generate XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Experiment XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            efile = '%s.exp.xml' % self.baseFile()
            elines = [
                '<?xml version="1.0" encoding="UTF-8"?>',
                '<EXPERIMENT_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.experiment.xsd">',
            ]
            for experiment in rje.sortKeys(exps):
                ex = experiment[-1:]    # Last character of the alias is used as the library label in the title
                elines += [
                    '    <EXPERIMENT alias="%s" center_name="">' % experiment,
                    '        <TITLE>Cane toad whole genome sequencing - PacBio library %s</TITLE>' % ex,
                    '        <STUDY_REF accession="ERP106543"/>',
                    '        <DESIGN>',
                    '            <DESIGN_DESCRIPTION/>',
                    '            <SAMPLE_DESCRIPTOR accession="ERS2169570"/>',
                    '            <LIBRARY_DESCRIPTOR>',
                    '                <LIBRARY_NAME>%s</LIBRARY_NAME>' % experiment,
                    '                <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>',
                    '                <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>',
                    '                <LIBRARY_SELECTION>size fractionation</LIBRARY_SELECTION>',
                    '                <LIBRARY_LAYOUT>',
                    '                    <SINGLE/>',
                    '                </LIBRARY_LAYOUT>',
                    '                <LIBRARY_CONSTRUCTION_PROTOCOL></LIBRARY_CONSTRUCTION_PROTOCOL>',
                    '               </LIBRARY_DESCRIPTOR>',
                    '        </DESIGN>',
                    '        <PLATFORM>',
                    '            <PACBIO_SMRT>',
                    '                <INSTRUMENT_MODEL>PacBio RS II</INSTRUMENT_MODEL>',
                    '            </PACBIO_SMRT>',
                    '        </PLATFORM>',
                    '        <EXPERIMENT_ATTRIBUTES>',
                    '            <EXPERIMENT_ATTRIBUTE>',
                    '                <TAG>Size selection</TAG>',
                    '                <VALUE>15-50 kb</VALUE>',
                    '            </EXPERIMENT_ATTRIBUTE>',
                    '            <EXPERIMENT_ATTRIBUTE>',
                    '                <TAG>Sequencing Chemistry</TAG>',
                    '                <VALUE>P6C4</VALUE>',
                    '            </EXPERIMENT_ATTRIBUTE>',
                    '        </EXPERIMENT_ATTRIBUTES>',
                    '    </EXPERIMENT>',
                ]
            elines += ['</EXPERIMENT_SET>']
            with open(efile, 'w') as EXPXML: EXPXML.write('\n'.join(elines))
            self.printLog('#EXP', 'Experiment data saved to %s' % efile)
            ## ~ [3b] Run XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            rfile = '%s.run.xml' % self.baseFile()
            rlines = [
                '<?xml version="1.0" encoding="UTF-8"?>',
                '<RUN_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.run.xsd">',
            ]
            for experiment in rje.sortKeys(exps):
                for run in exps[experiment]:
                    runalias = run2run[run]
                    rlines += [
                        '   <RUN alias="%s" center_name="">' % runalias,
                        '    <EXPERIMENT_REF refname="%s"/>' % experiment,
                        '      <DATA_BLOCK>',
                        '        <FILES>',
                    ]
                    for filename in runs[runalias]:
                        rlines += [
                            '             <FILE filename="%s" filetype="PacBio_HDF5">' % filename,
                            '             </FILE>',
                        ]
                    rlines += [
                        '        </FILES>',
                        '      </DATA_BLOCK>',
                        '  </RUN>',
                    ]
            rlines += ['</RUN_SET>']
            with open(rfile, 'w') as RUNXML: RUNXML.write('\n'.join(rlines))
            self.printLog('#RUN', 'Run data saved to %s' % rfile)
            ## ~ [3c] Submission XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            xfile = '%s.xml' % self.baseFile()
            xlines = [
                '<?xml version="1.0" encoding="UTF-8"?>',
                '<SUBMISSION alias="%s" center_name="">' % self.baseFile(),
                '   <ACTIONS>',
                '      <ACTION>',
                '         <ADD source="%s" schema="experiment"/>' % efile,
                '      </ACTION>',
                '      <ACTION>',
                '         <ADD source="%s" schema="run"/>' % rfile,
                '      </ACTION>',
                '   </ACTIONS>',
                '</SUBMISSION>',
            ]
            with open(xfile, 'w') as SUBXML: SUBXML.write('\n'.join(xlines))
            self.printLog('#SUBXML', 'Submission XML saved to %s' % xfile)
            return
        # Bare except kept deliberately: every failure is routed through errorLog, as elsewhere in this code.
        except:
            self.errorLog('%s.runXML error' % self.prog())
예제 #22
0
    def runXML(self):      ### Generic method
        '''
        Generic method. Add description here (and arguments.)
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datatype = self.getStr('DataType').lower()
            exps = {}        # Experiment alias: run directory list
            runs = {}        # Run alias: file list
            run2run = {}    # Convert runs to run aliases

            ## ~ [1a] Get Files and Directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            dirlist = rje.listDir(self,subfolders=False,folders=True,files=False,summary=True)
            filelist = rje.listDir(self,folder=os.getcwd(),subfolders=True,folders=False,files=True,summary=True)

            current = os.getcwd()
            curlen = len(current) + 1
            for pathlist in [dirlist,filelist]:
                for i in range(len(pathlist)):
                    path = pathlist[i]
                    if path.startswith(current): pathlist[i] = path[curlen:]
                    else: raise ValueError(path)

            ## ~ [1b] Clean up files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            dx = len(dirlist)
            self.debug(filelist)
            self.debug(dirlist)
            self.debug(self.list['DirList'])
            if self.list['DirList']:
                dirlist = rje.listIntersect(dirlist,self.list['DirList'])
            self.printLog('#DIR','Process %d of %d directories' % (len(dirlist),dx))
            keepext = []
            if datatype == 'pacbio': keepext = ['.h5','.xml']
            for filename in filelist[0:]:
                ext = os.path.splitext(filename)[1]
                if len(string.split(filename,os.sep)) < 2 or ext not in keepext or string.split(filename,os.sep)[0] not in dirlist:
                    filelist.remove(filename)
            self.printLog('#FILES','%s files kept from %s directories' % (rje.iLen(filelist),rje.iLen(dirlist)))
            self.debug(filelist[:10])
            self.debug(filelist[-10:])

            ### ~ [2] Parse runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if datatype == 'pacbio':
                for filename in filelist[0:]:
                    self.printLog('#FILE',filename)
                    filedata = string.split(filename,os.sep)
                    parent = filedata[0]    # This should be a directory containing runs
                    experiment = filedata[1]
                    expalias = string.join(string.split(experiment,'.')[:2],'.')
                    run = string.join(filedata[1:3],os.sep)
                    if expalias not in exps: exps[expalias] = []
                    if run not in exps[expalias]: exps[expalias].append(run)
                    runalias = '%s-%d' % (expalias,len(exps[expalias]))
                    run2run[run] = runalias
                    runfile = filedata[-1]
                    if runalias not in runs: runs[runalias] = []
                    runs[runalias].append(filename)
                    self.printLog('#PARSE','%s - %s: (%d) %s' % (expalias,runalias,len(runs[runalias]),filename))

            ### ~ [3] Generate XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Experiment XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            efile = '%s.exp.xml' % self.baseFile()
            elines = ['<?xml version="1.0" encoding="UTF-8"?>','<EXPERIMENT_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.experiment.xsd">']
            for experiment in rje.sortKeys(exps):
                ex = experiment[-1:]
                elines += ['    <EXPERIMENT alias="%s" center_name="">' % experiment,
                            '        <TITLE>Cane toad whole genome sequencing - PacBio library %s</TITLE>' % ex,
                            '        <STUDY_REF accession="ERP106543"/>',
                            '        <DESIGN>',
                            '            <DESIGN_DESCRIPTION/>',
                            '            <SAMPLE_DESCRIPTOR accession="ERS2169570"/>',
                '            <LIBRARY_DESCRIPTOR>',
                '                <LIBRARY_NAME>%s</LIBRARY_NAME>' % experiment,
                '                <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>',
                '                <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>',
                '                <LIBRARY_SELECTION>size fractionation</LIBRARY_SELECTION>',
                '                <LIBRARY_LAYOUT>',
                '                    <SINGLE/>',
                '                </LIBRARY_LAYOUT>',
                '                <LIBRARY_CONSTRUCTION_PROTOCOL></LIBRARY_CONSTRUCTION_PROTOCOL>',
                '               </LIBRARY_DESCRIPTOR>',
                '        </DESIGN>',
                '        <PLATFORM>',
                '            <PACBIO_SMRT>',
                '                <INSTRUMENT_MODEL>PacBio RS II</INSTRUMENT_MODEL>',
                '            </PACBIO_SMRT>',
                '        </PLATFORM>',
                '        <EXPERIMENT_ATTRIBUTES>',
                '            <EXPERIMENT_ATTRIBUTE>',
                '                <TAG>Size selection</TAG>',
                '                <VALUE>15-50 kb</VALUE>',
                '            </EXPERIMENT_ATTRIBUTE>',
                '            <EXPERIMENT_ATTRIBUTE>',
                '                <TAG>Sequencing Chemistry</TAG>',
                '                <VALUE>P6C4</VALUE>',
                '            </EXPERIMENT_ATTRIBUTE>',
                '        </EXPERIMENT_ATTRIBUTES>',
                '    </EXPERIMENT>']
            elines += ['</EXPERIMENT_SET>']
            open(efile,'w').write(string.join(elines,'\n'))
            self.printLog('#EXP','Experiment data saved to %s' % efile)
            ## ~ [3b] Run XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            rfile = '%s.run.xml' % self.baseFile()
            rlines = ['<?xml version="1.0" encoding="UTF-8"?>','<RUN_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.run.xsd">']
            for experiment in rje.sortKeys(exps):
                for run in exps[experiment]:
                    runalias = run2run[run]
                    rlines += ['   <RUN alias="%s" center_name="">' % runalias,'    <EXPERIMENT_REF refname="%s"/>' % experiment,
                            '      <DATA_BLOCK>','        <FILES>']
                    for filename in runs[runalias]:
                        rlines += ['             <FILE filename="%s" filetype="PacBio_HDF5">' % filename,'             </FILE>']
                    rlines += ['        </FILES>','      </DATA_BLOCK>','  </RUN>']
            rlines += ['</RUN_SET>']
            open(rfile,'w').write(string.join(rlines,'\n'))
            self.printLog('#RUN','Run data saved to %s' % rfile)
            ## ~ [3c] Submission XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            xfile = '%s.xml' % self.baseFile()
            xlines = ['<?xml version="1.0" encoding="UTF-8"?>','<SUBMISSION alias="%s" center_name="">' % self.baseFile(),
                        '   <ACTIONS>','      <ACTION>','         <ADD source="%s" schema="experiment"/>' % efile,
                        '      </ACTION>','      <ACTION>','         <ADD source="%s" schema="run"/>' % rfile,
                        '      </ACTION>','   </ACTIONS>','</SUBMISSION>']
            open(xfile,'w').write(string.join(xlines,'\n'))
            self.printLog('#SUBXML','Submission XML saved to %s' % xfile)
            return
        except: self.errorLog('%s.method error' % self.prog())
예제 #23
0
 def seqSubset2(self):  ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''
     Extracts sequence subset from MOUSE cDNA and Peptide libraries.

     [1]/[2] Loads the 'map' table (keyed on 'Ingolia') from *.map.tdt if that file exists. Otherwise the genes of the
     'starts' table are mapped with self.obj['Map'].bestMap() and the 'xref' table onto EnsEMBL IDs,
     rje_seqlist.SeqList is run to generate the Ingolia.*.all.fa subsets and the 'map' table is built from scratch.
     If the 'map' table has no 'ENST' field, one is added and filled with the comma-separated first word of each
     Ingolia.cdna.all.fa sequence name whose 'gene:' tag matches the entry's EnsEMBL ID; the table is then saved.
     [3] For each 'starts' entry, the mapped transcripts are scanned for one whose 7 nt starting at
     ('Init Codon [nt]' - 3) equal the upper-cased 'Init Context [-3 to +4]'. The transcript giving the longest
     translation (up to the first '*') is kept and, if that peptide has at least 20 residues, written to
     IngExact.cdna.all.fa and IngExact.pep.all.fa. The updated table is saved as *.mapped_exact.tdt.

     Returns None. Any exception is passed to self.errorLog() rather than re-raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         mapfile = '%s.map.tdt' % self.baseFile()
         if os.path.exists(mapfile):
             mdb = self.db().addTable(mapfile, mainkeys=['Ingolia'], name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             # This used to call `db.addTable(...)`, but no `db` name is defined in this method. The table is added
             # through self.db(), matching the addTable() call above and the self.db('xref') lookup below.
             self.db().addTable(xfile, mainkeys=['Gene'], name='xref')
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log, self.cmd_list)
             self.obj['Map'].loadData(['sourcedata=%s' % xfile, 'aliases=%s' % afile])
             # Upper-case, whitespace-split list of every 'Gene' value indexed in the 'starts' table
             ing_genes = ' '.join(self.db('starts').index('Gene').keys()).upper().split()
             genemap = self.obj['Map']
             ing_map = {}
             for gene in ing_genes:
                 ing_map[gene] = genemap.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog('#MUSG', '%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:    # Loop over a copy: unmapped genes are removed from ing_mgi itself
                 if gene not in xdb.data():
                     self.printLog('#MAP', 'Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene)
                     ing_mgi.remove(gene)
             self.printLog('#BAD', 'Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
             with open('ingolia.bad.txt', 'w') as BAD:
                 BAD.write(' '.join(bad_genes))
             ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi), 'EnsEMBL', sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog('#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes), rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError
             self.deBug(ing_musg[:10])
             for stype in ['cdna', 'pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 # NOTE(review): this tests for the *input* library (seqfile), not the seqout file that gets written,
                 # so without Force the subset is only made when the input is missing - confirm intent.
                 if self.getBool('Force') or not os.path.exists(seqfile):
                     seqout = 'Ingolia.%s.all.fa' % stype
                     # NOTE(review): 'autload=T' is possibly a typo for 'autoload=T'; left unchanged - confirm.
                     seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'seqout=%s' % seqout, 'autofilter=T', 'autload=T',
                                               'seqmode=file', 'gooddesc=%s' % ','.join(ing_musg)]
                     rje_seqlist.SeqList(self.log, seqcmd)
             # The key was misspelt 'Ignolia', which is not one of the table's fields; the saved table is reloaded
             # above with mainkeys=['Ingolia'] and looked up by Ingolia gene below.
             mdb = self.db().addEmptyTable('map', ['Ingolia', 'Gene', 'EnsEMBL'], ['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia': gene, 'Gene': ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         seqfile = 'Ingolia.cdna.all.fa'
         # NOTE(review): 'autload=T' as above - possibly meant to be 'autoload=T'; left unchanged.
         seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'autofilter=F', 'autload=T', 'seqmode=file']
         iseq = rje_seqlist.SeqList(self.log, seqcmd)
         if 'ENST' not in mdb.fields():
             mdb.addField('ENST', evalue='')
             while iseq.nextSeq():
                 (iname, icdna) = iseq.getSeq()
                 musg = rje.matchExp(r'gene:(\S+)', iname)[0]
                 for entry in mdb.indexEntries('EnsEMBL', musg):
                     # Build a comma-separated list of transcript IDs (first word of sequence name)
                     if entry['ENST']: entry['ENST'] += ',%s' % iname.split()[0]
                     else: entry['ENST'] = iname.split()[0]
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]': 'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST')
         sdb.addField('ENSP')
         sdb.addField('ENSI')
         ex = 0.0            # Progress counter (incremented by 100 per entry so ex/etot is a percentage)
         etot = sdb.entryNum()
         sx = 0              # Number of entries output
         fx = 0              # Number of entries that failed to map
         minpep = 20         # Minimum length of translated peptide to output
         # Context managers make sure both output files are closed even if mapping raises an error
         with open('IngExact.cdna.all.fa', 'w') as ENST, open('IngExact.pep.all.fa', 'w') as ENSP:
             for entry in sdb.entries():
                 self.progLog('\r#ING', 'Mapping Ignolia Harrington Starts: %.2f%%' % (ex / etot))
                 ex += 100.0
                 entry[icon] = entry[icon].upper()
                 gene = entry['Gene'].upper()
                 mentry = mdb.data(gene)
                 entry['ENST'] = entry['ENSI'] = ''
                 cdnaseq = peptseq = ''
                 if not mentry or not mentry['ENST']:
                     fx += 1
                     continue
                 mtype = 'fail'
                 for trans in mentry['ENST'].split(','):
                     (tname, tseq) = iseq.getDictSeq(trans, format='tuple')
                     context = tseq[entry[icod] - 3:][:7]    # 7 nt window compared with the reported start context
                     self.deBug('%s vs %s' % (context, entry[icon]))
                     if context == entry[icon]:
                         ipept = rje_sequence.dna2prot(tseq[entry[icod]:]).split('*')[0]
                         self.deBug(ipept)
                         if len(ipept) > len(peptseq):   # Keep the transcript with the longest translation
                             entry['ENST'] = trans
                             cdnaseq = tseq
                             peptseq = ipept
                             mtype = 'exact'
                 if not entry['ENST']:
                     self.printLog('\r#ING', 'Unable to find Harrington start for %s %s (%s)' % (gene, entry[icod], entry[icon]), screen=False)
                     fx += 1
                     continue
                 elif len(peptseq) < minpep:
                     self.printLog('\r#ING', 'Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene, entry[icod], entry[icon]), screen=False)
                     fx += 1
                     continue
                 ingid = rje.preZero(int(ex / 100), etot)
                 entry['ENSI'] = 'ENSINGT%s' % ingid
                 entry['ENSP'] = 'ENSINGP%s' % ingid
                 ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n'
                            % (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], entry['Gene'], mentry['Gene'], cdnaseq))
                 ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n'
                            % (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], ingid, entry['Gene'], mentry['Gene'], peptseq))
                 sx += 1
         sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         self.printLog('\r#ING', 'Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx), rje.iStr(fx)))
         return
     except:
         self.errorLog('%s.method error' % self)
예제 #24
0
    def peptAlign(self,regex=None,peptides=[],peptdis=None,termini=None,save=False): ### Align peptides using regular expression
        '''
        Align peptides using regular expression.
        >> regex:str [None] = Regular expression to use for alignment of peptides. If not given, the PeptAlign setting
            is used; 'T'/'TRUE' selects SLiM-free alignment by gap insertion. If neither is set, a copy of the
            peptides list is returned unaligned.
        >> peptides:list [] = List of peptides to align using regex. self.list['Peptides'] is used if empty.
            NOTE: peptides rejected during setup (no match to chosen split regex, or exceeding MaxGapX) are removed
            from this list in place.
        >> peptdis:str [None] = Peptide distance method to use first ('id', 'prop' or 'pam'). PeptDis setting if None.
        >> termini:bool [None] = Whether peptides for alignment have termini (^ & $) or X flanking regex match.
        >> save:bool [False] = Whether to save aligned peptides to *.aligned.txt
        << singletons:list of aligned peptides (one variant per retained peptide)
        '''
        # The [] default for peptides is never mutated: an empty list is always replaced by self.list['Peptides'].
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not self.obj['PeptDis']: self.setup()
            failx = 0       # Number of failures
            ## ~ [0a] Setup method attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not peptides: peptides = self.list['Peptides']
            if not regex:
                if self.getStrLC('PeptAlign'): regex = self.getStrUC('PeptAlign')
                else: return peptides[0:]
            if termini is None: termini = self.getBool('Termini')
            if not peptdis: peptdis = self.getStrLC('PeptDis')
            ## ~ [0b] Setup SLiM and alignment attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if regex in ['T','TRUE']:   # SLiM-free alignment
                maxlen = 0; maxgapx = 0
                slimvar = {}    # Dictionary of {gapx:[lists of gap positions]}
                for pept in peptides: maxlen = max(maxlen,len(pept))
                for pept in peptides: maxgapx = max(maxgapx,maxlen-len(pept))
                maxgapx = min(maxgapx,self.getInt('MaxGapX'))
                for gapx in range(1,maxgapx+1):
                    slimvar[gapx] = []
                    peptlen = maxlen - gapx
                    gappos = list(range(peptlen))       # Possible gap positions in a peptide gapx shorter than maxlen
                    if termini: gappos = gappos[1:-1]   # No gaps outside the terminal positions
                    gapcombos = rje.listCombos([gappos] * gapx,maxrep=self.getInt('MaxGapVar'))
                    for gapvar in gapcombos[0:]:
                        gapvar.sort(reverse=True)   # Largest position first, so earlier insertions do not shift later ones
                        if gapvar not in slimvar[gapx]: slimvar[gapx].append(gapvar)
                    self.printLog('#GAPX','PeptLen: %d; MaxLen: %d; Termini: %s => %s x %d gap variants.' % (peptlen,maxlen,termini,rje.iLen(slimvar[gapx]),gapx))
            else:
                #!# Need to deal with multiple regex?! (Use one with most matches and only keep that one?!)
                if rje_slim.needToSplitPattern(regex):
                    splits = rje_slim.splitPattern(regex)
                    self.printLog('#SPLIT','%s => %s' % (regex,' | '.join(splits)))
                    newregex = ''; bestpep = []
                    for regsplit in splits:     # Keep the split pattern matched by most peptides
                        regexpep = []
                        for pept in peptides:
                            if termini and rje.matchExp('(%s)' % regsplit,pept[1:-1]): regexpep.append(pept)
                            elif not termini and rje.matchExp('(%s)' % regsplit,pept): regexpep.append(pept)
                        if len(regexpep) > len(bestpep): bestpep = regexpep[0:]; newregex = regsplit
                    self.printLog('#REGEX','%s => %s (%d/%d peptides)' % (regex,newregex,len(bestpep),len(peptides)))
                    regex = newregex
                    for pept in peptides[0:]:
                        if pept not in bestpep: self.warnLog('%s does not match %s!' % (pept,regex)); peptides.remove(pept); failx += 1
                slim = rje_slim.slimFromPattern(regex)
                self.printLog('#GUIDE','SLiM Guide: %s' % slim)
                slimpos = slim.split('-')
                maxlen = rje_slim.slimLen(slim)
                if regex.startswith('^'): maxlen -= 1
                if regex.endswith('$'): maxlen -= 1
                slimvar = []    # Make variants of SLiMs (wildcard spacers only)
                w = 1
                while len(slimpos) > w: slimvar.append(slimpos[w]); w += 2  # Add wildvar spacers only
                maxvar = []
                for var in slimvar: maxvar.append(var[-1])   # Last character of each spacer: used as the number of positions to pad to
                slimvar = rje.listCombos(slimvar)   # Returns all possible combinations, used for building variants
                if termini: maxlen += 2
            ## ~ [0c] Setup Peptide Distance methods ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            dismethods = ['id','prop','pam']
            if peptdis in dismethods:   # Move chosen method to front of list
                dismethods.remove(peptdis); dismethods.insert(0,peptdis)
            elif peptdis: self.warnLog('PeptDis method "%s" not recognised.' % peptdis)

            ### ~ [1] Cycle through peptides and make variants ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            variants = {}   # Dictionary of {peptide:[variants]}
            singletons = [] # Peptides with single variants
            for pept in peptides[0:]:
                self.progLog('\r#VAR','%s peptides: %s singletons; %s with possible variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
                variants[pept] = []
                ## ~ [1a] ~ Make list of peptide length variants adding - at all possible positions ~ ##
                if regex in ['T','TRUE']:   # SLiM-free alignment
                    gapx = maxlen - len(pept)
                    if gapx > self.getInt('MaxGapX'):
                        self.warnLog('Peptide %s exceeds MaxGapX=%d; rejected.' % (pept,self.getInt('MaxGapX')))
                        peptides.remove(pept)
                        # Drop the empty placeholder too: left in place, section [2] would try to pick the best of
                        # zero variants and fail with an IndexError.
                        variants.pop(pept)
                        continue
                    self.bugPrint(slimvar)
                    self.debug('%s: %s vs %s = %d' % (pept,len(pept),maxlen,gapx))
                    if gapx:    # Try all gap combinations
                        for gapvar in slimvar[gapx]:
                            peptvar = pept
                            for gap in gapvar: peptvar = peptvar[:gap] + '-' + peptvar[gap:]
                            if peptvar not in variants[pept]: variants[pept].append(peptvar)
                    else: variants[pept] = [pept]
                ## ~ [1b] ~ Make list of peptide length variants, adding - to regex wildvar positions ##
                else:
                    for var in slimvar:
                        peptvar = ''    # Add new variant
                        i = 0
                        if termini: peptvar += pept[i]; i += 1
                        if regex[0] != '^': peptvar += pept[i]; i += 1
                        for wi in range(len(var)):
                            wy = int(maxvar[wi]); wx = int(var[wi])
                            if wx: peptvar += pept[i:i+wx]; i += wx
                            peptvar += '-' * (wy - wx)  # Add a number of gaps equal to maxvar for same position minus slimvar
                            if i >= len(pept): break
                            peptvar += pept[i]; i += 1
                        # Keep variants that match regex and maxlen
                        if termini:
                            if regex[-1] != '$':
                                if i < len(pept): peptvar += pept[i]
                                i += 1
                            rmatch = rje.matchExp('(%s)' % regex,peptvar[1:-1])
                            self.bugPrint('%s vs %s: %s' % (regex,peptvar[1:-1],rmatch))
                            self.debug('%s vs %s and %s vs %s' % (i,len(pept),len(peptvar),maxlen))
                            keepvar = i == len(pept) and rmatch and len(peptvar) == maxlen
                        else: keepvar = i == len(pept) and rje.matchExp('(%s)' % regex,peptvar) and len(peptvar) == maxlen
                        self.bugPrint('%s %s: %s x %s => %s = %s' % (regex,slim,var,pept,peptvar,keepvar))
                        if keepvar and peptvar not in variants[pept]: variants[pept].append(peptvar)
                ## ~ [1c] ~ Check Peptide variants ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if len(variants[pept]) == 1: singletons.append(variants.pop(pept)[0])
                elif not variants[pept]: self.warnLog('No %s variants match %s!' % (pept,regex)); variants.pop(pept); failx += 1
            self.printLog('#VAR','%s peptides: %s singletons; %s with possible variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
            self.debug(singletons)
            self.debug(variants)

            ### ~ [2] ~ Sort peptides by increasing numbers of variants ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # For 2+ variants, rank by mean PeptDist versus single variants
            # Keep best (including ties) and cycle
            # Iterate until no more variants filtered
            # If variants remain, switch score method and iterate again
            # If variants remain after all score methods, keep first variant
            peptdis = dismethods.pop(0)
            while variants:
                self.progLog('\r#VAR','%s peptide tidy: %s singletons; %s with variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
                if self.obj['PeptDis']: self.obj['PeptDis'].dict['Matrix'] = {}
                comppept = singletons[0:]
                prevarx = 0; postvarx = 0
                if not comppept: comppept = rje.listJoin(variants.values(),sortunique=True)    # No singletons: compare to all variants
                for pept in rje.sortKeys(variants):
                    self.progLog('\r#VAR','%s peptide tidy: %s singletons; %s with variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
                    scores = {}; prevarx += len(variants[pept])
                    for peptvar in variants[pept]:
                        dis = 0.0
                        for pep2 in comppept:
                            if termini: dis += self.peptDist(peptvar[1:-1],pep2[1:-1],peptdis)
                            else: dis += self.peptDist(peptvar,pep2,peptdis)
                        dis /= len(comppept)
                        if dis not in scores: scores[dis] = []
                        scores[dis].append(peptvar)
                    variants[pept] = scores.pop(rje.sortKeys(scores)[0])    # Keep lowest scoring variant(s)
                    if len(variants[pept]) == 1: singletons.append(variants.pop(pept)[0])
                    else: postvarx += len(variants[pept])
                self.printLog('#PDIS','%s distances: %s => %s variants.' % (peptdis,prevarx,postvarx))
                if prevarx == postvarx:     # No variants filtered this round: try next method or give up
                    if dismethods: peptdis = dismethods.pop(0)
                    else: break
            self.printLog('#VAR','%s peptides tidied: %s singletons; %s with variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
            if variants:
                self.warnLog('Unable to select all variants using distances.')
                self.printLog('#VAR','Arbitrary variants picked for %s peptides' % rje.iLen(variants))
                for pept in rje.sortKeys(variants): singletons.append(variants.pop(pept)[0])

            ### ~ [3] Remove 100% gapped positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for i in range(maxlen-1,-1,-1):     # Work backwards so that deletions do not shift remaining positions
                degap = True
                for pept in singletons:
                    if pept[i] != '-': degap = False; break
                if degap: singletons = [pept[:i] + pept[i+1:] for pept in singletons]

            ### ~ [4] Save and/or return peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if save:
                with open('%s.aligned.txt' % self.baseFile(),'w') as OUT: OUT.write('\n'.join(singletons))
                self.printLog('#OUT','%s aligned peptides output to %s.aligned.txt' % (rje.iLen(singletons),self.baseFile()))
            return singletons

        except: self.errorLog('%s.peptAlign error' % self); raise
예제 #25
0
 def forking(self):  ### Keeps forking out and processing jobs until no more jobs in self.list['Forked'].
     '''
     Keeps forking out and processing jobs until no more jobs in self.list['Forked'].

     Each cycle polls every entry of self.list['Forked']: an entry is treated as finished if its PID text starts with
     'WAIT', if os.waitpid(pid, os.WNOHANG) returns a positive first value, or if polling raises an error (which is
     logged). Finished entries are removed from the list and passed to self.endFork().
     If more than KillForks seconds have passed since KillTime, the main thread is killed (ValueError), the
     remaining forks are killed, or KillTime is reset, depending on settings and/or user choice.
     The method then sleeps for ForkSleep seconds before the next cycle.
     '''
     ### ~ [1] ~ Set up optional *.pid file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     pidcheck = None
     if self.getBool('PIDCheck') or self.dev():
         pidcheck = '%s.pid' % rje.baseFile(self.log.info['LogFile'])    # *.pid file named after the log file
     ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     while self.list['Forked']:
         if not self.getBool('LogFork'):
             progtxt = 'Forking jobs: {0} running; {1} remain.'.format(len(self.list['Forked']), rje.iLen(self.list['ToFork']))
             self.progLog('\r#FORK', progtxt)
         if pidcheck: PIDCHECK = open(pidcheck, 'w')     # Rewritten from scratch every cycle
         for fdict in self.list['Forked'][0:]:           # Copy of list, as finished forks are removed
             try:
                 pid = fdict['PID']
                 if pidcheck:
                     pidline = '%s: %s\n' % (self.list['Forked'].index(fdict), pid)
                     PIDCHECK.write(pidline)
                 pidword = ('%s' % pid).split()[0]
                 if pidword == 'WAIT': status = 1        # Placeholder entry: treated as finished
                 else: (status, exit_stat) = os.waitpid(pid, os.WNOHANG)
             except:
                 self.errorLog('!')
                 status = 1                              # Polling failed: treat fork as finished
             if status > 0:
                 self.list['Forked'].remove(fdict)
                 self.endFork(fdict)                     # Fork has finished: can replace with processing
         if pidcheck: PIDCHECK.close()
         ## ~ [2a] Look for eternal hanging of threads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         idle = time.time() - self.getNum('KillTime')
         if idle > self.getNum('KillForks'):
             hangtxt = '\n%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'), len(self.list['Forked']))
             self.verbose(0, 1, hangtxt, 1)
             for fdict in self.list['Forked']:
                 self.verbose(0, 2, ' => Fork %s, PID %d still Active!' % (fdict['ID'], fdict['PID']), 1)
             # Kill the main thread without asking when non-interactive (i < 0) and KillMain is set; otherwise ask
             killmain = self.i() < 0 and self.getBool('KillMain')
             if not killmain:
                 killmain = rje.yesNo('Kill Main Thread?', default={True: 'N', False: 'Y'}[self.getBool('KillMain')])
             if killmain:
                 raise ValueError('%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'), len(self.list['Forked'])))
             if self.i() < 0 or rje.yesNo('Kill hanging forks?'):
                 self.printLog('#KILL', 'KillForks=%d seconds walltime reached.' % (self.getNum('KillForks')))
                 for fdict in self.list['Forked']:
                     self.printLog('#KILL', 'Killing Fork %s, PID %d.' % (fdict['ID'], fdict['PID']))
                     os.system('kill %d' % fdict['PID'])
             else:
                 self.setNum({'KillTime': time.time()})  # Restart the inactivity clock
         ## ~ [2b] Sleep ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         time.sleep(self.getNum('ForkSleep'))
예제 #26
0
 def _peptideProbabilities(self):    ### Read in peptides and positives and calculate probability of return
     '''
     Read in peptides and positives and calculate probability of return.

     If CysWeight is set, the result of self._cysteinePeptideProbabilities() is returned instead. Otherwise, an
     existing *.pep_prob.tdt file is loaded (unless Force is set) or the Positives sequences are digested with the
     PepCut protease and each unique fragment is counted as Positive (found in the Peptides file) or Negative, binned
     by peptide length and - if PepMWt is set - by molecular weight (stored under PepSize keys of 100.0 x bin).
     << pdb: 'PepProb' table with fields PepSize, Positive, Negative and Prob. False is returned if the Peptides or
        Positives file is missing and None if an error occurs (reported through self.errorLog()).
     '''
     try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getBool('CysWeight'): return self._cysteinePeptideProbabilities()
         self._positiveAndNegativePeptides()
         pfile = '%s.pep_prob.tdt' % self.basefile()
         if rje.exists(pfile) and not self.getBool('Force'):
             try:
                 pdb = self.db().addTable(pfile,['PepSize'],name='PepProb')
                 pdb.dataFormat(reformat={'PepSize':'num','Positive':'int','Negative':'int','Prob':'num'})
                 for entry in pdb.entries():
                     if entry['PepSize'] < 100: entry['PepSize'] = int(entry['PepSize'])    # Length bins are integers
                 return pdb
             except Exception:   # Best effort only: fall through and regenerate, but leave a trace in the log
                 self.printLog('#PEP','Failed to load %s: regenerating peptide probabilities.' % pfile)
         pdb = self.db().addEmptyTable('PepProb',['PepSize','Positive','Negative','Prob'],['PepSize'])
         if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
         ## ~ [0a] ~ Load Peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
         pepset = set(peplist)   # Constant-time lookup in place of repeated list searches
         ## ~ [0b] ~ Load Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
         ### ~ [1] ~ Digest Positives and Update PepProb Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         protease = self.getStr('PepCut')
         maxpeplen = self.getInt('MaxPepLen')
         pepmwt = self.getBool('PepMWt')
         ## ~ [1a] ~ Create new database entry to fill with data ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         edict = {}  # Dictionary of {PepSize bin:table entry}
         for i in range(1,maxpeplen+1):
             edict[i] = pdb.addEntry({'PepSize':i,'Positive':0,'Negative':0,'Prob':1.0})
             if pepmwt: edict[i*100.0] = pdb.addEntry({'PepSize':i*100.0,'Positive':0,'Negative':0,'Prob':1.0})
         ## ~ [1b] ~ For each recognition site of each protease, mark cuts with ":" ~~~~~~~~ ##
         poslist = []; seen = set()  # Unique digest fragments in order of first appearance
         sx = 0.0; stot = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
             sequence = seq.getSequence()
             # Replace each occurrence of a recognition site with the same site containing its ":" cut marker
             for cut in proteases[protease]: sequence = cut.join(sequence.split(cut.replace(':','')))
             frag = [pep for pep in sequence.split(':') if pep]
             if not self.getBool('NTerm'): frag = frag[1:]   # Drop first fragment unless NTerm is set
             for pep in frag:
                 if self.getBool('NRPep') and pep in self.list['Redundant']: continue
                 if pep not in seen: seen.add(pep); poslist.append(pep)
         self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
         ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         px = 0.0; ptot = len(poslist)
         for pep in poslist:
             self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
             field = 'Positive' if pep in pepset else 'Negative'
             edict[min(len(pep),maxpeplen)][field] += 1      # Lengths above MaxPepLen share the top bin
             if pepmwt:
                 pwt = 100.0 * min(int((rje_sequence.MWt(pep)+99)/100.0),maxpeplen)     # Weight rounded up to next 100
                 edict[pwt][field] += 1
         self.printLog('\r#PEP','Processing positive peptides (%s) complete.' % protease)
         ## ~ [1d] # Calculate peptide probabilities for protease combo ~~~~~~~~~~~~~~~~~~~~ ##
         for entry in edict.values():
             total = entry['Positive'] + entry['Negative']
             if total: entry['Prob'] = float(entry['Positive']) / float(total)
             else: entry['Prob'] = 0.0   # Empty bin
         ### ~ [2] ~ Save File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pdb.saveToFile(pfile)
         return pdb
     except: self.errorLog('Problem during %s._peptideProbabilities().' % self); return None  # Setup failed
예제 #27
0
 def pileUpStats(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
     '''
     Calculates statistics of genetic differences from parsed PileUp Tables.

     Reads the per-position tables `<basefile>.WT.tdt` and `<basefile>.Mut.tdt`, compares the frequency of the
     WT major allele in the mutant data against its frequency in the WT data, and writes one row per reported
     position to `<basefile>.pdiff.tdt` with p.Over, p.Under and p.Diff columns. self.pileUpFDR() is then called.

     Reads: self.dict['RefSeq'] (locus -> reference sequence) and self.dict['WTMajor'] (locus -> per-position
     WT major alleles), plus the 'MajDif' and 'MajMut' boolean settings.
     Updates: self.dict['RefSeq'][locus] is extended (padded with '?') or has '?' positions filled in from the
     third column of the Mut table. self.str['RefSeq'] and self.list['WTMajor'] are used as per-locus scratch
     values and are left set to the last locus processed.

     << Returns the result of self.pileUpFDR() if an existing *.pdiff.tdt is reused (file exists and not
        self.force()); otherwise returns None, including when an error is caught and logged.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         statfile = '%s.pdiff.tdt' % self.baseFile()
         if not self.force() and os.path.exists(statfile): return self.pileUpFDR()
         ## ~ [0a] Load WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         wtdata = {}     # {locus:{field:[value per position]}}; list index i holds position i+1
         for locus in self.dict['RefSeq']:
             wtdata[locus] = {'N':[],'QN':[],'MajFreq':[]}
         fields = []     # Column headers, taken from the first line of the file
         with open('%s.WT.tdt' % self.baseFile(),'r') as WTDATA:
             for line in WTDATA:
                 data = rje.readDelimit(line)
                 if not fields: fields = data[0:]; continue
                 locus = data[0]
                 pos = int(data[1])
                 wtlocus = wtdata[locus]
                 # Pad positions missing from the table (no mapped reads). The next expected position is derived
                 # from this locus' own list so that the padding is correct for every locus, not just the first.
                 while pos > len(wtlocus['N']) + 1:
                     wtlocus['N'].append(0); wtlocus['QN'].append(0); wtlocus['MajFreq'].append(0.0)
                 for field in ['N','QN']: wtlocus[field].append(int(data[fields.index(field)]))
                 wtlocus['MajFreq'].append(float(data[fields.index('MajFreq')]))
         ## ~ [0b] Load Mut Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         mutdata = {}    # Same layout as wtdata, with the extra Major and WTFreq fields
         for locus in self.dict['RefSeq']:
             mutdata[locus] = {'N':[],'QN':[],'Major':[],'MajFreq':[],'WTFreq':[]}
         fields = []
         with open('%s.Mut.tdt' % self.baseFile(),'r') as MUTDATA:
             for line in MUTDATA:
                 data = rje.readDelimit(line)
                 if not fields: fields = data[0:]; continue
                 locus = data[0]
                 self.str['RefSeq'] = self.dict['RefSeq'][locus]
                 pos = int(data[1])
                 # Extend the stored reference sequence, or fill in an unknown ('?') position, using data[2].
                 try:
                     if pos > len(self.str['RefSeq']):
                         while (pos-1) > len(self.str['RefSeq']): self.str['RefSeq'] += '?'
                         self.str['RefSeq'] += data[2]
                         self.dict['RefSeq'][locus] = self.str['RefSeq']
                     elif self.str['RefSeq'][pos-1] == '?':
                         self.str['RefSeq'] = self.str['RefSeq'][:pos-1] + data[2] + self.str['RefSeq'][pos:]
                         self.dict['RefSeq'][locus] = self.str['RefSeq']
                 except Exception: self.warnLog('Problem mapping Pos %s onto %snt %s RefSeq' % (rje.iStr(pos),locus,rje.iLen(self.str['RefSeq'])))
                 mutlocus = mutdata[locus]
                 # Per-locus padding of positions missing from the table (see WT loading above).
                 while pos > len(mutlocus['N']) + 1:
                     mutlocus['N'].append(0); mutlocus['QN'].append(0); mutlocus['Major'].append('-')
                     mutlocus['MajFreq'].append(0.0); mutlocus['WTFreq'].append(0.0)
                 for field in ['N','QN']: mutlocus[field].append(int(data[fields.index(field)]))
                 for field in ['MajFreq','WTFreq']: mutlocus[field].append(float(data[fields.index(field)]))
                 mutlocus['Major'].append(data[fields.index('Major')])
         ## ~ [0c] Integrity check ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #!# Need a new check with locus info #!#
         #!# A simple length comparison against RefSeq will not hold: not all reference genome positions are
         #!# present in the tables (0 mapped reads), so trailing positions can be missing from the lists.
         ### ~ [1] Assess and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         headers = ['Locus','Pos','Ref','WT.N','WT.QN','WT.Major','WT.MajFreq','Mut.N','Mut.QN','Mut.Major','Mut.MajFreq','Mut.WTFreq','p.Over','p.Under','p.Diff']
         majdif = self.getBool('MajDif'); majmut = self.getBool('MajMut')
         nodifx = 0; nomutx = 0; sx = 0
         with open(statfile,'w') as SAMSIG:
             SAMSIG.write('%s\n' % '\t'.join(headers))
             for locus in rje.sortKeys(self.dict['RefSeq']):
                 self.str['RefSeq'] = self.dict['RefSeq'][locus]
                 self.list['WTMajor'] = self.dict['WTMajor'][locus]
                 for i in range(len(self.str['RefSeq'])):
                     # Any missing value for this position (e.g. lists shorter than RefSeq) means no output row.
                     try:
                         wtlocus = wtdata[locus]; mutlocus = mutdata[locus]
                         ref = self.str['RefSeq'][i]; wtmajor = self.list['WTMajor'][i]
                         wtfreq = wtlocus['MajFreq'][i]
                         mutqn = mutlocus['QN'][i]; mutmajor = mutlocus['Major'][i]; mutwtfreq = mutlocus['WTFreq'][i]
                         sigdata = [locus,i+1,ref,wtlocus['N'][i],wtlocus['QN'][i],wtmajor,wtfreq,
                                    mutlocus['N'][i],mutqn,mutmajor,mutlocus['MajFreq'][i],mutwtfreq]
                     except Exception: self.warnLog('Incomplete data for %s:%s (no pdiff output)' % (locus,rje.iStr(i+1))); continue
                     if majdif and wtmajor == mutmajor: nodifx += 1; continue   # Was: sigdata += [1.0,1.0]
                     if majmut and ref == mutmajor: nomutx += 1; continue
                     # Append [p.Over, p.Under]. rje.binomial(k,n,p) is presumably P(X >= k) - TODO confirm.
                     if not wtfreq:    # No Data for WT
                         if mutwtfreq: sigdata += [0.0,1.0]
                         else: sigdata += [1.0,1.0]
                     elif mutwtfreq > wtfreq:
                         obs = int((mutqn * mutwtfreq) + 0.5)   # Observed count, rounded to nearest integer
                         sigdata.append(rje.binomial(obs,mutqn,wtfreq,usepoisson=False,callobj=self))
                         sigdata.append(1.0)
                     elif mutwtfreq < wtfreq:
                         obs = int((mutqn * mutwtfreq) + 0.5)
                         sigdata.append(1.0)
                         sigdata.append(1.0 - rje.binomial(obs+1,mutqn,wtfreq,usepoisson=False,callobj=self))
                     else: sigdata += [1.0,1.0]
                     # p.Diff: twice the smaller of p.Over and p.Under, capped at 1.0
                     sigdata.append(min(1.0,2*min(sigdata[-2:])))
                     rje.writeDelimit(SAMSIG,sigdata); sx += 1
         ptxt = '%s lines output to *.pdiff.tdt' % rje.iStr(sx)
         if majdif: ptxt += '; %s positions skipped where WTMajor==MutMajor (majdif=T)' % rje.iStr(nodifx)
         if majmut: ptxt += '; %s positions skipped where Ref==MutMajor (majmut=T)' % rje.iStr(nomutx)
         self.printLog('#PDIFF','%s.' % ptxt)
         ### ~ [2] FDR Correction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.pileUpFDR()
     except: self.errorLog('%s.pileUpStats() error' % (self)); return None