def mapTaxa(self,taxin,taxout=None,nodeonly=False,rankonly=False,savetaxout=True):  ### Takes a list of Taxa and returns mapped Taxa data
    '''
    Takes a list of Taxa and returns mapped Taxa data.
    >> taxin:str or list of taxon identifiers to map from.
    >> taxout:str or list of taxa output formats [None -> ['spcode']].
    >> nodeonly:bool = whether to limit TaxID mapping to the precise matching nodes (else include children)
    >> rankonly:bool = whether to limit TaxID to those matching self.list['RankTypes'] taxon types.
    >> savetaxout:bool [True] = Whether to save the TaxOut list to a text file
    << taxoutlist:list of mapped taxa if taxout is a string, OR
    << taxoutdict:dict of mapped taxa if taxout is a list
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if taxout is None: taxout = ['spcode']  # Avoid shared mutable default argument
        # Duck-type test: lists sort in place; strings raise, flagging a single-format request.
        tlist = True
        try: taxout.sort()
        except: tlist = False
        if tlist:
            if not taxout: return {}
        else:
            if not taxout: return []
            taxout = [taxout]   # Normalise single format string to a one-element list
        ### ~ [2] ~ Map to TaxID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Bugfix: map the supplied taxin argument rather than always using self.list['TaxIn'].
        taxid = self.mapToTaxID(taxin,nodeonly,rankonly)
        if self.list['RestrictID']:
            tx = len(taxid)
            taxid = rje.listIntersect(taxid,self.list['RestrictID'])
            self.printLog('#TAXID','%s of %s TaxID in %s Restricted IDs.' % (rje.iLen(taxid),rje.iStr(tx),rje.iLen(self.list['RestrictID'])))
        ### ~ [3] ~ Map TaxID and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxdict = {}; taxoutdict = {}; taxoutlist = []
        # Bugfix: iterate over the requested taxout formats rather than self.list['TaxOut'].
        for tformat in taxout:
            tformat = tformat.lower()
            if tformat == 'taxid': taxoutlist = taxid
            elif tformat in ['spcode','name','common']:
                if not taxdict: taxdict = self.taxDict(taxid)   # Build TaxID data dictionary once, on demand
                taxoutlist = []
                for t in taxid:
                    try: taxoutlist.append(taxdict[t][tformat])
                    except: self.warnLog('No "%s" data for TaxID %s' % (tformat, t),'Missing_%s' % tformat,suppress=True)
                taxoutlist.sort()
            else: self.errorLog('TaxOut format "%s" not recognised' % tformat,printerror=False); continue
            taxoutdict[tformat] = taxoutlist
            if savetaxout:
                if not taxoutlist: self.printLog('#OUT','No %s IDs to output' % tformat); continue
                tfile = '%s.%s.txt' % (self.baseFile(),tformat)
                rje.backup(self,tfile)
                open(tfile,'w').write(string.join(taxoutlist,'\n'))
                self.printLog('#OUT','%s %s IDs output to %s.' % (rje.iLen(taxoutlist), tformat, tfile))
        if tlist: return taxoutdict
        return taxoutlist
    except: self.errorLog('Problem during %s mapTaxa.' % self); raise
def batchSummarise(self):   ### Batch run seqlist summarise on batchrun=LIST files and output table of results
    '''
    Batch run seqlist summarise on batchrun=LIST files and output table of results.
    << returns True on success; False if an exception was raised.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.list['BatchRun']:
            raise ValueError('Need to provide batchrun=LIST files for summarise mode.')
        db = rje_db.Database(self.log, self.cmd_list)
        self.printLog('#BASE', db.baseFile())
        # Reuse an existing summarise table unless force=T; otherwise start empty.
        sdb = None
        if not self.force():
            sdb = db.addTable(mainkeys=['File'], name='summarise', expect=False)
        if not sdb:
            sdb = db.addEmptyTable('summarise', ['File'], ['File'])
        ### ~ [2] Run Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#BATCH', 'Batch summarising %s input files' % rje.iLen(self.list['BatchRun']))
        for batchfile in self.list['BatchRun']:
            loadcmd = self.cmd_list + ['seqin=%s' % batchfile, 'autoload=T', 'summarise=F']
            seqdata = rje_seqlist.SeqList(self.log, loadcmd).summarise()
            if not seqdata:     # Guard clause: log failure and move to the next file.
                self.errorLog('Summarise failed for %s' % batchfile, printerror=False)
                continue
            # Reformat numeric fields for tidy table output.
            if 'GC' in seqdata:
                seqdata.pop('GC')
                seqdata['GCPC'] = '%.2f' % seqdata['GCPC']
            if 'GapLength' in seqdata:
                seqdata['GapPC'] = '%.2f' % (100.0 * seqdata['GapLength'] / seqdata['TotLength'])
            seqdata['MeanLength'] = '%.1f' % seqdata['MeanLength']
            # Add the preferred fields first (fixed order), then any extras returned.
            for field in string.split('SeqNum, TotLength, MinLength, MaxLength, MeanLength, MedLength, N50Length, L50Count, GapLength, GapPC, GCPC', ', '):
                if field in seqdata and field not in sdb.fields():
                    sdb.addField(field)
            for field in seqdata.keys():
                if field not in sdb.fields():
                    sdb.addField(field)
            sdb.addEntry(seqdata)
        ### ~ [3] Output Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sdb.saveToFile()
        return True
    except:
        self.errorLog('%s.batchSummarise error' % self)
        return False
def classify(self):     ### Generate summary tables for each protein class
    '''
    Generate summary tables for each protein class.
    Reads one protein accession per line from each file in self.list['Classify'], subsets the
    'taxamap' table to those proteins, and generates summary scores for the class.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        rankdb = self.db('taxamap')
        for cfile in self.list['Classify']:
            pclass = rje.baseFile(cfile,strip_path=True)    # Class name taken from file basename
            clist = []
            for fline in open(cfile,'r').readlines():
                # Bugfix: guard against blank/whitespace-only lines, which previously raised
                # IndexError on [0] and aborted the whole method via the broad except.
                fwords = string.split(rje.chomp(fline),maxsplit=1)
                if fwords and fwords[0]: clist.append(fwords[0])
            self.printLog('#CLASS','%s "%s" class proteins read from %s' % (rje.iLen(clist),pclass,cfile))
            if not clist:
                self.warnLog('No proteins read from %s' % (cfile))
                continue
            # Subset the taxamap table to this class's proteins only.
            classdb = db.copyTable(rankdb,pclass)
            classdb.dropEntriesDirect('protein',clist,inverse=True)
            if not classdb.entries():
                self.warnLog('No "%s" proteins found in TaxaMap table' % (pclass))
                continue
            self.summaryScores(classdb,pclass,'MinClass')
    except: self.errorLog('%s.classify() error' % self.prog())
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.
    Builds the ProDigIS and Source database tables, loads the input sequence files,
    cross-references sequences against the Source table, and (optionally) adds
    peptide-probability fields.
    << returns False on failure (via the except clause); otherwise returns None.
    '''
    try:### ~ [1] Setup Database ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
        # Main output table keyed on (AccNum, Protease).
        db = self.db().addEmptyTable('ProDigIS',['AccNum','Protease','PepCount'],['AccNum','Protease'])
        if self.getInt('MinPepLen') > 0: db.addField('MinPepLen')
        if self.getBool('NRPep'): db.addField('NRPep')
        # Source table: loaded from file if present, else created empty.
        if rje.exists(self.getStr('Source')):
            fdb = self.db().addTable(self.getStr('Source'),mainkeys=['AccNum'],name='Source')
            fdb.addField('File')
            fdb.addField('ProtMWt')
        else: fdb = self.db().addEmptyTable('Source',['AccNum','File','ProtMWt'],['AccNum'])
        # One field per peptide length; if PepMWt, a parallel field per length scaled x100
        # (NOTE(review): field names are int/float objects here, not strings).
        for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i)
        if self.getBool('PepMWt'):
            for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i*100.0)
        ### ~ [2] Load Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autoload=F'])
        self.obj['SeqList'].seq = fullseq = []  # Accumulates sequences from all input files
        for seqfile in self.list['SeqFiles']:
            file = rje.baseFile(seqfile,True)
            seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % seqfile,'autoload=T'])
            fullseq += seqlist.seqs()
            for seq in seqlist.seqs():
                accnum = seq.getStr('AccNum')
                # EAFP: existing Source entry is updated; a missing key falls through to addEntry.
                try:
                    entry = fdb.data()[accnum]
                    # Same accession seen in two input files: flag but let the later file win.
                    if 'File' in entry and entry['File']: self.errorLog('%s found in %s AND %s!' % (accnum,entry['File'],file),printerror=False)
                    entry['File'] = file
                    entry['ProtMWt'] = seq.MWt()
                except:
                    entry = {'AccNum':accnum,'File':file,'ProtMWt':seq.MWt()}
                    fdb.addEntry(entry)
                self.deBug(fdb.dict['Data'][seq.getStr('AccNum')])
        self.printLog('#SEQ','%s sequences to analyse in total' % rje.iLen(fullseq))
        fdb.fillBlanks()
        ### ~ [3] Setup Peptide Probabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self._peptideProbabilities():
            db.addField('LenExp','PepCount');
            if self.getBool('PepMWt'):
                db.addField('MWtExp','LenExp'); db.addField('Len7Exp','MWtExp')
            else: db.addField('Len7Exp','LenExp')
            db.addField('Len37','Len7Exp')
            if self.getBool('PepMWt'):
                db.addField('Len5','MWtExp'); db.addField('MWt5','Len5')
                db.addField('Len3','MWtExp'); db.addField('MWt3','Len3')
            else: db.addField('Len5','LenExp'); db.addField('Len3','LenExp')
        # NOTE(review): unconditional return — section [4] below is unreachable temp/debug
        # code (hard-coded file names suggest it was disabled deliberately). Confirm and remove.
        return
        ### ~ [4] Temp GABLAM Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdb = self.db().addTable('Chlam_Pos.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GABLAM')
        ndb = self.db().addTable('Chlam_Neg.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GNeg')
        self.db().mergeTables(gdb,ndb,overwrite=True,matchfields=True)
        gdb.renameField('Qry','AccNum')
        tmp = self.db().joinTables(name='blast',join=[('Source','AccNum'),('GABLAM','AccNum')],newkey=['AccNum','File'],keeptable=False)
        tmp.saveToFile()
        tmp.compress(['File'],default='mean')
        tmp.dropFields(['AccNum'])
        tmp.info['Name'] = 'blastsum'
        tmp.saveToFile()
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def batchSummarise( self ):     ### Batch run seqlist summarise on batchrun=LIST files and output table of results
    '''
    Batch run seqlist summarise on batchrun=LIST files and output table of results.
    << returns True on success; False if an exception was raised.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.list['BatchRun']:
            raise ValueError(
                'Need to provide batchrun=LIST files for summarise mode.')
        db = rje_db.Database(self.log, self.cmd_list)
        self.printLog('#BASE', db.baseFile())
        # Reuse an existing summarise table unless force=T; otherwise start a fresh one.
        sdb = None
        if not self.force():
            sdb = db.addTable(mainkeys=['File'], name='summarise', expect=False)
        if not sdb:
            sdb = db.addEmptyTable('summarise', ['File'], ['File'])
        ### ~ [2] Run Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog(
            '#BATCH', 'Batch summarising %s input files' %
            rje.iLen(self.list['BatchRun']))
        for file in self.list['BatchRun']:
            # summarise=F stops autoload from summarising; summarise() is called explicitly.
            seqdata = rje_seqlist.SeqList(
                self.log, self.cmd_list +
                ['seqin=%s' % file, 'autoload=T', 'summarise=F'
                 ]).summarise()
            if seqdata:
                # Reformat numeric fields for tidy table output.
                if 'GC' in seqdata:
                    seqdata.pop('GC')
                    seqdata['GCPC'] = '%.2f' % seqdata['GCPC']
                if 'GapLength' in seqdata:
                    seqdata['GapPC'] = '%.2f' % (100.0 * seqdata['GapLength'] /
                                                 seqdata['TotLength'])
                seqdata['MeanLength'] = '%.1f' % seqdata['MeanLength']
                # Preferred fields first (fixed order), then any extra fields returned.
                for field in string.split(
                        'SeqNum, TotLength, MinLength, MaxLength, MeanLength, MedLength, N50Length, L50Count, GapLength, GapPC, GCPC',
                        ', '):
                    if field in seqdata and field not in sdb.fields():
                        sdb.addField(field)
                for field in seqdata.keys():
                    if field not in sdb.fields():
                        sdb.addField(field)
                sdb.addEntry(seqdata)
            else:
                self.errorLog('Summarise failed for %s' % file,
                              printerror=False)
        ### ~ [3] Output Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sdb.saveToFile()
        return True
    except:
        self.errorLog('%s.batchSummarise error' % self)
        return False
def run(self):  ### Main run method
    '''
    Main run method: performs setup then executes forking, logging the number of
    jobs completed. Returns True on success, False on error.
    '''
    try:
        # Record the queue size up front; forking consumes self.list['Forked'].
        jobx = len(self.list['Forked'])
        self.setup()
        self.forking()
        self.printLog('#FORK','Forking of %s jobs completed.' % (rje.iStr(jobx)))
        return True
    except:
        self.errorLog('Forker.run() Error')
        # Any jobs still queued at this point never ran.
        if self.list['Forked']:
            self.warnLog('%s fork jobs remain unforked.' % rje.iLen(self.list['Forked']))
        return False
def setup(self):    ### Main class setup method. Makes sumfile if necessary.
    '''
    Main class setup method. Makes sumfile if necessary.
    Reads MASCOT results files, writes per-file protein accession lists, maps
    accessions to sequences, and generates the delimited summary file.
    << returns False on failure; None otherwise (early return if existing results kept).
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
        # Default basefile/sumfile from each other when either is unset.
        if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
        if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
        self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
        # Existing results can be reused (interactive prompt) unless force=T.
        if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
            if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
        mapgi = False
        ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for mfile in self.list['ResFiles']:
            bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
            bud.info['Name'] = mfile
            bud.readMascot()
            self.dict['Searches'][mfile] = bud.dict['Hits']
            protacclist = rje.sortKeys(bud.dict['Hits'])
            # Flag whether any accession looks like an NCBI gi number.
            for protacc in protacclist:
                if rje.matchExp('gi\|(\d+)',protacc): mapgi = True
            # Dump the accession list for this results file.
            accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
            self.debug(accfile)
            open(accfile,'w').write(string.join(protacclist,'\n'))
            self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
        ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # NOTE(review): disabled gi-mapping code retained from an earlier version.
        #if mapgi:
        #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
        #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
        ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
        self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
        ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sumhead = string.split('search,prot_hit_num,prot_acc,prot_desc,pep_seq',',')
        rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
        for mfile in rje.sortKeys(self.dict['Searches']):
            bud = self.dict['Searches'][mfile]
            for protacc in rje.sortKeys(bud)[0:]:
                protname = bud[protacc]['prot_acc']
                protdesc = bud[protacc]['prot_desc']
                # gi-style accessions: prefer the mapped sequence name/description if available.
                if rje.matchExp('gi\|(\d+)',protacc):
                    gi = rje.matchExp('gi\|(\d+)',protacc)[0]
                    try:
                        protname = self.dict['Acc2Seq'][gi].shortName()
                        protdesc = self.dict['Acc2Seq'][gi].info['Description']
                    except: protname = 'gi_UNK__%s' % gi    # Unmapped gi: keep a placeholder name
                #x#print protname, protdesc, bud[protacc]
                # One output row per peptide of each protein hit.
                for pep in bud[protacc]['Peptides']:
                    data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                            'pep_seq':pep,'prot_hit_num':bud[protacc]['prot_hit_num']}
                    rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def run(self):  ### Main run method
    '''
    Main run method.
    Performs setup then executes forking; logs the number of jobs completed.
    << returns True on success; False if an exception was raised.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Queue size recorded up front: forking consumes self.list['Forked'].
        forkx = len(self.list['Forked'])
        self.setup()
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.forking()
        self.printLog('#FORK',
                      'Forking of %s jobs completed.' % (rje.iStr(forkx)))
    except:
        self.errorLog('Forker.run() Error')
        # Any jobs still queued at this point never ran.
        if self.list['Forked']:
            self.warnLog('%s fork jobs remain unforked.' %
                         rje.iLen(self.list['Forked']))
        return False
    return True
def batchRun(self,returnobj=False):     ### Execute batch mode runs
    '''
    Execute batch mode runs.
    Runs self.run() once per file in self.list['BatchRun'], swapping in a per-run
    command list and log, then restoring both afterwards.
    >> returnobj:bool [False] = keep the returned run objects in the result list
       (else successful runs are recorded simply as True).
    << returns list of per-run results on success; False if an exception was raised.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        barg = self.getStrLC('BatchArg')
        if not barg: raise ValueError('Cannot use batchrun=FILELIST if batcharg=None.')
        batchfiles = self.list['BatchRun'][0:]
        self.list['BatchRun'] = []  # Avoid recursive running!
        # Normalise the batch log suffix to ".<name>.log" form.
        blog = self.getStr('BatchLog')
        if not blog.startswith('.'): blog = '.%s' % blog
        if not blog.endswith('.log'): blog = '%s.log' % blog
        rawcmd = self.cmd_list[0:]  # Saved so each run starts from the original commands
        rawlog = self.log           # Saved so the master log can be restored after each run
        batchobj = []
        ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        bx = 0
        for bfile in batchfiles:
            bx += 1
            self.printLog('#BATCH','Batch running %s of %s: %s=%s' % (rje.iStr(bx),rje.iLen(batchfiles),barg,bfile))
            ## Setup parameters
            bbase = rje.baseFile(bfile,strip_path=True)
            bcmd = ['%s=%s' % (barg,bfile)]
            # Choose per-run basefile/log naming according to BatchBase/BatchLog settings.
            if self.getBool('BatchBase'):
                if blog == '.log': bcmd += ['basefile=%s' % bbase]
                else: bcmd += ['basefile=%s%s' % (bbase,rje.baseFile(blog))]
            elif self.getStrLC('BatchLog'): bcmd += ['log=%s%s' % (bbase,blog)]
            else: bcmd += ['newlog=F']
            #self.debug(bcmd)
            ## Setup Seqsuite object
            self.cmd_list = rawcmd + bcmd
            self.log = rje.setLog(self.log.obj['Info'],self,self.cmd_list)  # Sets up Log object for controlling log file output
            ## Run
            batchobj.append(self.run())
            ## Finish and Tidy
            self.log = rawlog
            runobj = batchobj[-1]
            if runobj:
                if not returnobj: batchobj[-1] = True   # Keep only a success flag unless objects requested
                info = runobj.log.obj['Info']
                self.printLog('#RUN','%s V%s run finished.' % (info.program,info.version))
            else: self.warnLog('Batch run failed (%s=%s).' % (barg,bfile))
        ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        failx = batchobj.count(False)
        self.printLog('#BATCH','%s batch runs complete: %s failed.' % (rje.iLen(batchfiles),rje.iStr(failx)))
        self.list['BatchRun'] = batchfiles  # Restore the original batch file list
        return batchobj
    except: self.errorLog('%s.batchRun error' % self); return False
def topTerms(self,slimx=20,parents=False,total='Total',countkey='counts'):  ### Selects top terms for GO slim set
    '''
    Selects top terms for GO slim set.
    >> slimx:int [20] = Desired min. number of terms for each GO domain.
    >> parents:bool [False] = Whether parents and children both allowed in list
    >> total:str ['Total'] = Sample containing Total counts for assessment
    >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and
       'total' count sample - self.go(id)[countkey] = {Sample:count}
    << returns a list of GO IDs that meet criteria
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #x#self.opt['DeBug'] = True
        terms = []                          # List of terms
        dom = {'cc':{},'bp':{},'mf':{}}     # Dictionary of {domain:{count:[IDs]}}
        # Bucket every GO term by its domain and total count.
        for id in self.go():
            n = self.go(id)[countkey][total]
            type = self.go(id)['type']
            if n not in dom[type]: dom[type][n] = [id]
            else: dom[type][n].append(id)
        ### ~ [2] ~ Generate Top Terms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.deBug(dom)
        for type in dom:
            dterms = []                         # Terms for this domain only
            dkeys = rje.sortKeys(dom[type])     # Counts, low to high
            dkeys.reverse()                     # Counts, high to low
            (dx,dtot) = (0.0,len(dkeys))
            # Add terms from the highest counts down until slimx terms are collected.
            while dkeys and len(dterms) < slimx:    # Keep looping
                self.deBug('%s: %s' % (type,dterms))
                self.progLog('#TOP','Generating top %d %s terms: %.1f%%' % (slimx,type,dx/dtot))
                dx += 100.0
                n = dkeys.pop(0)            # Remove from list
                dterms += dom[type][n]      # Add terms to term list
                if parents: continue        # Don't care if parents and children all mixed up
                # Drop any term that is a parent of another term already in the list.
                for id in dterms[0:]:
                    if id not in dterms: continue   # Previously-removed parent
                    for par in self.parents(id):    # Check all parents
                        if par in dterms: dterms.remove(par)    # Remove parent term
            # NOTE(review): if dkeys was empty on entry, `n` is unbound here and this
            # printLog raises NameError (silently caught by the broad except below).
            self.printLog('\r#TOP','Identified %s top %s terms: >= %s genes' % (rje.iLen(dterms),type,rje.iStr(n)))
            terms += dterms     # Found a stable list of terms
        self.deBug(terms)
        return terms
    except: self.errorLog('Major problem with GO.topTerms()')
    return []
def classify(self):     ### Generate summary tables for each protein class
    '''
    Generate summary tables for each protein class.
    Reads one protein accession per line from each file in self.list['Classify'], subsets the
    'taxamap' table to those proteins, and generates summary scores for the class.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        rankdb = self.db('taxamap')
        for cfile in self.list['Classify']:
            pclass = rje.baseFile(cfile,strip_path=True)    # Class name taken from file basename
            clist = []
            for fline in open(cfile,'r').readlines():
                # Bugfix: guard against blank/whitespace-only lines, which previously raised
                # IndexError on [0] and aborted the whole method via the broad except.
                fwords = string.split(rje.chomp(fline),maxsplit=1)
                if fwords and fwords[0]: clist.append(fwords[0])
            self.printLog('#CLASS','%s "%s" class proteins read from %s' % (rje.iLen(clist),pclass,cfile))
            if not clist:
                self.warnLog('No proteins read from %s' % (cfile))
                continue
            # Subset the taxamap table to this class's proteins only.
            classdb = db.copyTable(rankdb,pclass)
            classdb.dropEntriesDirect('protein',clist,inverse=True)
            if not classdb.entries():
                self.warnLog('No "%s" proteins found in TaxaMap table' % (pclass))
                continue
            self.summaryScores(classdb,pclass,'MinClass')
    except: self.errorLog('%s.classify() error' % self.prog())
def seqSubset2(self):    ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
    '''
    Extracts sequence subset from MOUSE cDNA and Peptide libraries.
    Builds (or reloads) an Ingolia->MGI->EnsEMBL mapping table, extracts matching
    cDNA/peptide subsets, then maps Ingolia/Harrington start sites onto transcripts,
    writing IngExact cDNA/peptide fasta files and a mapped_start table.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if os.path.exists('%s.map.tdt' % self.baseFile()):
            # Reload previously generated mapping table.
            mdb = self.db().addTable('%s.map.tdt' % self.baseFile(),mainkeys=['Ingolia'],name='map')
        else:
            ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
            # NOTE(review): `db` is not defined in this method — presumably should be
            # self.db(); as written this raises NameError (caught by the broad except).
            xref = db.addTable(xfile,mainkeys=['Gene'],name='xref')
            afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
            self.obj['Map'] = rje_genemap.GeneMap(self.log,self.cmd_list)
            #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
            self.obj['Map'].loadData(['sourcedata=%s' % xfile,'aliases=%s' % afile])
            # Map each Ingolia gene symbol to its best MGI gene.
            ing_genes = string.split(string.join(self.db('starts').index('Gene').keys()).upper())
            map = self.obj['Map']
            ing_map = {}
            for gene in ing_genes: ing_map[gene] = map.bestMap(gene)
            ing_mgi = rje.sortUnique(ing_map.values())
            self.printLog('#MUSG','%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes),rje.iLen(ing_mgi)))
            xdb = self.db('xref')
            # Drop genes with no xref entry, keeping a record of the failures.
            bad_genes = []
            for gene in ing_mgi[0:]:
                if gene not in xdb.data():
                    self.printLog('#MAP','Cannot map gene "%s" from Ingolia data!' % gene)
                    bad_genes.append(gene); ing_mgi.remove(gene)
            self.printLog('#BAD','Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
            open('ingolia.bad.txt','w').write(string.join(bad_genes))
            ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ing_musg = xdb.dataList(xdb.entryList(ing_mgi),'EnsEMBL',sortunique=True)
            if '' in ing_musg: ing_musg.remove('')
            self.printLog('#MUSG','%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes),rje.iLen(ing_musg)))
            if not ing_musg: raise ValueError
            self.deBug(ing_musg[:10])
            # Extract cDNA and peptide subsets for the mapped EnsEMBL genes.
            for stype in ['cdna','pep']:
                seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                if self.getBool('Force') or not os.path.exists(seqfile):
                    seqout = 'Ingolia.%s.all.fa' % stype
                    # NOTE(review): 'autload=T' looks like a typo for 'autoload=T' — confirm.
                    seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'seqout=%s' % seqout,'autofilter=T','autload=T','seqmode=file','gooddesc=%s' % string.join(ing_musg,',')]
                    rje_seqlist.SeqList(self.log,seqcmd)
            # NOTE(review): table key 'Ignolia' mismatches the field name 'Ingolia' — confirm.
            mdb = self.db().addEmptyTable('map',['Ingolia','Gene','EnsEMBL'],['Ignolia'])
            for gene in ing_map:
                entry = {'Ingolia':gene,'Gene':ing_map[gene]}
                if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                mdb.addEntry(entry)
            # Attach comma-separated transcript IDs (ENST) to each mapped EnsEMBL gene.
            seqfile = 'Ingolia.cdna.all.fa'
            seqcmd = self.cmd_list + ['seqin=%s' % seqfile,'autofilter=F','autload=T','seqmode=file']
            iseq = rje_seqlist.SeqList(self.log,seqcmd)
            if 'ENST' not in mdb.fields(): mdb.addField('ENST',evalue='')
            while iseq.nextSeq():
                (iname,icdna) = iseq.getSeq()
                musg = rje.matchExp('gene:(\S+)',iname)[0]
                for entry in mdb.indexEntries('EnsEMBL',musg):
                    if entry['ENST']: entry['ENST'] += ',%s' % string.split(iname)[0]
                    else: entry['ENST'] = string.split(iname)[0]
            mdb.saveToFile()
        ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): this section uses `iseq`, which is only defined in the else branch
        # above — the reload path (map.tdt exists) would raise NameError here; confirm intent.
        sdb = self.db('starts')
        sdb.dataFormat({'Init Codon [nt]':'int'})
        icod = 'Init Codon [nt]'                # Start position field (0-based nt offset)
        icon = 'Init Context [-3 to +4]'        # 7nt context expected around the start codon
        sdb.info['Name'] = 'mapped_start'
        sdb.addField('ENST'); sdb.addField('ENSP'); sdb.addField('ENSI');
        ENST = open('IngExact.cdna.all.fa','w')
        ENSP = open('IngExact.pep.all.fa','w')
        ex = 0.0; etot = sdb.entryNum(); sx = 0; fx = 0
        minpep = 20     # Minimum acceptable peptide length for a mapped start
        for entry in sdb.entries():
            self.progLog('\r#ING','Mapping Ignolia Harrington Starts: %.2f%%' % (ex/etot)); ex += 100.0
            #self.deBug(entry)
            entry[icon] = entry[icon].upper()
            gene = entry['Gene'].upper()
            mentry = mdb.data(gene)
            entry['ENST'] = entry['ENSI'] = ''
            cdnaseq = peptseq = ''
            if not mentry or not mentry['ENST']: fx += 1; continue  # No mapped transcript
            #self.deBug(mentry)
            mtype = 'fail'
            # Find the transcript whose sequence exactly matches the 7nt start context,
            # keeping the one giving the longest translated peptide.
            for trans in string.split(mentry['ENST'],','):
                (tname,tseq) = iseq.getDictSeq(trans,format='tuple')
                self.deBug('%s vs %s' % (tseq[entry[icod]-3:][:7],entry[icon]))
                if tseq[entry[icod]-3:][:7] == entry[icon]:
                    ipept = string.split(rje_sequence.dna2prot(tseq[entry[icod]:]),'*')[0]
                    self.deBug(ipept)
                    if len(ipept) > len(peptseq):
                        entry['ENST'] = trans
                        cdnaseq = tseq
                        peptseq = ipept
                        mtype = 'exact'
            if not entry['ENST']:
                self.printLog('\r#ING','Unable to find Harrington start for %s %s (%s)' % (gene,entry[icod],entry[icon]),screen=False)
                fx += 1; continue
            elif len(peptseq) < minpep:
                self.printLog('\r#ING','Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene,entry[icod],entry[icon]),screen=False)
                fx += 1; continue
            # Assign zero-padded IngExact identifiers and write fasta output.
            id = rje.preZero(int(ex/100),etot)
            entry['ENSI'] = 'ENSINGT%s' % id
            entry['ENSP'] = 'ENSINGP%s' % id
            ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (id,mtype,entry['ENST'],mentry['EnsEMBL'],entry['Gene'],mentry['Gene'],cdnaseq))
            ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (id,mtype,entry['ENST'],mentry['EnsEMBL'],id,entry['Gene'],mentry['Gene'],peptseq))
            sx += 1
        sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
        ENST.close(); ENSP.close()
        self.printLog('\r#ING','Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx),rje.iStr(fx)))
        return
    except: self.errorLog('%s.method error' % self)
def powerGO(self,numbers,sig=0.01,samples='all',total='Total',countkey='counts',ignore=[]):  ### Special GO power calculation for GO slim set
    '''
    Special GO power calculation for GO slim set.
    >> numbers:dictionary of {Sample:Count}
    >> sig:float [0.01] = Desired significance level to achieve. Currently uncorrected.
       Add Bonf/FDR with time.
    >> samples:str ['all'] = Whether sig must be achievable for 'any' or 'all' samples.
    >> total:str ['Total'] = Sample containing Total counts to compare against
    >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and
       'total' count sample - self.go(id)[countkey] = {Sample:count}
    >> ignore:list of Samples to ignore from calculation
       (mutable default: safe here because it is only read, never mutated)
    << returns a list of GO IDs that meet criteria
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        N = numbers[total]      # Total count for calculating expectations/probabilities
        nlist = []              # List of counts for subsamples to be assessed
        for sample in numbers:
            if sample not in ignore + [total]: nlist.append(numbers[sample])
        nlist = rje.sortUnique(nlist,xreplace=False,num=True)
        ### ~ [2] ~ Generate Power Range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        plist = []      # List of acceptable Total counts for subset
        nx = 0.0
        for i in range(1,N+1):      # Look at all possible levels of occurrence
            self.progLog('#POW','Calculating GO term power: %.1f%%' % (nx/N))
            nx += 100.0
            ok = 0
            p = float(i) / N        # Probability of each gene having this term
            for n in nlist:         # Look at each subset
                k1 = min(i,n)       # Want to look at largest possible count for sample-term pairing
                k2 = max(0,n-(N-i)) # Also want to look at the likelihood of under-representation
                # A sample size n passes if either over- or under-representation is detectable at sig.
                if rje.binomial(k1,n,p,callobj=self) <= sig: ok += 1
                elif (1 - rje.binomial(k2+1,n,p,callobj=self)) <= sig: ok += 1
                #!# Add under-representation too! #!#
                if ok and samples == 'any': break   # One passing sample is enough in 'any' mode
            if (ok and samples == 'any') or ok == len(nlist): plist.append(i)
        self.printLog('\r#POW','Calculation of GO term power complete.',log=False)
        self.deBug(nlist)
        ### ~ [3] ~ Generate GO Slim ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Keep only GO terms whose total count lies in the powered range.
        terms = []
        (ix,itot) = (0.0,len(self.go()))
        for id in rje.sortKeys(self.go()):
            self.progLog('#POW','Assessing terms for power: %.1f%% (%s terms)' % (ix/itot,rje.iLen(terms)))
            ix += 100.0
            if self.go(id)[countkey][total] in plist: terms.append(id)
        self.printLog('\r#POW','Assessed terms for statistical power, p <= %s: %s GO terms' % (sig,rje.iLen(terms)))
        #!# Add correction terms #!#
        self.deBug(terms)
        return terms
    except: self.errorLog('Major problem with GO.powerGO()')
    return []
def batchRun(self, returnobj=False):    ### Execute batch mode runs
    '''
    Execute batch mode runs.
    Runs self.run() once per file in self.list['BatchRun'], swapping in a per-run
    command list and log, then restoring both afterwards.
    >> returnobj:bool [False] = keep the returned run objects in the result list
       (else successful runs are recorded simply as True).
    << returns list of per-run results on success; False if an exception was raised.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        barg = self.getStrLC('BatchArg')
        if not barg:
            raise ValueError(
                'Cannot use batchrun=FILELIST if batcharg=None.')
        batchfiles = self.list['BatchRun'][0:]
        self.list['BatchRun'] = []  # Avoid recursive running!
        # Normalise the batch log suffix to ".<name>.log" form.
        blog = self.getStr('BatchLog')
        if not blog.startswith('.'): blog = '.%s' % blog
        if not blog.endswith('.log'): blog = '%s.log' % blog
        rawcmd = self.cmd_list[0:]  # Saved so each run starts from the original commands
        rawlog = self.log           # Saved so the master log can be restored after each run
        batchobj = []
        ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        bx = 0
        for bfile in batchfiles:
            bx += 1
            self.printLog(
                '#BATCH', 'Batch running %s of %s: %s=%s' %
                (rje.iStr(bx), rje.iLen(batchfiles), barg, bfile))
            ## Setup parameters
            bbase = rje.baseFile(bfile, strip_path=True)
            bcmd = ['%s=%s' % (barg, bfile)]
            # Choose per-run basefile/log naming according to BatchBase/BatchLog settings.
            if self.getBool('BatchBase'):
                if blog == '.log': bcmd += ['basefile=%s' % bbase]
                else:
                    bcmd += ['basefile=%s%s' % (bbase, rje.baseFile(blog))]
            elif self.getStrLC('BatchLog'):
                bcmd += ['log=%s%s' % (bbase, blog)]
            else:
                bcmd += ['newlog=F']
            #self.debug(bcmd)
            ## Setup Seqsuite object
            self.cmd_list = rawcmd + bcmd
            self.log = rje.setLog(
                self.log.obj['Info'], self, self.cmd_list
            )  # Sets up Log object for controlling log file output
            ## Run
            batchobj.append(self.run())
            ## Finish and Tidy
            self.log = rawlog
            runobj = batchobj[-1]
            if runobj:
                if not returnobj: batchobj[-1] = True   # Keep only a success flag unless objects requested
                info = runobj.log.obj['Info']
                self.printLog(
                    '#RUN',
                    '%s V%s run finished.' % (info.program, info.version))
            else:
                self.warnLog('Batch run failed (%s=%s).' % (barg, bfile))
        ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        failx = batchobj.count(False)
        self.printLog(
            '#BATCH', '%s batch runs complete: %s failed.' %
            (rje.iLen(batchfiles), rje.iStr(failx)))
        self.list['BatchRun'] = batchfiles  # Restore the original batch file list
        return batchobj
    except:
        self.errorLog('%s.batchRun error' % self)
        return False
def setup(self,force=False,parents=True):   ### Main class setup method.
    '''
    Main class setup method.
    Loads the NCBI taxonomy node map, the UniProt species file and NCBI names.dmp
    (unless memsaver=T), plus the species code table.
    >> force:bool [False] = repeat setup even if already complete.
    >> parents:bool [True] = record child->parent TaxID links in self.dict['Parent'].
    << returns True on success; False on failure.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getBool('Setup') and not force:
            self.printLog('#SETUP','Taxonomy setup already complete.'); return True
        if not self.setupSourceData(): raise IOError
        # Default the basefile from batch mode or the first TaxIn term.
        if not self.getStrLC('Basefile'):
            if self.getBool('BatchMode'): self.setBaseFile('batch')
            elif self.list['TaxIn']: self.setBaseFile(rje.baseFile(self.list['TaxIn'][0],strip_path=True))
        # Normalise TaxOut formats to lower case; expand the 'all' shortcut.
        self.list['TaxOut'] = string.join(self.list['TaxOut']).lower().split()
        if 'all' in self.list['TaxOut']: self.list['TaxOut'] = ['taxid','spcode','name','common']
        self.list['RankID'] = []
        ### ~ [2] TaxMap Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Parse NCBI nodes-style dump: child | parent | rank, "\t|\t"-separated fields.
        taxmap = self.dict['TaxMap'] = {}   # {parent TaxID: [child TaxIDs]}
        tx = 0; px = 0; fx = 0              # TaxID count; parent count; failed line count
        for tline in open(self.getStr('TaxMap'),'r').readlines():
            self.progLog('\r#TAXID','Reading %s: %s TaxID' % (self.getStr('TaxMap'),rje.iStr(tx)))
            #try: (child,parent,taxtype) = rje.matchExp('^(\d+)\s+\|\s+(\d+)\s+\|\s+(\S+)\s+',tline)
            try: (child,parent,taxtype) = string.split(tline,'\t|\t')[:3]
            except: fx += 1; self.debug(tline); continue    # Malformed line: count and skip
            self.dict['Rank'][child] = taxtype
            if parent not in taxmap: taxmap[parent] = []
            if not taxmap[parent]: px += 1      # First child for this parent
            if taxtype in self.list['RankTypes']: self.list['RankID'].append(child)
            if child not in taxmap: taxmap[child] = []
            taxmap[parent].append(child); tx += 1
            if child in self.dict['Parent']: self.warnLog('Child TaxID "%s" already has parent!' % child)
            if parents and child != parent: self.dict['Parent'][child] = parent
        self.printLog('\r#TAXID','%s TaxID (%s parent taxa) read from %s; %s failed.' % (rje.iStr(tx),rje.iStr(px),self.getStr('TaxMap'),rje.iStr(fx)))
        self.printLog('#SPEC','%s TaxID mapped to %s RankTypes' % (rje.iLen(self.list['RankID']),string.join(self.list['RankTypes'],'/')))
        # Sanity check (test mode): parent tally must match taxa with listed children.
        if self.test():
            pcheck = 0
            for tax in taxmap:
                if taxmap[tax]: pcheck += 1
            self.printLog('#TEST','%s parent taxa with listed children' % rje.iStr(pcheck))
            if px != pcheck: raise ValueError
        ### ~ [3] NameMap Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Name lookups are skipped entirely in memsaver mode.
        if not self.getBool('MemSaver'):
            taxdict = self.dict['TaxDict']
            ## ~ [3a] SpecFile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # UniProt speclist format: "SPCODE kingdom TAXID: N=Name" then "C=Common" lines.
            tx = 0; cx = 0; taxid = None
            for tline in open(self.getStr('SpecFile'),'r').readlines():
                self.progLog('\r#SPEC','Reading %s species data: %s TaxID' % (self.getStr('SpecFile'),rje.iStr(tx)))
                nmatch = rje.matchExp('^(\S+)\s+\S+\s+(\d+):\s+N=(\S.+)\s*$',tline)
                if nmatch:
                    taxid = nmatch[1]; tx += 1
                    taxdict[taxid] = {'spcode': nmatch[0], 'name': nmatch[2]}
                elif taxid and rje.matchExp('C=(\S.+)\s*$',tline):
                    # Common-name continuation line for the current TaxID.
                    taxdict[taxid]['common'] = rje.matchExp('C=(\S.+)\s*$',tline)[0]; cx += 1
            self.printLog('\r#SPEC','%s species codes/names and %s common names read from %s.' % (rje.iStr(tx),rje.iStr(cx),self.getStr('SpecFile')))
            ## ~ [3b] NCBI names.dmp ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Add scientific names for TaxID not already covered by the species file.
            tx = 0
            for tline in open(self.getStr('NameMap'),'r').readlines():
                self.progLog('\r#SPEC','Reading %s species names: %s TaxID' % (self.getStr('NameMap'),rje.iStr(tx)))
                tdata = string.split(tline,'\t|\t')
                if not tdata[3].startswith('scientific name'): continue
                taxid = tdata[0]
                if taxid not in taxdict: taxdict[taxid] = {'name': tdata[1]}; tx += 1
            self.printLog('\r#SPEC','%s extra species names read from %s.' % (rje.iStr(tx),self.getStr('NameMap')))
        ### ~ [4] Species code table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        spfile = '%sspcode.%s.tdt' % (self.getStr('SourcePath'),self.getStr('SourceDate'))
        self.db().addTable(spfile,['Species'],name='SpCode',expect=False)
        ### ~ [5] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.setBool({'Setup':True})
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def inSilicoHybrid(self):  ### Filter and combine subreads from parent and output to fasta file.
    '''
    Filter and combine subreads from parent and output to fasta file.

    This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each
    parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing.
    Each parent needs a `*.fofn` file of subread file names, `*.unique.tdt` unique subreads table and `*.smrt.tdt`
    SMRT cell identifier table.)

    A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking
    the unique subreads from each parent by length. First, the longest subread from each parent are compared and the
    shortest selected to be the first subread of the diploid. (The shortest is taken to minimise length differences
    between the two parents.) Next, the longest subread from the next parent that is no longer than the previous
    subread is added. This cycles, picking a read from the the parent with fewest cumulative bases each cycle. The
    longest subread that is no longer than the previous subread is selected. This continues until one parent runs out
    of subreads. Additional subreads will be added from the other parent if they reduce the difference in cumulative
    output for each parent.

    Final output will be a `*.subreads.fasta` file in which each parent has a similar total sequence content and for
    which the subread length distributions should also be similar. This is to overcome biases in resulting diploid
    assemblies, where one parent has higher quality data than the other.

    NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has
    much higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to reads
    with a minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may need
    to be relaxed.

    << True if the filtered/combined subread fasta was generated; False on error.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#','# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #')
        self.printLog('#FOFN','Parent1: %s' % self.getStr('Parent1'))
        base1 = rje.baseFile(self.getStr('Parent1'))
        # Run SMRTSCAPE on Parent1 to generate/load the unique subread and SMRT cell tables.
        parent1 = smrtscape.SMRTSCAPE(self.log,['genomesize=13.1e6']+self.cmd_list+['batch=%s' % self.getStr('Parent1'),'basefile=%s' % base1])
        parent1.setup()
        udb1 = parent1.udb()                                    # Unique subreads table for Parent1
        cdb = parent1.db('smrt',add=True,mainkeys=['Name'])     # SMRT cell identifier table for Parent1
        cdb.dataFormat({'SMRT':'int'})
        cx = cdb.entryNum()                                     # Number of Parent1 SMRT cells (used as ID offset below)
        ## ~ [0b] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#','# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #')
        self.printLog('#FOFN','Parent2: %s' % self.getStr('Parent2'))
        base2 = rje.baseFile(self.getStr('Parent2'))
        parent2 = smrtscape.SMRTSCAPE(self.log,['genomesize=13.1e6']+self.cmd_list+['batch=%s' % self.getStr('Parent2'),'basefile=%s' % base2])
        parent2.setup()
        udb2 = parent2.udb()
        cdb2 = parent2.db('smrt',add=True,mainkeys=['Name'])
        cdb2.dataFormat({'SMRT':'int'})
        # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1
        for entry in cdb2.entries() + udb2.entries(): entry['SMRT'] = entry['SMRT'] + cx
        cdb = parent1.db().mergeTables(cdb,cdb2)                # Combined SMRT cell table used for name->ID lookup below
        ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#','# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #')
        minlen = self.getInt('LenFilter')                       # Minimum subread length to keep
        minrq = self.getNum('RQFilter')                         # Minimum read quality (RQ) to keep
        rqstr = '%s' % minrq
        # rqstr[2:] strips the leading "0." from the RQ value for the filename
        # (assumes 0 <= minrq < 1, e.g. 0.84 -> "84") - TODO confirm RQ is always fractional.
        filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(),minlen,rqstr[2:])
        ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqbatch = []   # List of SeqList objects
        self.printLog('#BATCH','%s sequence files to process.' % rje.iLen(parent1.list['Batch']+parent2.list['Batch']))
        for seqfile in parent1.list['Batch']+parent2.list['Batch']:
            seqcmd = self.cmd_list + ['seqmode=file','autoload=T','summarise=F','seqin=%s' % seqfile,'autofilter=F']
            seqbatch.append(rje_seqlist.SeqList(self.log,seqcmd))
        self.printLog('#BATCH','%s sequence files to summarise.' % rje.iLen(seqbatch))
        if not seqbatch: raise IOError('No batch input fasta files found! Make sure parentN=FILE settings given *.fofn.')
        ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # One length-sorted (descending) entry list per parent; index 0 = Parent1, 1 = Parent2.
        elists = [udb1.sortedEntries('Len',reverse=True),udb2.sortedEntries('Len',reverse=True)]
        plen = [0,0]    # Summed lengths for each parent
        pseq = [0,0]    # Total sequence number for each parent
        prq = [0,0]     # Total sequence RQ for each parent (convert to mean)
        if not elists[0] or not elists[1]: raise ValueError('No Unique ZMW subreads for one or both parents!')
        lastlen = max(elists[0][0]['Len'],elists[1][0]['Len'])    # Length of last selected read
        # Strip leading low-RQ reads from each parent before picking the first subread.
        for elist in elists:
            while elist and elist[0]['RQ'] < minrq: elist.pop(0)
        if not elists[0] or not elists[1]: raise ValueError('No Unique ZMW subreads for one or both parents!')
        nextp = 0       # Index of next parent to use
        # Start with the parent whose longest read is shortest (minimises length imbalance).
        if elists[0][0]['Len'] < elists[1][0]['Len']: nextp = 1
        ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        zmwlist = []    # List of (smrt,zmw,pos) meeting filtering criteria
        ux = 0.0; utot = len(elists[0])+len(elists[1])
        # Alternate between parents, always taking the longest remaining read, until one parent is exhausted.
        # Loop exits via the break below; `while lastlen` only terminates early if a zero-length read is selected.
        while lastlen:
            self.progLog('\r#DIP','Diploidising subreads: %.2f%%' % (ux/utot))
            elist = elists[nextp]
            while elist and elist[0]['RQ'] < minrq: elist.pop(0); ux += 100.0
            if elist and elist[0]['Len'] < minlen: ux += 100.0 * len(elist); elist = []
            if not elist: nextp = 1 - nextp; break   # Finish
            entry = elist.pop(0); ux += 100.0
            zmwlist.append((entry['SMRT'],entry['ZMW'],entry['Pos']))
            plen[nextp] += entry['Len']
            prq[nextp] += entry['RQ']
            pseq[nextp] += 1
            # Switch to the other parent whenever it has fewer cumulative bases.
            if plen[1-nextp] <= plen[nextp]: nextp = 1 - nextp
            lastlen = entry['Len']
        ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Top up from the remaining parent while each added read reduces the cumulative length difference.
        # NOTE(review): if reads shorter than minlen remain after both inner loops finish without clearing
        # elists[nextp], this outer loop appears able to spin forever - confirm against real data/usage.
        while elists[nextp]:
            elist = elists[nextp]
            while elist and elist[0]['RQ'] < minrq:
                self.progLog('\r#DIP','Diploidising subreads: %.2f%%' % (ux/utot))
                elist.pop(0); ux += 100.0
            while elist and elist[0]['Len'] >= minlen:
                self.progLog('\r#DIP','Diploidising subreads: %.2f%%' % (ux/utot))
                entry = elist.pop(0); ux += 100.0
                pdiff = rje.modulus(plen[0]-plen[1])                            # Current parent length imbalance
                ediff = rje.modulus(plen[nextp]+entry['Len']-plen[1-nextp])     # Imbalance if this read is added
                if ediff >= pdiff: elists[nextp] = []; break    #Finish!
                zmwlist.append((entry['SMRT'],entry['ZMW'],entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
        self.printLog('\r#DIP','Diploidising subreads complete: %s subreads to output.' % rje.iLen(zmwlist))
        self.printLog('\r#DIP','%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' % (self.getStr('Parent1'),rje.iStr(pseq[0]),rje.iStr(plen[0]),1.0*plen[0]/self.getInt('GenomeSize'),prq[0]/pseq[0]))
        self.printLog('\r#DIP','%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' % (self.getStr('Parent2'),rje.iStr(pseq[1]),rje.iStr(plen[1]),1.0*plen[1]/self.getInt('GenomeSize'),prq[1]/pseq[1]))
        ## ~ [1c] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        rje.backup(self,filtfile)
        # NOTE(review): SEQOUT is never explicitly closed; output relies on interpreter cleanup.
        SEQOUT = open(filtfile,'w')
        sx = 0.0; stot = 0; sn = len(seqbatch); fx = 0
        for seqlist in seqbatch:
            # Example subread name format:
            #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784
            si = 100.0/seqlist.seqNum(); stot += seqlist.seqNum()
            for seq in seqlist.seqs():
                # Progress is averaged across batch files (sx accumulates per-file %, divided by file count).
                self.progLog('\r#OUT','Extracting subreads: %.2f%%' % (sx/sn)); sx += si
                (name,sequence) = seqlist.getSeq(seq)
                # Name splits on '/' and whitespace into SMRT cell name, ZMW, position and (optionally) RQ.
                try: [smrt,zmw,pos,rq] = string.split(string.replace(name,'/',' '))
                except:
                    [smrt,zmw,pos] = string.split(string.replace(name,'/',' '))
                    rq = minrq
                # Only output reads selected into zmwlist (SMRT name mapped to numeric ID via merged cdb).
                if (cdb.data(smrt)['SMRT'],int(zmw),pos) not in zmwlist: continue
                SEQOUT.write('>%s\n%s\n' % (name,sequence)); fx += 1
        self.printLog('\r#OUT','Saved %s filtered subreads to %s.' % (rje.iStr(fx),filtfile))
        ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Constructing the SeqList with summarise=T triggers the summary as a side effect.
        seqcmd = self.cmd_list + ['seqmode=file','autoload=T','summarise=T','seqin=%s' % filtfile,'autofilter=F']
        rje_seqlist.SeqList(self.log,seqcmd)
        return True
    except: self.errorLog('%s.run error' % self.prog()); return False
def taxaChildren(self,taxid):   ### Extracts TaxID children from TaxMap file and updates RankID and TaxMap dicts.
    '''
    Extracts TaxID children from TaxMap file and updates RankID and TaxMap dicts.
    >> taxid:str = NCBI Taxonomy identifier to find children for.
    << list of child TaxID strings for taxid (also cached in self.dict['TaxMap']).
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NB. This is very slow and so reading the while.
        self.debug(taxid)
        taxtree = self.dict['TaxMap']
        # Return the cached child list if this TaxID has been parsed already.
        if taxid in taxtree: return taxtree[taxid]
        ### ~ [1] Parse from TaxMap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxtree[taxid] = []
        # Grep the TaxMap node file for lines mentioning this TaxID; each matching line
        # gives a (child, parent, rank) triple in NCBI nodes.dmp style formatting.
        for nodeline in os.popen('grep -e "\s%s\s" %s' % (taxid,self.getStr('TaxMap'))):
            nodedata = rje.matchExp('^(\d+)\s+\|\s+(\d+)\s+\|\s+(\S+)\s+',nodeline)
            if not nodedata: continue
            (childid,parentid,rank) = nodedata
            if parentid not in taxtree: taxtree[parentid] = []
            taxtree[parentid].append(childid)
            # Species-level nodes also join the RankID list.
            if rank in ['species','subspecies']: self.list['RankID'].append(childid)
        self.progLog('\r#TAXID','Reading %s: %s TaxID' % (self.getStr('TaxMap'),rje.iLen(taxtree)))
        return taxtree[taxid]
    except:
        self.errorLog('%s.taxaChildren(%s) error' % (self,taxid))
        raise
def mapToTaxID(self,taxa,nodeonly=False,rankonly=False,log=True,warn=True):  ### Maps taxa onto TaxID. If taxa is a list, will process each element.
    '''
    Maps taxa onto TaxID. If taxa is a list, will process each element. Returns a list.
    >> taxa:str or list = taxon identifier(s): numeric TaxID, Uniprot species code, or species/common name.
    >> nodeonly:bool [False] = return only the matching node itself, not its child TaxID.
    >> rankonly:bool [False] = restrict returned TaxID to those in self.list['RankID'].
    >> log:bool [True] = whether to report progress to the log.
    >> warn:bool [True] = whether to warn on missing or ambiguous taxa.
    << list of TaxID strings.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not taxa: return []
        taxid = []
        ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Duck-type for a list: only lists have sort(). Strings fall through to single-taxon handling.
        tlist = True
        try: taxa.sort()
        except: tlist = False
        if tlist:
            tx = 0.0; ttot = len(taxa)
            if ttot > 1:
                # Map each element recursively (without logging) and combine.
                for t in taxa:
                    if log: self.progLog('\r#TAXID','Mapping to TaxID: %.1f%%' % (tx/ttot)); tx += 100.0
                    taxid += self.mapToTaxID(t,nodeonly,rankonly,log=False)
                taxid = rje.sortUnique(taxid)
                if log:
                    if ttot > 1: self.printLog('\r#TAXID','Mapped %s taxa to %s TaxID' % (rje.iStr(ttot),rje.iLen(taxid)))
            else:
                t = taxa[0]
                if log: self.progLog('\r#TAXID','Mapping %s to TaxID...' % t)
                taxid = rje.sortUnique(self.mapToTaxID(t,nodeonly,rankonly,log=False))
                if log: self.printLog('\r#TAXID','Mapped %s to %s TaxID' % (t,rje.iLen(taxid)))
            return taxid
        ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxmap = self.dict['TaxMap']; rankid = self.list['RankID']
        taxa = '%s' % taxa      # Ensure string form for pattern matching
        ## ~ [2a] Taxa ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if rje.matchExp('^(\d+)$', taxa):   # Purely numeric: treat as TaxID
            #if taxa not in taxmap: self.taxaChildren(taxa)
            #if taxa in rankid: return [taxa]
            if nodeonly:
                if taxa in rankid or not rankonly: return [taxa]
                else: return []
            if taxa not in taxmap:
                if warn: self.warnLog('Cannot find TaxID %s!' % taxa,'Missing_TaxID',suppress=True)
                return []
            # Breadth-first expansion of the TaxMap tree to collect this node plus all children.
            parents = [taxa]
            while parents:
                taxa = parents.pop(0)
                #if taxa not in taxmap: self.taxaChildren(taxa)
                if not rankonly or taxa in rankid: taxid.append(taxa)
                parents += taxmap[taxa]
            return taxid
        ## ~ [2b] Species Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # All-uppercase, no spaces => treat as Uniprot species code and grep the SpecFile.
        if taxa == string.replace(taxa.upper(),' ',''):
            greplines = os.popen('grep "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
            for entry in greplines:
                try: taxid.append(rje.matchExp('^%s\s+\S+\s+(\d+):' % taxa,entry)[0])
                except: pass
            if not taxid and warn: self.warnLog('Cannot find Species Code "%s"!' % taxa,'Missing_SpCode',suppress=True)
            if len(taxid) > 1: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa,len(taxid),string.join(taxid,'|')))
            # Recurse on the numeric TaxID(s) to apply node/rank filtering consistently.
            return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
        ### ~ [3] Species name etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxa = taxa.replace('_',' ')
        ## ~ [3a] Grep from Uniprot ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # -B 2 pulls the preceding lines so the TaxID line is seen before its name/synonym/common-name lines.
        greplines = os.popen('grep -B 2 -i "%s" %s' % (taxa, self.getStr('SpecFile'))).readlines()
        gtaxid = None; comid = []; synid = []
        for entry in greplines:
            try: gtaxid = rje.matchExp('^\S+\s+\S+\s+(\d+):',entry)[0]
            except: pass
            # s= synonym; c= common name; plain = official/scientific name. Preference: official > common > synonym.
            if rje.matchExp('s=(%s)\s*$' % taxa.lower(),entry.lower()): synid.append(gtaxid)
            elif rje.matchExp('c=(%s)\s*$' % taxa.lower(),entry.lower()): comid.append(gtaxid)
            elif rje.matchExp('=(%s)\s*$' % taxa.lower(),entry.lower()): taxid.append(gtaxid)
        if not taxid: taxid = comid
        if not taxid: taxid = synid
        if not taxid and warn: self.warnLog('Cannot find Taxon name "%s" in Uniprot!' % taxa,'Missing Taxon',suppress=True)
        if len(taxid) > 1:
            #self.bugPrint(string.join(greplines))
            #self.debug('%s %s %s' % (taxid,comid,synid))
            if warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa,len(taxid),string.join(taxid,'|')))
        if taxid: return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
        #self.debug(taxid)
        ## ~ [3b] Grep from NCBI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Fall back to the NCBI names.dmp-style NameMap file (tab|tab-delimited fields).
        greplines = os.popen('grep -i -e "\t%s\t" %s' % (taxa, self.getStr('NameMap'))).readlines()
        for entry in greplines:
            try:
                #gtaxid = rje.matchExp('^(\d+)\s+\S\s+(\S.+)$',entry)
                gtaxid = string.split(entry,'\t|\t')
                if gtaxid[1].lower() == taxa.lower(): taxid.append(gtaxid[0])
                elif gtaxid[2] and gtaxid[2].lower() == taxa.lower(): taxid.append(gtaxid[0])
            except: pass
        if len(taxid) > 1 and warn: self.warnLog('Species Code "%s" hits %d Taxa ID (%s)' % (taxa,len(taxid),string.join(taxid,'|')))
        return self.mapToTaxID(taxid,nodeonly,rankonly,log=False) #taxid
    except: self.errorLog('%s.mapToTaxID() error' % (self)); raise
def parseMITAB(self):  ### Parse MITAB file into pairwise PPI table.
    '''
    Parse MITAB file into pairwise PPI table.
    Reads the open 'MITAB' file handle line by line, maps interactor identifiers via the XRef object,
    filters on TaxID, and populates the 'pairwise' database table. Complex identifiers (e.g. iRefIndex
    rigid groups) are collected and expanded to all-vs-all pairwise PPI in a final pass.
    << pdb: the 'pairwise' database table (also returned early if no complexes need expansion).
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        xref = self.obj['XRef']                 # Identifier cross-reference mapper
        pdb = self.db('pairwise')               # Output pairwise PPI table
        pfields = ['Hub','Spoke','HubUni','SpokeUni','HubTaxID','SpokeTaxID','Evidence','IType']
        headers = {}                            # Column name -> column index for MITAB fields
        for h in range(len(self.list['Headers'])): headers[self.list['Headers'][h]] = h
        dbsource = self.getStr('DBSource')
        ### ~ [2] Read through MITAB ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Counters: mx lines; ex ppi added; fax ambiguous; ftx taxa-filtered; fx failed; uhx/usx Uniprot-fallback Hub/Spoke.
        mx = 0; ex = 0; fax = 0; ftx = 0; fx = 0; uhx = 0; usx = 0
        epos = self.endPos('MITAB')
        complexidlist = []                      # Complex identifiers seen (expanded in [3])
        badtaxa = ['-']                         # Taxa values already warned about
        baduni = []                             # UniprotKB IDs used without an XRef mapping
        while 1:
            self.progLog('\r#MITAB','Parsing %s MITAB %s: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,self.fileProg('MITAB',epos),rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
            mline = self.readDelimit('MITAB'); mx += 1
            if not mline: break
            entry = {'#':pdb.entryNum()}
            for field in pfields: entry[field] = ''
            ## ~ [2a] Add iRefIndex complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            complexid = {}  # This will take the first complex ID
            if 'irigid' in self.list['Headers'] and 'numParticipants' in self.list['Headers']:
                # >2 participants marks an iRefIndex complex row: both sides share the rigid group ID.
                if int(mline[headers['numParticipants']]) > 2:
                    complexid['A'] = complexid['B'] = 'rigid:%s' % mline[headers['irigid']]
            #self.bugPrint(mline)
            #self.debug(complexid)
            ## ~ [2b] Parse and check taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            taxa = {'A':'','B':''}
            for tfield in self.list['TaxaField']:
                # Field names end in A/B (or "... A (xxx)") to identify the interactor.
                ab = tfield[-1:].upper()
                if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',tfield.upper())[0]
                try:
                    taxon = rje.matchExp('^taxid:(\d+)',mline[headers[tfield]].lower())[0]
                    if self.list['TaxID'] and taxon not in self.list['TaxID']: continue
                    taxa[ab] = taxon
                except:
                    # No parseable "taxid:NNN" value: warn once per distinct value.
                    taxon = mline[headers[tfield]]
                    if taxon not in badtaxa:
                        badtaxa.append(taxon)
                        self.warnLog('No TaxID read from %s: "%s"' % (tfield,taxon),'no_tax',suppress=True)
                    if not self.list['TaxID']: taxa[ab] = '-'
            if not taxa['A'] and complexid: taxa['A'] = taxa['B']
            if not taxa['B'] and complexid: taxa['B'] = taxa['A']
            if not (taxa['A'] and taxa['B']): ftx += 1; continue
            ## ~ [2c] Parse protein IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ids = {'A':[],'B':[]}
            uni = {'A':'','B':''}
            for ifield in self.list['IDField']:
                ab = ifield[-1:].upper()
                if ab == ')': ab = rje.matchExp('([AB]) \(\S+\)$',ifield.upper())[0]
                # Split IDs on | then db:id vs self.list['MapDB']
                for pid in string.split(mline[headers[ifield]],'|'):
                    try: (db,dbid) = string.split(pid,':',1)
                    except: continue
                    if db.lower() in ['uniprotkb'] and '(' in dbid: continue # Only map uniprotkb accnum
                    dbid = string.split(dbid,'(')[0]
                    dbid = string.split(dbid,';')[0]
                    if db.lower() in ['uniprotkb']:
                        svid = dbid                             # Keep splice variant form (X-n)
                        dbid = string.split(svid,'-')[0]        # Base accession without variant suffix
                    if ab not in complexid: # First identifier for A/B
                        if db.lower() in self.list['Complex']: complexid[ab] = pid; ids[ab].append(pid)
                        else: complexid[ab] = ''
                    if not self.list['MapDB'] or db.lower() in self.list['MapDB']: ids[ab].append(dbid)
                    # Parse uniprot directly if possible
                    if db.lower() in ['uniprotkb'] and not uni[ab]:
                        if self.getBool('SpliceVar'): uni[ab] = svid
                        else: uni[ab] = dbid
            #self.bugPrint(ids)
            ## ~ [2d] Map parsed IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            amb = {'A':False,'B':False}     # Whether A/B mapping was ambiguous (xref returned False)
            if not ids['A'] or not ids['B']:
                #self.bugPrint('%s\n=> ID Failure' % mline)
                #self.bugPrint(ids['A']); self.bugPrint(ids['B'])
                #self.bugPrint(entry)
                fx += 1; continue
            for ida in ids['A']:
                #self.debug('%s => %s (or %s)' % (ida,xref.xref(ida,unique=True),xref.xref(ida,unique=False)))
                if not entry['Hub']: entry['Hub'] = xref.xref(ida,unique=True,usedict=True)
                if entry['Hub'] == False: amb['A'] = True
                #if not entry['HubUni']: entry['HubUni'] = xref.xref(ida,self.getStr('UniField'),unique=True,usedict=True)
                if not entry['HubUni']: entry['HubUni'] = self.getUniXRef(ida)
            if self.getBool('AddUni') and not entry['HubUni']: entry['HubUni'] = uni['A']
            if uni['A'] and uni['A'] not in baduni: baduni.append(uni['A'])
            if not entry['Hub'] and entry['HubUni']:
                entry['Hub'] = entry['HubUni']
                #self.warnLog('UniprotKB "%s" used for Hub' % entry['HubUni'],'unihub',suppress=True)
                uhx += 1
            if not entry['Hub'] and complexid['A']: entry['Hub'] = complexid['A']
            else: complexid['A'] = ''
            if self.getBool('UniOnly') and not complexid['A'] and not entry['HubUni']: entry['Hub'] = ''
            for idb in ids['B']:
                if not entry['Spoke']: entry['Spoke'] = xref.xref(idb,unique=True,usedict=True)
                if entry['Spoke'] == False: amb['B'] = True
                #if not entry['SpokeUni']: entry['SpokeUni'] = xref.xref(idb,self.getStr('UniField'),unique=True,usedict=True)
                if not entry['SpokeUni']: entry['SpokeUni'] = self.getUniXRef(idb)
            if self.getBool('AddUni') and not entry['SpokeUni']: entry['SpokeUni'] = uni['B']
            if not entry['Spoke'] and entry['SpokeUni']:
                entry['Spoke'] = entry['SpokeUni']
                #self.warnLog('UniprotKB "%s" used for Spoke' % entry['SpokeUni'],'unihub',suppress=True)
                usx += 1
            if not entry['Spoke'] and complexid['B']: entry['Spoke'] = complexid['B']
            else: complexid['B'] = ''
            if self.getBool('UniOnly') and not complexid['B'] and not entry['SpokeUni']: entry['Spoke'] = ''
            if uni['B'] and uni['B'] not in baduni: baduni.append(uni['B'])
            if complexid['A'] and complexid['B']:
                # complex:complex rows can only be handled for matched rigid groups; otherwise drop.
                if not (complexid['A'].startswith('rigid:') and complexid['B'].startswith('rigid:')):
                    self.printLog('\r#MITAB','',log=False)
                    self.warnLog('Cannot parse complex:complex PPI (%s & %s)' % (complexid['A'],complexid['B']),'complex-complex',suppress=True)
                    entry['Hub'] = entry['Spoke'] = ''
            #self.bugPrint(entry)
            #self.debug(complexid)
            if not (entry['Hub'] and entry['Spoke']):
                # Ambiguous (mapped to multiple) vs outright failure are counted separately.
                if (entry['Hub'] or amb['A']) and (entry['Spoke'] or amb['B']): fax += 1; continue
                #self.bugPrint(mline); self.debug(entry)
                fx += 1; continue
            #if self.dev() and 'PCNA' not in [entry['Hub'],entry['Spoke']]: continue
            entry['HubTaxID'] = taxa['A']
            entry['SpokeTaxID'] = taxa['B']
            if complexid['A'] and complexid['A'] not in complexidlist: complexidlist.append(complexid['A'])
            if complexid['B'] and complexid['B'] not in complexidlist: complexidlist.append(complexid['B'])
            #if complexid['A'] or complexid['B']: self.debug(entry)
            ## ~ [2e] Parse evidence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            #self.bugPrint(mline)
            evidence = []
            for tfield in self.list['MethodField']:
                #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                for etype in string.split(mline[headers[tfield]],'|'):
                    # MITAB method values look like: MI:0018(two hybrid) - keep the bracketed name.
                    ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                    if ematch: evidence.append('%s:%s' % (dbsource,ematch[0]))
            if not evidence: evidence.append('%s:unknown' % (self.getStr('DBSource')))
            evidence = rje.sortUnique(evidence)
            #self.debug(evidence)
            entry['Evidence'] = string.join(evidence,'|')
            ## ~ [2f] Parse interaction types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            itypes = []
            for tfield in self.list['TypeField']:
                #self.bugPrint(string.split(mline[headers[tfield]],'|'))
                for etype in string.split(mline[headers[tfield]],'|'):
                    ematch = rje.matchExp('MI:\d+"?\((.+)\)',etype)
                    if ematch: itypes.append(ematch[0])
            if not itypes: itypes.append('unknown')
            itypes = rje.sortUnique(itypes)
            #self.debug(itypes)
            entry['IType'] = string.join(itypes,'|')
            pdb.addEntry(entry); ex += 1
            if self.dev() and entry['Hub'] in ['KLF3']:#,'WDR5']:
                self.printLog('#DEV',string.join(mline,'\t'))
                #self.bugPrint(uni); self.debug(entry)
            # Add the reciprocal Spoke->Hub entry unless this row is part of a complex.
            if self.getBool('Symmetry') and not complexid['A'] and not complexid['B']:
                pdb.addEntry({'#':pdb.entryNum(),'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                              'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                              'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                              'Evidence':entry['Evidence'],'IType':entry['IType']})
        self.printLog('\r#MITAB','Parsing %s MITAB complete: %s lines; %s ppi; %s taxa-filtered; %s ambiguous; %s failed; %s complexes.' % (dbsource,rje.iStr(mx),rje.iStr(ex),rje.iStr(ftx),rje.iStr(fax),rje.iStr(fx),rje.iLen(complexidlist)))
        self.close('MITAB')
        if (uhx+usx): self.warnLog('UniprotKB IDs used for %s Hub and %s Spoke IDs.' % (rje.iStr(uhx),rje.iStr(usx)))
        if baduni:
            baduni.sort()
            accout = '%s.%s.unmapped.uniacc' % (self.baseFile(),dbsource)
            self.warnLog('%s unmapped UniprotKB IDs used: output to %s.' % (rje.iLen(baduni),accout))
            open(accout,'w').write(string.join(baduni,'\n'))
        ### ~ [3] Convert complexes to pairwise PPIs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not complexidlist: return pdb
        self.printLog('#CPLEX','%s complex IDs parsed to convert to pairwise PPI.' % rje.iLen(complexidlist))
        ## ~ [3a] Assemble complex memberships ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        complexes = {}; chentries = []; csentries = []  # Complex members; entries with complex Hub / complex Spoke
        cevidence = {}  # List of Evidence for each complex
        citypes = {}    # List of ITypes for each complex
        ctaxa = {}      # TaxID for each complex member
        ex = 0.0; etot = pdb.entryNum()
        for entry in pdb.entries():
            self.progLog('\r#CPLEX','Assembling complexes: %.1f%%' % (ex/etot)); ex += 100.0
            if entry['Hub'] in complexidlist:
                cid = entry['Hub']
                if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                complexes[cid].append(entry['Spoke'])
                ctaxa[entry['Spoke']] = entry['SpokeTaxID']
                cevidence[cid].append(entry['Evidence'])
                citypes[cid].append(entry['IType'])
                chentries.append(entry)
            elif entry['Spoke'] in complexidlist:
                cid = entry['Spoke']
                if cid not in complexes: complexes[cid] = []; cevidence[cid] = []; citypes[cid] = []
                complexes[cid].append(entry['Hub'])
                ctaxa[entry['Hub']] = entry['HubTaxID']
                cevidence[cid].append(entry['Evidence'])
                citypes[cid].append(entry['IType'])
                csentries.append(entry)
        self.printLog('\r#CPLEX','Assembled %s of %s complexes.' % (rje.iLen(complexes),rje.iLen(complexidlist)))
        #self.debug(complexes)
        ## ~ [3b] Update complexes dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        cppi = {}   # hub -> spoke -> list of complex IDs linking them
        ex = 0.0; etot = len(complexes); rx = 0; px = 0; cmax = 0
        for cid in rje.sortKeys(complexes):
            self.progLog('\r#CPLEX','Reducing complexes: %.1f%%' % (ex/etot)); ex += 100.0
            if self.dev(): self.printLog('#DEV','Complex %s: %s' % (cid,complexes[cid]))
            # Drop single-member complexes: no pairwise PPI can be generated.
            if len(complexes[cid]) < 2:
                complexes.pop(cid)
                cevidence.pop(cid)
                citypes.pop(cid)
                rx += 1; continue
            complexes[cid].sort()
            #cevidence[cid] = string.join(rje.sortUnique(cevidence[cid]),'|')
            #citypes[cid] = string.join(rje.sortUnique(citypes[cid]),'|')
            cmax = max(cmax,len(complexes[cid]))
            #px += (len(complexes[cid]) * (len(complexes[cid])-1))
            # All-vs-all within the complex (each unordered pair once).
            members = complexes[cid][0:]
            while members:
                hub = members.pop(0)
                if self.dev() and hub == 'KLF3': self.debug(cid)
                if hub not in cppi: cppi[hub] = {}
                for spoke in members:
                    if spoke not in cppi[hub]: cppi[hub][spoke] = []; px += 1
                    cppi[hub][spoke].append(cid)
        self.printLog('\r#CPLEX','Reduced %s complexes to %s > 1 member: %s ppi to add.' % (rje.iStr(etot),rje.iLen(complexes),rje.iStr(px)))
        ## ~ [3c] Update pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        cix = pdb.entryNum()
        # Remove the raw complex-membership rows before adding the expanded pairwise rows.
        for centry in chentries + csentries: pdb.dropEntry(centry)
        ex = 0.0; etot = len(cppi)
        for hub in rje.sortKeys(cppi):
            self.progLog('\r#CPLEX','Expanding complexes: %.1f%%' % (ex/etot)); ex += 100.0
            #hentry = {'Hub':hub,'HubUni':xref.xref(hub,self.getStr('UniField'),unique=True,usedict=True),'HubTaxID':ctaxa[hub]}
            hentry = {'Hub':hub,'HubUni':self.getUniXRef(hub),'HubTaxID':ctaxa[hub]}
            for spoke in rje.sortKeys(cppi[hub]):
                evidence = []
                itypes = []
                ctypes = []
                for cid in cppi[hub][spoke]:
                    evidence += cevidence[cid]
                    itypes += citypes[cid]
                    # NOTE(review): list += str extends ctypes with single characters of the db prefix;
                    # ctypes.append(...) looks like the intent - confirm before changing.
                    ctypes += string.split(cid,':')[0]
                ctype = string.join(rje.sortUnique(ctypes),'|')
                evidence = string.join(rje.sortUnique(evidence),'|')
                if not evidence: evidence = '%s:%s' % (dbsource,ctype)
                itypes = string.join(rje.sortUnique(itypes),'|')
                if not itypes: itypes = ctype
                #newentry = {'#':cix,'Spoke':spoke,'SpokeUni':xref.xref(spoke,self.getStr('UniField'),unique=True,usedict=True),'SpokeTaxID':ctaxa[spoke]}
                newentry = {'#':cix,'Spoke':spoke,'SpokeUni':self.getUniXRef(spoke),'SpokeTaxID':ctaxa[spoke]}
                newentry['Evidence'] = evidence
                newentry['IType'] = itypes
                entry = pdb.addEntry(rje.combineDict(newentry,hentry,overwrite=False)); cix += 1
                if self.dev() and entry['Hub'] in ['KLF3','WDR5']: self.debug('Complex: %s' % entry)
                if self.getBool('Symmetry'):
                    pdb.addEntry({'#':cix,'Hub':entry['Spoke'],'Spoke':entry['Hub'],
                                  'HubUni':entry['SpokeUni'],'SpokeUni':entry['HubUni'],
                                  'HubTaxID':entry['SpokeTaxID'],'SpokeTaxID':entry['HubTaxID'],
                                  'Evidence':entry['Evidence'],'IType':entry['IType']})
                    cix += 1
        self.printLog('#CPLEX','%s complex IDs expanded to pairwise PPI => %s ppi (symmetry=%s).' % (rje.iLen(complexidlist),rje.iStr(pdb.entryNum()),self.getBool('Symmetry')))
        return pdb
    except: self.errorLog('%s.parseMITAB error' % self.prog())
def _positiveAndNegativePeptides(self):  ### Populates PosPep and NegPep Lists
    '''
    Populates PosPep and NegPep Lists.
    Digests the Positives protein sequences with the PepCut protease, classifies each fragment as
    positive (observed in the Peptides list) or negative, flags redundant fragments seen elsewhere
    in self.obj['SeqList'], and writes a Peptides table plus positives/negatives fasta files.
    << pdb: the populated 'Peptides' database table; False if input files are missing; None on error.
    '''
    try:
        ### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pfile = '%s.peptides.tdt' % self.basefile()
        #if rje.exists(pfile) and not self.getBool('Force'):
        #    try:
        #        pdb = self.db().addTable(pfile,['Peptide'],name='Peptides')
        #        pdb.dataFormat(reformat={'Len':'int','MWt':'num','Cys':'int','Ser':'int','Hyd':'num'})
        #        self.list['Peptides'] = self.list['PosPep'] = pdb.index('Pos')['Y']
        #        self.list['NegPep'] = pdb.index('Positive')['Neg']
        #        return pdb
        #    except: pass
        # Both the observed peptide list and the positive protein fasta are required.
        if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
        self.list['Peptides'] = peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
        seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
        pdb = self.db().addEmptyTable('Peptides',['Peptide','NR','Pos','Len','MWt','C','HPW','DENQ','M','Hyd'],['Peptide'])
        ### ~ [1] ~ Digest Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # 'proteases' is a module-level mapping of protease name -> cut-site patterns, with ':' marking
        # the cut position within each pattern - defined elsewhere in this file (TODO confirm).
        protease = self.getStr('PepCut')
        self.list['PosPep'] = poslist = []; self.list['NegPep'] = neglist = []
        sx = 0.0; stot = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
            sequence = seq.getSequence()
            # Insert ':' at every protease cut site, then split into fragments.
            for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
            frag = string.split(sequence,':')
            while '' in frag: frag.remove('')
            # Optionally drop the N-terminal fragment (its start is not a protease cut site).
            if not self.getBool('NTerm'): frag = frag[1:]
            for pep in frag[0:]:
                if pep not in poslist: poslist.append(pep)
        self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
        ## ~ [1b] ~ Peptide Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Digest the full background SeqList: any fragment seen more than once is flagged redundant.
        allpep = []
        self.list['Redundant'] = redundant = []
        sx = 0.0; stot = self.obj['SeqList'].seqNum()
        for seq in self.obj['SeqList'].seqs():
            self.progLog('\r#DIG','%s Digesting sequences: %.2f%%' % (protease,sx/stot)); sx += 100.0
            sequence = seq.getSequence()
            for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
            for frag in string.split(sequence,':'):
                if frag in allpep: redundant.append(frag)
                else: allpep.append(frag)
        self.printLog('\r#DIG','%s Digesting %s sequences complete.' % (protease,rje.iStr(stot)))
        ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        px = 0.0; ptot = len(poslist)
        # Iterate a copy (poslist[0:]) because unobserved peptides are removed from poslist in the loop.
        for pep in poslist[0:]:
            self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
            entry = {'Peptide':pep,'MWt':rje_sequence.MWt(pep),'Hyd':rje_sequence.eisenbergHydropathy(pep,returnlist=False),
                     'Len':len(pep),'NR':'Y','Pos':'Y'}
            # Peptides not in the observed list move from positive to negative.
            if pep not in peplist: poslist.remove(pep); neglist.append(pep); entry['Pos'] = 'N'
            if pep in redundant: entry['NR'] = 'N'
            # Amino acid composition counts for selected residue groups.
            for aacomb in ['C','HPW','DENQ','M']:
                x = 0
                for a in aacomb: x += pep.count(a)
                entry[aacomb] = x
            pdb.addEntry(entry)
        self.printLog('\r#PEP','Processing positive peptides (%s) complete: %s Pos; %s Neg.' % (protease,rje.iLen(poslist),rje.iLen(neglist)))
        ### ~ [2] ~ Save Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pdb.saveToFile(pfile)
        POS = open('%s.positives.fas' % self.basefile(),'w'); NEG = open('%s.negatives.fas' % self.basefile(),'w')
        for pep in poslist: POS.write('>%s\n%s\n' % (pep,pep))
        for pep in neglist: NEG.write('>%s\n%s\n' % (pep,pep))
        POS.close(); self.printLog('#FAS','%s peptides output to %s.positives.fas' % (rje.iLen(poslist),self.basefile()))
        NEG.close(); self.printLog('#FAS','%s peptides output to %s.negatives.fas' % (rje.iLen(neglist),self.basefile()))
        return pdb
    except: self.errorLog('Problem during %s._positiveAndNegativePeptides().' % self); return None  # Setup failed
def runXML(self):   ### Generates ENA SRA experiment, run and submission XML for PacBio data
    '''
    Generates ENA SRA submission XML from PacBio run directories found under the current
    working directory. Scans directories (optionally restricted to self.list['DirList']),
    keeps *.h5/*.xml files when datatype=pacbio, groups them into experiments and runs,
    then writes three files named from self.baseFile(): *.exp.xml (experiment set),
    *.run.xml (run set) and *.xml (submission). Returns None; errors are logged.
    NOTE(review): an identical runXML is defined again later in this file; in Python the
    later definition wins, so this copy is dead code — one of the two should be removed.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datatype = self.getStr('DataType').lower()
        exps = {}       # Experiment alias: run directory list
        runs = {}       # Run alias: file list
        run2run = {}    # Convert runs to run aliases
        ## ~ [1a] Get Files and Directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        dirlist = rje.listDir(self, subfolders=False, folders=True, files=False, summary=True)
        filelist = rje.listDir(self, folder=os.getcwd(), subfolders=True, folders=False, files=True, summary=True)
        # Convert absolute paths into paths relative to the current working directory.
        current = os.getcwd()
        curlen = len(current) + 1   # +1 also strips the trailing path separator
        for pathlist in [dirlist, filelist]:
            for i in range(len(pathlist)):
                path = pathlist[i]
                if path.startswith(current): pathlist[i] = path[curlen:]
                else: raise ValueError(path)    # Every listed path should lie under cwd
        ## ~ [1b] Clean up files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        dx = len(dirlist)
        self.debug(filelist)
        self.debug(dirlist)
        self.debug(self.list['DirList'])
        # Optionally restrict processing to a user-supplied subset of directories.
        if self.list['DirList']: dirlist = rje.listIntersect(dirlist, self.list['DirList'])
        self.printLog('#DIR', 'Process %d of %d directories' % (len(dirlist), dx))
        keepext = []
        if datatype == 'pacbio': keepext = ['.h5', '.xml']
        # Drop files that are fewer than two directory levels deep, lack a kept extension,
        # or whose top-level directory is not in the (possibly restricted) dirlist.
        for filename in filelist[0:]:
            ext = os.path.splitext(filename)[1]
            if len(string.split(filename, os.sep)) < 2 or ext not in keepext or string.split(filename, os.sep)[0] not in dirlist:
                filelist.remove(filename)
        self.printLog('#FILES', '%s files kept from %s directories' % (rje.iLen(filelist), rje.iLen(dirlist)))
        self.debug(filelist[:10])
        self.debug(filelist[-10:])
        ### ~ [2] Parse runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if datatype == 'pacbio':
            for filename in filelist[0:]:
                self.printLog('#FILE', filename)
                filedata = string.split(filename, os.sep)
                parent = filedata[0]    # This should be a directory containing runs. NOTE(review): unused.
                experiment = filedata[1]
                expalias = string.join(string.split(experiment, '.')[:2], '.')  # First two dot-separated fields
                run = string.join(filedata[1:3], os.sep)    # experiment/run directory pair
                if expalias not in exps: exps[expalias] = []
                if run not in exps[expalias]: exps[expalias].append(run)
                # NOTE(review): runalias uses the CURRENT run count, so if files from different
                # runs of one experiment were interleaved in filelist, files of an earlier run
                # would be assigned the latest alias. Relies on filelist being grouped by run — confirm.
                runalias = '%s-%d' % (expalias, len(exps[expalias]))
                run2run[run] = runalias
                runfile = filedata[-1]  # NOTE(review): unused
                if runalias not in runs: runs[runalias] = []
                runs[runalias].append(filename)
                self.printLog('#PARSE', '%s - %s: (%d) %s' % (expalias, runalias, len(runs[runalias]), filename))
        ### ~ [3] Generate XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [3a] Experiment XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        efile = '%s.exp.xml' % self.baseFile()
        elines = ['<?xml version="1.0" encoding="UTF-8"?>',
                  '<EXPERIMENT_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.experiment.xsd">']
        for experiment in rje.sortKeys(exps):
            ex = experiment[-1:]    # Last character of the alias labels the library in the title
            elines += [' <EXPERIMENT alias="%s" center_name="">' % experiment,
                       ' <TITLE>Cane toad whole genome sequencing - PacBio library %s</TITLE>' % ex,
                       ' <STUDY_REF accession="ERP106543"/>',
                       ' <DESIGN>',
                       ' <DESIGN_DESCRIPTION/>',
                       ' <SAMPLE_DESCRIPTOR accession="ERS2169570"/>',
                       ' <LIBRARY_DESCRIPTOR>',
                       ' <LIBRARY_NAME>%s</LIBRARY_NAME>' % experiment,
                       ' <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>',
                       ' <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>',
                       ' <LIBRARY_SELECTION>size fractionation</LIBRARY_SELECTION>',
                       ' <LIBRARY_LAYOUT>',
                       ' <SINGLE/>',
                       ' </LIBRARY_LAYOUT>',
                       ' <LIBRARY_CONSTRUCTION_PROTOCOL></LIBRARY_CONSTRUCTION_PROTOCOL>',
                       ' </LIBRARY_DESCRIPTOR>',
                       ' </DESIGN>',
                       ' <PLATFORM>',
                       ' <PACBIO_SMRT>',
                       ' <INSTRUMENT_MODEL>PacBio RS II</INSTRUMENT_MODEL>',
                       ' </PACBIO_SMRT>',
                       ' </PLATFORM>',
                       ' <EXPERIMENT_ATTRIBUTES>',
                       ' <EXPERIMENT_ATTRIBUTE>',
                       ' <TAG>Size selection</TAG>',
                       ' <VALUE>15-50 kb</VALUE>',
                       ' </EXPERIMENT_ATTRIBUTE>',
                       ' <EXPERIMENT_ATTRIBUTE>',
                       ' <TAG>Sequencing Chemistry</TAG>',
                       ' <VALUE>P6C4</VALUE>',
                       ' </EXPERIMENT_ATTRIBUTE>',
                       ' </EXPERIMENT_ATTRIBUTES>',
                       ' </EXPERIMENT>']
        elines += ['</EXPERIMENT_SET>']
        open(efile, 'w').write(string.join(elines, '\n'))   # NOTE(review): file handle not explicitly closed
        self.printLog('#EXP', 'Experiment data saved to %s' % efile)
        ## ~ [3b] Run XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        rfile = '%s.run.xml' % self.baseFile()
        rlines = ['<?xml version="1.0" encoding="UTF-8"?>',
                  '<RUN_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.run.xsd">']
        for experiment in rje.sortKeys(exps):
            for run in exps[experiment]:
                runalias = run2run[run]
                rlines += [' <RUN alias="%s" center_name="">' % runalias,
                           ' <EXPERIMENT_REF refname="%s"/>' % experiment,
                           ' <DATA_BLOCK>',
                           ' <FILES>']
                for filename in runs[runalias]:
                    rlines += [' <FILE filename="%s" filetype="PacBio_HDF5">' % filename,
                               ' </FILE>']
                rlines += [' </FILES>',
                           ' </DATA_BLOCK>',
                           ' </RUN>']
        rlines += ['</RUN_SET>']
        open(rfile, 'w').write(string.join(rlines, '\n'))
        self.printLog('#RUN', 'Run data saved to %s' % rfile)
        ## ~ [3c] Submission XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        xfile = '%s.xml' % self.baseFile()
        xlines = ['<?xml version="1.0" encoding="UTF-8"?>',
                  '<SUBMISSION alias="%s" center_name="">' % self.baseFile(),
                  ' <ACTIONS>',
                  ' <ACTION>',
                  ' <ADD source="%s" schema="experiment"/>' % efile,
                  ' </ACTION>',
                  ' <ACTION>',
                  ' <ADD source="%s" schema="run"/>' % rfile,
                  ' </ACTION>',
                  ' </ACTIONS>',
                  '</SUBMISSION>']
        open(xfile, 'w').write(string.join(xlines, '\n'))
        self.printLog('#SUBXML', 'Submission XML saved to %s' % xfile)
        return
    except: self.errorLog('%s.method error' % self.prog())
def runXML(self):   ### Generates ENA SRA experiment, run and submission XML for PacBio data
    '''
    Generates ENA SRA submission XML from PacBio run directories found under the current
    working directory. Scans directories (optionally restricted to self.list['DirList']),
    keeps *.h5/*.xml files when datatype=pacbio, groups them into experiments and runs,
    then writes three files named from self.baseFile(): *.exp.xml (experiment set),
    *.run.xml (run set) and *.xml (submission). Returns None; errors are logged.
    NOTE(review): this is a byte-for-byte duplicate of a runXML defined earlier in this
    file; the later definition (this one) wins at class creation — remove one copy.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datatype = self.getStr('DataType').lower()
        exps = {}       # Experiment alias: run directory list
        runs = {}       # Run alias: file list
        run2run = {}    # Convert runs to run aliases
        ## ~ [1a] Get Files and Directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        dirlist = rje.listDir(self,subfolders=False,folders=True,files=False,summary=True)
        filelist = rje.listDir(self,folder=os.getcwd(),subfolders=True,folders=False,files=True,summary=True)
        # Convert absolute paths into paths relative to the current working directory.
        current = os.getcwd()
        curlen = len(current) + 1   # +1 also strips the trailing path separator
        for pathlist in [dirlist,filelist]:
            for i in range(len(pathlist)):
                path = pathlist[i]
                if path.startswith(current): pathlist[i] = path[curlen:]
                else: raise ValueError(path)    # Every listed path should lie under cwd
        ## ~ [1b] Clean up files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        dx = len(dirlist)
        self.debug(filelist)
        self.debug(dirlist)
        self.debug(self.list['DirList'])
        # Optionally restrict processing to a user-supplied subset of directories.
        if self.list['DirList']: dirlist = rje.listIntersect(dirlist,self.list['DirList'])
        self.printLog('#DIR','Process %d of %d directories' % (len(dirlist),dx))
        keepext = []
        if datatype == 'pacbio': keepext = ['.h5','.xml']
        # Drop files that are fewer than two directory levels deep, lack a kept extension,
        # or whose top-level directory is not in the (possibly restricted) dirlist.
        for filename in filelist[0:]:
            ext = os.path.splitext(filename)[1]
            if len(string.split(filename,os.sep)) < 2 or ext not in keepext or string.split(filename,os.sep)[0] not in dirlist:
                filelist.remove(filename)
        self.printLog('#FILES','%s files kept from %s directories' % (rje.iLen(filelist),rje.iLen(dirlist)))
        self.debug(filelist[:10])
        self.debug(filelist[-10:])
        ### ~ [2] Parse runs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if datatype == 'pacbio':
            for filename in filelist[0:]:
                self.printLog('#FILE',filename)
                filedata = string.split(filename,os.sep)
                parent = filedata[0]    # This should be a directory containing runs. NOTE(review): unused.
                experiment = filedata[1]
                expalias = string.join(string.split(experiment,'.')[:2],'.')    # First two dot-separated fields
                run = string.join(filedata[1:3],os.sep)     # experiment/run directory pair
                if expalias not in exps: exps[expalias] = []
                if run not in exps[expalias]: exps[expalias].append(run)
                # NOTE(review): runalias uses the CURRENT run count; relies on filelist being
                # grouped by run, else files of an earlier run get the latest alias — confirm.
                runalias = '%s-%d' % (expalias,len(exps[expalias]))
                run2run[run] = runalias
                runfile = filedata[-1]  # NOTE(review): unused
                if runalias not in runs: runs[runalias] = []
                runs[runalias].append(filename)
                self.printLog('#PARSE','%s - %s: (%d) %s' % (expalias,runalias,len(runs[runalias]),filename))
        ### ~ [3] Generate XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [3a] Experiment XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        efile = '%s.exp.xml' % self.baseFile()
        elines = ['<?xml version="1.0" encoding="UTF-8"?>','<EXPERIMENT_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.experiment.xsd">']
        for experiment in rje.sortKeys(exps):
            ex = experiment[-1:]    # Last character of the alias labels the library in the title
            elines += [' <EXPERIMENT alias="%s" center_name="">' % experiment,
                       ' <TITLE>Cane toad whole genome sequencing - PacBio library %s</TITLE>' % ex,
                       ' <STUDY_REF accession="ERP106543"/>',
                       ' <DESIGN>',
                       ' <DESIGN_DESCRIPTION/>',
                       ' <SAMPLE_DESCRIPTOR accession="ERS2169570"/>',
                       ' <LIBRARY_DESCRIPTOR>',
                       ' <LIBRARY_NAME>%s</LIBRARY_NAME>' % experiment,
                       ' <LIBRARY_STRATEGY>WGS</LIBRARY_STRATEGY>',
                       ' <LIBRARY_SOURCE>GENOMIC</LIBRARY_SOURCE>',
                       ' <LIBRARY_SELECTION>size fractionation</LIBRARY_SELECTION>',
                       ' <LIBRARY_LAYOUT>',
                       ' <SINGLE/>',
                       ' </LIBRARY_LAYOUT>',
                       ' <LIBRARY_CONSTRUCTION_PROTOCOL></LIBRARY_CONSTRUCTION_PROTOCOL>',
                       ' </LIBRARY_DESCRIPTOR>',
                       ' </DESIGN>',
                       ' <PLATFORM>',
                       ' <PACBIO_SMRT>',
                       ' <INSTRUMENT_MODEL>PacBio RS II</INSTRUMENT_MODEL>',
                       ' </PACBIO_SMRT>',
                       ' </PLATFORM>',
                       ' <EXPERIMENT_ATTRIBUTES>',
                       ' <EXPERIMENT_ATTRIBUTE>',
                       ' <TAG>Size selection</TAG>',
                       ' <VALUE>15-50 kb</VALUE>',
                       ' </EXPERIMENT_ATTRIBUTE>',
                       ' <EXPERIMENT_ATTRIBUTE>',
                       ' <TAG>Sequencing Chemistry</TAG>',
                       ' <VALUE>P6C4</VALUE>',
                       ' </EXPERIMENT_ATTRIBUTE>',
                       ' </EXPERIMENT_ATTRIBUTES>',
                       ' </EXPERIMENT>']
        elines += ['</EXPERIMENT_SET>']
        open(efile,'w').write(string.join(elines,'\n'))     # NOTE(review): file handle not explicitly closed
        self.printLog('#EXP','Experiment data saved to %s' % efile)
        ## ~ [3b] Run XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        rfile = '%s.run.xml' % self.baseFile()
        rlines = ['<?xml version="1.0" encoding="UTF-8"?>','<RUN_SET xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="ftp://ftp.sra.ebi.ac.uk/meta/xsd/sra_1_5/SRA.run.xsd">']
        for experiment in rje.sortKeys(exps):
            for run in exps[experiment]:
                runalias = run2run[run]
                rlines += [' <RUN alias="%s" center_name="">' % runalias,
                           ' <EXPERIMENT_REF refname="%s"/>' % experiment,
                           ' <DATA_BLOCK>',
                           ' <FILES>']
                for filename in runs[runalias]:
                    rlines += [' <FILE filename="%s" filetype="PacBio_HDF5">' % filename,
                               ' </FILE>']
                rlines += [' </FILES>',
                           ' </DATA_BLOCK>',
                           ' </RUN>']
        rlines += ['</RUN_SET>']
        open(rfile,'w').write(string.join(rlines,'\n'))
        self.printLog('#RUN','Run data saved to %s' % rfile)
        ## ~ [3c] Submission XML ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        xfile = '%s.xml' % self.baseFile()
        xlines = ['<?xml version="1.0" encoding="UTF-8"?>',
                  '<SUBMISSION alias="%s" center_name="">' % self.baseFile(),
                  ' <ACTIONS>',
                  ' <ACTION>',
                  ' <ADD source="%s" schema="experiment"/>' % efile,
                  ' </ACTION>',
                  ' <ACTION>',
                  ' <ADD source="%s" schema="run"/>' % rfile,
                  ' </ACTION>',
                  ' </ACTIONS>',
                  '</SUBMISSION>']
        open(xfile,'w').write(string.join(xlines,'\n'))
        self.printLog('#SUBXML','Submission XML saved to %s' % xfile)
        return
    except: self.errorLog('%s.method error' % self.prog())
def seqSubset2(self):   ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
    '''
    Extracts sequence subset from MOUSE cDNA and Peptide libraries.
    Builds (or loads) a mapping table of Ingolia gene names -> MGI gene -> EnsEMBL gene,
    filters the mouse cDNA/peptide fasta files to the mapped genes, then maps Ingolia
    Harrington start sites onto EnsEMBL transcripts, writing exact-match cDNA and peptide
    sequences to IngExact.cdna.all.fa / IngExact.pep.all.fa and a *.mapped_exact.tdt table.
    Returns None; errors are caught and logged by the blanket except.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if os.path.exists('%s.map.tdt' % self.baseFile()):
            # Reuse a previously generated mapping table if present.
            mdb = self.db().addTable('%s.map.tdt' % self.baseFile(), mainkeys=['Ingolia'], name='map')
        else:
            ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
            # NOTE(review): 'db' is not defined in this scope — looks like it should be
            # self.db(); as written this line would raise NameError (silenced by the
            # blanket except at the end of the method). Confirm against original source.
            xref = db.addTable(xfile, mainkeys=['Gene'], name='xref')
            afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
            self.obj['Map'] = rje_genemap.GeneMap(self.log, self.cmd_list)
            #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
            self.obj['Map'].loadData(['sourcedata=%s' % xfile, 'aliases=%s' % afile])
            # Upper-cased, whitespace-separated list of gene names from the 'starts' table.
            ing_genes = string.split(string.join(self.db('starts').index('Gene').keys()).upper())
            map = self.obj['Map']   # NOTE(review): shadows the builtin map()
            ing_map = {}
            for gene in ing_genes: ing_map[gene] = map.bestMap(gene)
            ing_mgi = rje.sortUnique(ing_map.values())
            self.printLog('#MUSG', '%s Ingolia genes mapped onto %s MGI genes' % (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
            xdb = self.db('xref')
            bad_genes = []
            # Remove mapped genes that are absent from the cross-reference table.
            for gene in ing_mgi[0:]:
                if gene not in xdb.data():
                    self.printLog('#MAP', 'Cannot map gene "%s" from Ingolia data!' % gene)
                    bad_genes.append(gene)
                    ing_mgi.remove(gene)
            self.printLog('#BAD', 'Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
            open('ingolia.bad.txt', 'w').write(string.join(bad_genes))
            ### ~ [2] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ing_musg = xdb.dataList(xdb.entryList(ing_mgi), 'EnsEMBL', sortunique=True)
            if '' in ing_musg: ing_musg.remove('')  # Drop genes with no EnsEMBL xref
            self.printLog('#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes' % (rje.iLen(ing_genes), rje.iLen(ing_musg)))
            if not ing_musg: raise ValueError
            self.deBug(ing_musg[:10])
            # Filter the full mouse cDNA/peptide fasta down to the mapped EnsEMBL genes.
            for stype in ['cdna', 'pep']:
                seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                if self.getBool('Force') or not os.path.exists(seqfile):
                    seqout = 'Ingolia.%s.all.fa' % stype
                    # NOTE(review): 'autload=T' looks like a typo for 'autoload=T' — confirm.
                    seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'seqout=%s' % seqout, 'autofilter=T', 'autload=T', 'seqmode=file', 'gooddesc=%s' % string.join(ing_musg, ',')]
                    rje_seqlist.SeqList(self.log, seqcmd)
            # Build the Ingolia -> Gene -> EnsEMBL mapping table.
            mdb = self.db().addEmptyTable('map', ['Ingolia', 'Gene', 'EnsEMBL'], ['Ignolia'])
            for gene in ing_map:
                entry = {'Ingolia': gene, 'Gene': ing_map[gene]}
                if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                else: entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                mdb.addEntry(entry)
        # Load the filtered Ingolia cDNA sequences for transcript mapping below.
        seqfile = 'Ingolia.cdna.all.fa'
        seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'autofilter=F', 'autload=T', 'seqmode=file']
        iseq = rje_seqlist.SeqList(self.log, seqcmd)
        if 'ENST' not in mdb.fields():
            # Annotate each map entry with its comma-separated EnsEMBL transcript IDs.
            mdb.addField('ENST', evalue='')
            while iseq.nextSeq():
                (iname, icdna) = iseq.getSeq()
                musg = rje.matchExp('gene:(\S+)', iname)[0]
                for entry in mdb.indexEntries('EnsEMBL', musg):
                    if entry['ENST']: entry['ENST'] += ',%s' % string.split(iname)[0]
                    else: entry['ENST'] = string.split(iname)[0]
            mdb.saveToFile()
        ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~ ###
        sdb = self.db('starts')
        sdb.dataFormat({'Init Codon [nt]': 'int'})
        icod = 'Init Codon [nt]'            # Field: start position (nt) within transcript
        icon = 'Init Context [-3 to +4]'    # Field: 7nt context expected at the start site
        sdb.info['Name'] = 'mapped_start'
        sdb.addField('ENST')
        sdb.addField('ENSP')
        sdb.addField('ENSI')
        ENST = open('IngExact.cdna.all.fa', 'w')
        ENSP = open('IngExact.pep.all.fa', 'w')
        ex = 0.0
        etot = sdb.entryNum()
        sx = 0          # Successfully mapped starts
        fx = 0          # Failed mappings
        minpep = 20     # Minimum accepted peptide length
        for entry in sdb.entries():
            self.progLog('\r#ING', 'Mapping Ignolia Harrington Starts: %.2f%%' % (ex / etot))
            ex += 100.0
            #self.deBug(entry)
            entry[icon] = entry[icon].upper()
            gene = entry['Gene'].upper()
            mentry = mdb.data(gene)
            entry['ENST'] = entry['ENSI'] = ''
            cdnaseq = peptseq = ''
            if not mentry or not mentry['ENST']: fx += 1; continue  # No transcripts mapped
            #self.deBug(mentry)
            mtype = 'fail'
            # Try each transcript; keep the one whose context matches and yields the longest ORF.
            for trans in string.split(mentry['ENST'], ','):
                (tname, tseq) = iseq.getDictSeq(trans, format='tuple')
                self.deBug('%s vs %s' % (tseq[entry[icod] - 3:][:7], entry[icon]))
                if tseq[entry[icod] - 3:][:7] == entry[icon]:
                    ipept = string.split(rje_sequence.dna2prot(tseq[entry[icod]:]), '*')[0]
                    self.deBug(ipept)
                    if len(ipept) > len(peptseq):
                        entry['ENST'] = trans
                        cdnaseq = tseq
                        peptseq = ipept
                        mtype = 'exact'
            if not entry['ENST']:
                self.printLog('\r#ING', 'Unable to find Harrington start for %s %s (%s)' % (gene, entry[icod], entry[icon]), screen=False)
                fx += 1
                continue
            elif len(peptseq) < minpep:
                self.printLog('\r#ING', 'Peptide from mapped Harrington start for %s %s (%s) too short!' % (gene, entry[icod], entry[icon]), screen=False)
                fx += 1
                continue
            id = rje.preZero(int(ex / 100), etot)   # NOTE(review): shadows the builtin id()
            entry['ENSI'] = 'ENSINGT%s' % id
            entry['ENSP'] = 'ENSINGP%s' % id
            ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' % (id, mtype, entry['ENST'], mentry['EnsEMBL'], entry['Gene'], mentry['Gene'], cdnaseq))
            ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' % (id, mtype, entry['ENST'], mentry['EnsEMBL'], id, entry['Gene'], mentry['Gene'], peptseq))
            sx += 1
        sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
        ENST.close()
        ENSP.close()
        self.printLog('\r#ING', 'Output %s Ingolia peptides and transcripts. %s failed.' % (rje.iStr(sx), rje.iStr(fx)))
        return
    except: self.errorLog('%s.method error' % self)
def peptAlign(self,regex=None,peptides=[],peptdis=None,termini=None,save=False):    ### Align peptides using regular expression
    '''
    Align peptides using regular expression. Generates gapped length variants of each
    peptide (either SLiM-free, by inserting gaps at all allowed positions, or guided by a
    SLiM regex with variable-length wildcard spacers), then iteratively picks the variant
    of each peptide with the lowest mean peptide distance to the already-resolved set.
    Fully gapped columns are removed before returning.
    >> regex:str [None] = Regular expression to use for alignment of peptides. 'T'/'TRUE'
       triggers SLiM-free alignment.
    >> peptides:list [] = List of peptides to align using regex.
       NOTE(review): mutable default argument, and this list IS mutated (failed peptides
       are removed) — callers passing their own list (or relying on self.list['Peptides'])
       will see it shrink. Confirm this in-place filtering is intended.
    >> peptdis:str [None] = Peptide distance method to use first.
    >> termini:bool [None] = Whether peptides for alignment have termini (^ & $) or X
       flanking regex match.
    >> save:bool [True] = Whether to save aligned peptides to *.aligned.txt.
    << Returns the list of aligned (gapped) peptides.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.obj['PeptDis']: self.setup()
        failx = 0   # Number of failures
        ## ~ [0a] Setup method attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not peptides: peptides = self.list['Peptides']
        if not regex:
            if self.getStrLC('PeptAlign'): regex = self.getStrUC('PeptAlign')
            else: return peptides[0:]   # No regex: return unaligned copy
        if termini == None: termini = self.getBool('Termini')
        if not peptdis: peptdis = self.getStrLC('PeptDis')
        ## ~ [0b] Setup SLiM and alignment attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if regex in ['T','TRUE']:   # SLiM-free alignment
            maxlen = 0; maxgapx = 0
            slimvar = {}    # Dictionary of {gapx:gap pos}
            for pept in peptides: maxlen = max(maxlen,len(pept))
            for pept in peptides: maxgapx = max(maxgapx,maxlen-len(pept))
            maxgapx = min(maxgapx,self.getInt('MaxGapX'))   # Cap gaps per peptide
            # Pre-generate, per gap count, the unique descending gap-position combinations.
            for gapx in range(1,maxgapx+1):
                slimvar[gapx] = []
                peptlen = maxlen - gapx
                if termini: gapcombos = rje.listCombos([range(peptlen)[1:-1]] * gapx,maxrep=self.getInt('MaxGapVar'))   # No gaps at termini
                else: gapcombos = rje.listCombos([range(peptlen)] * gapx,maxrep=self.getInt('MaxGapVar'))
                for gapvar in gapcombos[0:]:
                    gapvar.sort(); gapvar.reverse()     # Descending order so later insertions don't shift earlier positions
                    if gapvar not in slimvar[gapx]: slimvar[gapx].append(gapvar)
                self.printLog('#GAPX','PeptLen: %d; MaxLen: %d; Termini: %s => %s x %d gap variants.' % (peptlen,maxlen,termini,rje.iLen(slimvar[gapx]),gapx))
            #slimvar = range(maxlen)
            #if termini: slimvar = slimvar[1:-1]    # All possible positions for gaps. Generate combos as needed.
        else:
            #!# Need to deal with multiple regex?! (Use one with most matches and only keep that one?!)
            if rje_slim.needToSplitPattern(regex):
                # Multiple alternative patterns: keep the split matching the most peptides.
                splits = rje_slim.splitPattern(regex)
                self.printLog('#SPLIT','%s => %s' % (regex,string.join(splits,' | ')))
                newregex = ''; bestpep = []
                for regsplit in splits:
                    regexpep = []
                    for pept in peptides:
                        if termini and rje.matchExp('(%s)' % regsplit,pept[1:-1]): regexpep.append(pept)
                        elif not termini and rje.matchExp('(%s)' % regsplit,pept): regexpep.append(pept)
                    if len(regexpep) > len(bestpep): bestpep = regexpep[0:]; newregex = regsplit
                self.printLog('#REGEX','%s => %s (%d/%d peptides)' % (regex,newregex,len(bestpep),len(peptides)))
                regex = newregex
                for pept in peptides[0:]:
                    if pept not in bestpep: self.warnLog('%s does not match %s!' % (pept,regex)); peptides.remove(pept); failx += 1
            slim = rje_slim.slimFromPattern(regex)
            self.printLog('#GUIDE','SLiM Guide: %s' % slim)
            slimpos = string.split(slim,'-')    # Alternating defined positions and wildcard spacers
            maxlen = rje_slim.slimLen(slim)
            if regex.startswith('^'): maxlen -= 1
            if regex.endswith('$'): maxlen -= 1
            slimvar = []    # Make variants of SLiMs (wildcard spacers only)
            w = 1
            while len(slimpos) > w: slimvar.append(slimpos[w]); w += 2  # Add wildvar spacers only (odd indices)
            maxvar = []
            for var in slimvar: maxvar.append(var[-1])  # Smallest number of wildcards: used to assess - to add
            slimvar = rje.listCombos(slimvar)   # Returns all possible combinations, used for building variants
            if termini: maxlen += 2
        ## ~ [0c] Setup Peptide Distance methods ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        dismethods = ['id','prop','pam']
        if peptdis:
            # Move the requested distance method to the front of the trial order.
            try: dismethods.insert(0,dismethods.pop(dismethods.index(peptdis)))
            except: self.warnLog('PeptDis method "%s" not recognised.' % peptdis)
        ### ~ [1] Cycle through peptides and make variants ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        variants = {}       # Dictionary of {peptide:[variants]}
        singletons = []     # Peptides with single variants
        for pept in peptides[0:]:
            self.progLog('\r#VAR','%s peptides: %s singletons; %s with possible variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
            variants[pept] = []
            ## ~ [1a] ~ Make list of peptide length variants adding - at all possible positions ~ ##
            if regex in ['T','TRUE']:   # SLiM-free alignment
                gapx = maxlen - len(pept)
                if gapx > self.getInt('MaxGapX'):
                    self.warnLog('Peptide %s exceeds MapGapX=%d; rejected.' % (pept,self.getInt('MaxGapX')))
                    peptides.remove(pept)
                    continue
                self.bugPrint(slimvar)
                self.debug('%s: %s vs %s = %d' % (pept,len(pept),maxlen,gapx))
                if gapx:    # Try all gap combinations
                    for gapvar in slimvar[gapx]:    #rje.listCombos([slimvar] * gapx):
                        peptvar = pept
                        for gap in gapvar: peptvar = peptvar[:gap] + '-' + peptvar[gap:]
                        if peptvar not in variants[pept]: variants[pept].append(peptvar)
                else: variants[pept] = [pept]   # Already full length: single variant
            ## ~ [1b] ~ Make list of peptide length variants, adding - to regex wildvar positions ~ ##
            else:
                for var in slimvar:
                    peptvar = ''    # Add new variant
                    i = 0           # Position within pept
                    if termini: peptvar += pept[i]; i += 1
                    if regex[0] != '^': peptvar += pept[i]; i += 1
                    for wi in range(len(var)):
                        wy = int(maxvar[wi]); wx = int(var[wi])
                        if wx: peptvar += pept[i:i+wx]; i += wx
                        peptvar += '-' * (wy - wx)  # Add a number of gaps equal to maxvar for same position minus slimvar
                        if i >= len(pept): break
                        peptvar += pept[i]; i += 1  # Next defined (non-wildcard) position
                    # Keep variants that match regex and maxlen
                    if termini:
                        if regex[-1] != '$':
                            if i < len(pept): peptvar += pept[i]; i += 1
                        rmatch = rje.matchExp('(%s)' % regex,peptvar[1:-1])
                        self.bugPrint('%s vs %s: %s' % (regex,peptvar[1:-1],rmatch))
                        self.debug('%s vs %s and %s vs %s' % (i,len(pept),len(peptvar),maxlen))
                        keepvar = i == len(pept) and rmatch and len(peptvar) == maxlen
                    else: keepvar = i == len(pept) and rje.matchExp('(%s)' % regex,peptvar) and len(peptvar) == maxlen
                    self.bugPrint('%s %s: %s x %s => %s = %s' % (regex,slim,var,pept,peptvar,keepvar))
                    if keepvar and peptvar not in variants[pept]: variants[pept].append(peptvar)
            ## ~ [1c] ~ Check Peptide variants ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if len(variants[pept]) == 1: singletons.append(variants.pop(pept)[0])
            elif not variants[pept]: self.warnLog('No %s variants match %s!' % (pept,regex)); variants.pop(pept); failx += 1
        self.printLog('#VAR','%s peptides: %s singletons; %s with possible variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
        self.debug(singletons)
        self.debug(variants)
        ### ~ [2] ~ Sort peptides by increasing numbers of variants ~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # For 2+ variants, rank by mean PeptDist versus single variants
        # Keep best (including ties) and cycle
        # Iterate until no more variants filtered
        # If variants remain, switch score method and iterate again
        # If variants remain after all score methods, keep first variant
        peptdis = dismethods.pop(0)
        while variants:
            self.progLog('\r#VAR','%s peptide tidy: %s singletons; %s with variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
            if self.obj['PeptDis']: self.obj['PeptDis'].dict['Matrix'] = {}     # Reset cached distances
            comppept = singletons[0:]   # Comparison set: resolved peptides...
            prevarx = 0; postvarx = 0
            if not comppept: comppept = rje.listJoin(variants.values(),sortunique=True)     # ...else all variants
            for pept in rje.sortKeys(variants):
                self.progLog('\r#VAR','%s peptide tidy: %s singletons; %s with variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
                scores = {}; prevarx += len(variants[pept])
                for peptvar in variants[pept]:
                    dis = 0.0
                    for pep2 in comppept:
                        if termini: dis += self.peptDist(peptvar[1:-1],pep2[1:-1],peptdis)
                        else: dis += self.peptDist(peptvar,pep2,peptdis)
                    dis /= len(comppept)    # Mean distance to comparison set
                    if dis not in scores: scores[dis] = []
                    scores[dis].append(peptvar)
                variants[pept] = scores.pop(rje.sortKeys(scores)[0])    # Keep lowest scoring variant(s)
                if len(variants[pept]) == 1: singletons.append(variants.pop(pept)[0])
                else: postvarx += len(variants[pept])
            self.printLog('#PDIS','%s distances: %s => %s variants.' % (peptdis,prevarx,postvarx))
            if prevarx == postvarx:     # No progress: try the next distance method or give up
                if dismethods: peptdis = dismethods.pop(0)
                else: break
        self.printLog('#VAR','%s peptides tidied: %s singletons; %s with variants.' % (rje.iLen(peptides),rje.iLen(singletons),rje.iLen(variants)))
        if variants:
            self.warnLog('Unable to select all variants using distances.')
            self.printLog('#VAR','Arbitrary variants picked for %s peptides' % rje.iLen(variants))
            for pept in rje.sortKeys(variants): singletons.append(variants.pop(pept)[0])
        ### ~ [3] Remove 100% gapped positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Iterate columns right-to-left so deletions don't shift pending indices.
        for i in range(maxlen-1,-1,-1):
            degap = True
            for pept in singletons:
                if pept[i] != '-': degap = False; break
            if degap:
                for p in range(len(singletons)): singletons[p] = singletons[p][:i] + singletons[p][i+1:]
        ### ~ [4] Save and/or return peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if save:
            open('%s.aligned.txt' % self.baseFile(),'w').write(string.join(singletons,'\n'))
            self.printLog('#OUT','%s aligned peptides output to %s.aligned.txt' % (rje.iLen(singletons),self.baseFile()))
        return singletons
    except: self.errorLog('%s.peptAlign error' % self); raise
def forking(self):  ### Keeps forking out and processing jobs until no more jobs in self.list['Forked'].
    '''
    Keeps forking out and processing jobs until no more jobs in self.list['Forked'].
    Polls each forked PID with os.waitpid(WNOHANG); finished forks are removed and handed
    to self.endFork() (which presumably starts the next queued job — confirm). Optionally
    mirrors active PIDs to a *.pid file. If no fork finishes within 'KillForks' seconds,
    prompts (or, non-interactively, uses 'KillMain') to either abort the main thread via
    ValueError, kill the hanging fork processes, or reset the timer and keep waiting.
    '''
    ### ~ [1] ~ Start first set of jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if self.getBool('PIDCheck') or self.dev(): pidcheck = '%s.pid' % rje.baseFile(self.log.info['LogFile'])     # Set *.pid object to match log
    else: pidcheck = None
    #self.deBug(pidcheck)
    ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~ ###
    while self.list['Forked']:
        if not self.getBool('LogFork'):
            self.progLog('\r#FORK','Forking jobs: {0} running; {1} remain.'.format(len(self.list['Forked']),rje.iLen(self.list['ToFork'])))
        if pidcheck: PIDCHECK = open(pidcheck,'w')
        # Iterate over a copy: finished forks are removed from the live list below.
        for fdict in self.list['Forked'][0:]:
            try:
                pid = fdict['PID']
                if pidcheck: PIDCHECK.write('%s: %s\n' % (self.list['Forked'].index(fdict),pid))
                # A 'WAIT ...' placeholder PID counts as finished; otherwise poll without blocking.
                if string.split('%s' % pid)[0] == 'WAIT': status = 1
                else: (status,exit_stat) = os.waitpid(pid,os.WNOHANG)
            except:
                self.errorLog('!')
                status = 1  # Treat polling errors as a finished fork
            if status > 0:
                self.list['Forked'].remove(fdict)
                self.endFork(fdict)     # Fork has finished: can replace with processing
        if pidcheck:
            PIDCHECK.close()
            #self.deBug(open(pidcheck,'r').read())
        ## ~ [2a] Look for eternal hanging of threads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if time.time() - self.getNum('KillTime') > self.getNum('KillForks'):
            self.verbose(0,1,'\n%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'),len(self.list['Forked'])),1)
            for fdict in self.list['Forked']:
                self.verbose(0,2,' => Fork %s, PID %d still Active!' % (fdict['ID'],fdict['PID']),1)
            # Non-interactive (i() < 0): behaviour driven by the KillMain setting.
            if (self.i() < 0 and self.getBool('KillMain')) or rje.yesNo('Kill Main Thread?',default={True:'N',False:'Y'}[self.getBool('KillMain')]):
                raise ValueError('%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'),len(self.list['Forked'])))
            elif self.i() < 0 or rje.yesNo('Kill hanging forks?'):
                self.printLog('#KILL','KillForks=%d seconds walltime reached.' % (self.getNum('KillForks')))
                for fdict in self.list['Forked']:
                    self.printLog('#KILL','Killing Fork %s, PID %d.' % (fdict['ID'],fdict['PID']))
                    os.system('kill %d' % fdict['PID'])
            else: self.setNum({'KillTime': time.time()})    # Keep waiting: reset the inactivity timer
        ## ~ [2b] Sleep ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        time.sleep(self.getNum('ForkSleep'))
def _peptideProbabilities(self):    ### Read in peptides and positives and calculate probability of return
    '''
    Read in peptides and positives and calculate probability of return.
    Digests the positive protein sequences with the configured protease (self.getStr('PepCut')),
    bins the resulting peptides by length (and optionally by molecular weight bins of 100 Da
    when PepMWt=True), counts how many appear in the observed peptide list versus not, and
    stores Positive/Negative counts plus Prob = Positive/(Positive+Negative) per bin in a
    'PepProb' database table saved as *.pep_prob.tdt. An existing table is reloaded unless
    Force=True. Returns the PepProb table, False if input files are missing, or None on error.
    '''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getBool('CysWeight'): return self._cysteinePeptideProbabilities()   # Cysteine-weighted alternative
        self._positiveAndNegativePeptides()
        #return self.printLog('#NOPROB','Probability calculation temporarily suspended')
        pfile = '%s.pep_prob.tdt' % self.basefile()
        if rje.exists(pfile) and not self.getBool('Force'):
            # Reload the previously calculated table; fall through to recalculate on any error.
            try:
                pdb = self.db().addTable(pfile,['PepSize'],name='PepProb')
                pdb.dataFormat(reformat={'PepSize':'num','Positive':'int','Negative':'int','Prob':'num'})
                # PepSize < 100 entries are length bins (ints); >= 100 are MWt bins (floats).
                for entry in pdb.entries():
                    if entry['PepSize'] < 100: entry['PepSize'] = int(entry['PepSize'])
                return pdb
            except: pass
        pdb = self.db().addEmptyTable('PepProb',['PepSize','Positive','Negative','Prob'],['PepSize'])
        if not rje.exists(self.getStr('Peptides')) or not rje.exists(self.getStr('Positives')): return False
        ## ~ [0a] ~ Load Peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        peplist = self.loadFromFile(self.getStr('Peptides'),chomplines=True)
        ## ~ [0b] ~ Load Positives ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % self.getStr('Positives'),'autoload=T'])
        ### ~ [1] ~ Digest Positives and Update PepProb Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        protease = self.getStr('PepCut')
        ## ~ [1a] ~ Create new database entry to fill with data ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        edict = {}  # {PepSize bin: table entry} — int keys for length bins, float keys for MWt bins
        for i in range(1,self.getInt('MaxPepLen')+1):
            edict[i] = pdb.addEntry({'PepSize':i,'Positive':0,'Negative':0,'Prob':1.0})
            if self.getBool('PepMWt'): edict[i*100.0] = pdb.addEntry({'PepSize':i*100.0,'Positive':0,'Negative':0,'Prob':1.0})
        ## ~ [1b] ~ For each recognition site of each protease, mark cuts with ":" ~~~~~~ ##
        poslist = []; neglist = []; sx = 0.0; stot = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#PEP','Processing positive proteins (%s): %.2f%%' % (protease,sx/stot)); sx += 100.0
            sequence = seq.getSequence()
            # Replace each uncut recognition site with its ':'-marked version, then split on ':'.
            for cut in proteases[protease]: sequence = string.join(string.split(sequence,string.replace(cut,':','')),cut)
            frag = string.split(sequence,':')
            while '' in frag: frag.remove('')
            if not self.getBool('NTerm'): frag = frag[1:]   # Optionally drop the N-terminal fragment
            for pep in frag[0:]:
                if self.getBool('NRPep') and pep in self.list['Redundant']: continue    # Skip non-unique peptides
                if pep not in poslist: poslist.append(pep)
        self.printLog('\r#PEP','Processed positive proteins (%s): %s peptides' % (protease,rje.iLen(poslist)))
        ## ~ [1c] ~ Process fragments ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        px = 0.0; ptot = len(poslist)
        for pep in poslist[0:]:     # Iterate a copy: unobserved peptides are moved to neglist below
            self.progLog('\r#PEP','Processing positive peptides (%s): %.2f%%' % (protease,px/ptot)); px += 100.0
            plen = min(len(pep),self.getInt('MaxPepLen'))   # Longer peptides collapse into the top bin
            if pep in peplist: edict[plen]['Positive'] += 1
            else: edict[plen]['Negative'] += 1; poslist.remove(pep); neglist.append(pep)
            if self.getBool('PepMWt'):
                # MWt bin: rounded up to the next 100 Da, capped at MaxPepLen*100.
                pwt = 100.0 * min(int((rje_sequence.MWt(pep)+99)/100.0),self.getInt('MaxPepLen'))
                if pep in peplist: edict[pwt]['Positive'] += 1
                else: edict[pwt]['Negative'] += 1
        self.printLog('\r#PEP','Processing positive peptides (%s) complete.' % protease)
        ## ~ [1d] # Calculate peptide probabilities for protease combo ~~~~~~~~~~~~~~~~~~ ##
        for entry in edict.values():
            try: entry['Prob'] = float(entry['Positive']) / float(entry['Positive']+entry['Negative'])
            except: entry['Prob'] = 0.0     # Empty bin (division by zero): probability zero
        ### ~ [2] ~ Save File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pdb.saveToFile(pfile)
        return pdb
    except: self.errorLog('Problem during %s._peptideProbabilities().' % self); return None     # Setup failed
def pileUpStats(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''
    Calculates statistics of genetic differences from parsed PileUp Tables.

    Reads the parsed *.WT.tdt and *.Mut.tdt pileup tables, then writes one row per
    reference position to *.pdiff.tdt with binomial p-values for the mutant WT-allele
    frequency being over/under the WT major-allele frequency (p.Over, p.Under) plus a
    two-tailed p.Diff, before applying FDR correction via self.pileUpFDR().
    If *.pdiff.tdt already exists and force=F, skips straight to pileUpFDR().
    Returns None on error (logged); otherwise the pileUpFDR() result or None.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        statfile = '%s.pdiff.tdt' % self.baseFile()
        if not self.force() and os.path.exists(statfile): return self.pileUpFDR()
        ## ~ [0a] Load WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        wtdata = {}     # Load lists of data for compiling: locus -> {field -> per-position list}
        for locus in self.dict['RefSeq']:
            wtdata[locus] = {}
            for field in ['N','QN','MajFreq']: wtdata[locus][field] = []
        WTDATA = open('%s.WT.tdt' % self.baseFile(),'r'); wx = 1   # wx = next expected position
        fields = []     # Header row; empty until the first line has been read
        for line in WTDATA:
            data = rje.readDelimit(line)
            if fields:
                locus = data[0]
                pos = int(data[1])
                # Pad positions absent from the table (zero mapped reads) with zero counts/frequencies.
                # NOTE(review): wx is never reset per locus — padding looks correct only for a single
                # locus or strictly continuing positions across loci; confirm against input format.
                while pos > wx: wtdata[locus]['N'].append(0); wtdata[locus]['QN'].append(0); wtdata[locus]['MajFreq'].append(0.0); wx += 1
                for field in ['N','QN']: wtdata[locus][field].append(int(data[fields.index(field)]))
                for field in ['MajFreq']: wtdata[locus][field].append(string.atof(data[fields.index(field)]))
                wx += 1
            else: fields = data[0:]
        WTDATA.close()
        ## ~ [0b] Load Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        mutdata = {}    # Load lists of data for compiling: locus -> {field -> per-position list}
        for locus in self.dict['RefSeq']:
            mutdata[locus] = {}
            for field in ['N','QN','Major','MajFreq','WTFreq']: mutdata[locus][field] = []
        MUTDATA = open('%s.Mut.tdt' % self.baseFile(),'r'); mx = 1   # mx = next expected position
        fields = []
        for line in MUTDATA:
            data = rje.readDelimit(line)
            if fields:
                locus = data[0]
                self.str['RefSeq'] = self.dict['RefSeq'][locus]
                pos = int(data[1])
                # Extend or patch the stored reference sequence from the pileup: pad short
                # references with '?' then append, and replace existing '?' placeholders
                # with the reference base from column 3 (data[2]).
                try:
                    if pos > len(self.str['RefSeq']):
                        while (pos-1) > len(self.str['RefSeq']): self.str['RefSeq'] += '?'
                        self.str['RefSeq'] += data[2]
                        self.dict['RefSeq'][locus] = self.str['RefSeq']
                    elif self.str['RefSeq'][pos-1] == '?':
                        self.str['RefSeq'] = self.str['RefSeq'][:pos-1] + data[2] + self.str['RefSeq'][pos:]
                        self.dict['RefSeq'][locus] = self.str['RefSeq']
                except: self.warnLog('Problem mapping Pos %s onto %snt %s RefSeq' % (rje.iStr(pos),locus,rje.iLen(self.str['RefSeq'])))
                # Pad skipped positions with null values ('-' Major, zero counts/frequencies).
                while pos > mx: mutdata[locus]['N'].append(0); mutdata[locus]['QN'].append(0); mutdata[locus]['Major'].append('-'); mutdata[locus]['MajFreq'].append(0.0); mutdata[locus]['WTFreq'].append(0.0); mx += 1
                for field in ['N','QN']: mutdata[locus][field].append(int(data[fields.index(field)]))
                for field in ['MajFreq','WTFreq']: mutdata[locus][field].append(string.atof(data[fields.index(field)]))
                for field in ['Major']: mutdata[locus][field].append(data[fields.index(field)])
                mx += 1
            else: fields = data[0:]
        MUTDATA.close()
        ## ~ [0c] Integrity check ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Need a new check with locus info #!#
        #for field in wtdata:    #!# Won't be true - not all reference genome positions present in output (0 mapped reads)
        #    if len(wtdata[field]) != len(self.str['RefSeq']): self.errorLog('Data length mismatch for WT %s' % field,printerror=False); raise ValueError
        #for field in mutdata:    #!# Won't be true - not all reference genome positions present in output (0 mapped reads)
        #    if len(mutdata[field]) != len(self.str['RefSeq']): self.errorLog('Data length mismatch for Mutant %s' % field,printerror=False); raise ValueError
        #self.printLog('#REF','WT and Mutant data for %s reference positions' % rje.iLen(self.str['RefSeq']))
        ### ~ [1] Assess and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'w')
        headers = ['Locus','Pos','Ref','WT.N','WT.QN','WT.Major','WT.MajFreq','Mut.N','Mut.QN','Mut.Major','Mut.MajFreq','Mut.WTFreq','p.Over','p.Under','p.Diff']
        SAMSIG.write('%s\n' % string.join(headers,'\t'))
        nodifx = 0; nomutx = 0; sx = 0  # Counters: skipped (majdif=T), skipped (majmut=T), rows output
        for locus in rje.sortKeys(self.dict['RefSeq']):
            self.str['RefSeq'] = self.dict['RefSeq'][locus]
            self.list['WTMajor'] = self.dict['WTMajor'][locus]
            for i in range(len(self.str['RefSeq'])):
                # Assemble the descriptive fields; positions missing from either table are skipped.
                try: sigdata = [locus,i+1,self.str['RefSeq'][i],wtdata[locus]['N'][i],wtdata[locus]['QN'][i],self.list['WTMajor'][i],wtdata[locus]['MajFreq'][i],
                                mutdata[locus]['N'][i],mutdata[locus]['QN'][i],mutdata[locus]['Major'][i],mutdata[locus]['MajFreq'][i],mutdata[locus]['WTFreq'][i]]
                except: self.warnLog('Incomplete data for %s:%s (no pdiff output)' % (locus,rje.iStr(i+1))); continue
                # Optional filters: skip positions where major alleles agree (majdif=T) or where
                # the mutant major allele matches the reference (majmut=T).
                if self.getBool('MajDif') and self.list['WTMajor'][i] == mutdata[locus]['Major'][i]: nodifx += 1; continue # Was: sigdata += [1.0,1.0]
                elif self.getBool('MajMut') and self.str['RefSeq'][i] == mutdata[locus]['Major'][i]: nomutx += 1;continue
                elif not wtdata[locus]['MajFreq'][i]:   # No Data for WT
                    if mutdata[locus]['WTFreq'][i]: sigdata += [0.0,1.0]
                    else: sigdata += [1.0,1.0]
                elif mutdata[locus]['WTFreq'][i] > wtdata[locus]['MajFreq'][i]:
                    # p.Over: binomial probability of observing >= obs WT reads given the WT major frequency.
                    obs = int((mutdata[locus]['QN'][i] * mutdata[locus]['WTFreq'][i]) + 0.5)
                    sigdata.append(rje.binomial(obs,mutdata[locus]['QN'][i],wtdata[locus]['MajFreq'][i],usepoisson=False,callobj=self))
                    sigdata.append(1.0)
                elif mutdata[locus]['WTFreq'][i] < wtdata[locus]['MajFreq'][i]:
                    # p.Under: complement of the binomial probability of >= obs+1 WT reads.
                    obs = int((mutdata[locus]['QN'][i] * mutdata[locus]['WTFreq'][i]) + 0.5)
                    sigdata.append(1.0)
                    sigdata.append(1.0 - rje.binomial(obs+1,mutdata[locus]['QN'][i],wtdata[locus]['MajFreq'][i],usepoisson=False,callobj=self))
                else: sigdata += [1.0,1.0]
                sigdata.append(min(1.0,2*min(sigdata[-2:])))    # p.Diff: doubled smaller tail, capped at 1.0
                rje.writeDelimit(SAMSIG,sigdata); sx += 1
        SAMSIG.close()
        ptxt = '%s lines output to *.pdiff.txt' % rje.iStr(sx)
        if self.getBool('MajDif'): ptxt += '; %s positions skipped where WTMajor==MutMajor (majdif=T)' % rje.iStr(nodifx)
        if self.getBool('MajMut'): ptxt += '; %s positions skipped where Ref==MutMajor (majmut=T)' % rje.iStr(nomutx)
        self.printLog('#PDIFF','%s.' % ptxt)
        ### ~ [2] FDR Correction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.pileUpFDR()
    except: self.errorLog('%s.pileUpStats() error' % (self)); return None