def makeGOFile(self):   ### Maps GO to sequences and outputs table for R analysis
    '''
    Maps GO terms to motif occurrence genes and outputs a delimited table for R analysis.

    Works through self.dict['Occ'], which is indexed as [motif][type][gene] and holds a list of occurrence
    dictionaries per gene. Occurrence types with fewer than MinOcc genes are popped from self.dict['Occ'] in
    place. A motif is only output if it still has an 'ELM' type and at least two types in total. For each
    remaining type, GO terms returned by self.ensGO(gene) are collected and those hit by fewer than MinOcc genes
    are dropped; one row is written per remaining GO/gene/occurrence. Errors are passed to self.log.errorLog().
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outfile = '%s.goer.tdt' % self.info['ResFile']
        headers = ['GOID','Motif','Type','Gene','Cons','HomNum','GlobID','LocID','Hyd','SA']
        rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
        ### ~ [2] ~ Work through dictionary and output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        occ = self.dict['Occ']
        mtot = len(occ)
        for (mi,motif) in enumerate(rje.sortKeys(occ)):
            prog = (100.0 * mi) / mtot      # Percentage of motifs completed before this one
            self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|CheckSeq) ' % (outfile,prog,motif))
            motifocc = occ[motif]
            ## ~ [2a] ~ Check MinOcc in terms of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for occtype in rje.sortKeys(motifocc):
                if len(motifocc[occtype]) < self.stat['MinOcc']:
                    motifocc.pop(occtype)
            if 'ELM' not in motifocc or len(motifocc) < 2:
                continue
            for occtype in motifocc:
                ## ~ [2b] ~ Map GO terms and check MinOcc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Check%s) ' % (outfile,prog,motif,occtype))
                godict = {}     # Temp dictionary of {GOID:[Genes]}
                for gene in motifocc[occtype]:
                    for go in self.ensGO(gene):
                        godict.setdefault(go,[]).append(gene)
                self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|OccGO%s) ' % (outfile,prog,motif,occtype))
                for go in rje.sortKeys(godict):
                    if len(godict[go]) < self.stat['MinOcc']:
                        godict.pop(go)
                ## ~ [2c] ~ Output remaining GO terms occurrences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Output%s)' % (outfile,prog,motif,occtype))
                for go in rje.sortKeys(godict):
                    for gene in godict[go]:
                        for occdict in motifocc[occtype][gene]:
                            fixed = {'GOID':'GO:%s' % go,'Motif':motif,'Type':occtype,'Gene':gene}
                            datadict = rje.combineDict(fixed,occdict)
                            rje.delimitedFileOutput(self,outfile,headers,datadict=datadict)
            self.printLog('#OUT','Output for %s %s complete.' % (motif,rje.sortKeys(self.dict['Occ'][motif])),screen=False)
        self.printLog('\r#OUT','Generating %s output complete! ' % (outfile))
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
def saveFasta(self):    ### Outputs parsed PPI datasets in Fasta format
    '''
    Outputs parsed PPI datasets in Fasta format.

    Creates the dataset directory (self.info['OutDir'] plus 'HPRD_Datasets/'), reports any PPI hub whose
    self.dict['HPRD'] entry has no 'Seq', saves all sequences with self.obj['SeqList'].saveFasta(), and then
    saves one '<gene>_hprd.fas' file per hub holding the sequences of its interactors. With the AllIso option
    the 'Seq' value of each interactor is added with += (so is expected to be a list); otherwise it is appended
    as a single item. Errors are logged through self.log.errorLog() and not re-raised.
    '''
    try:
        ### Setup ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Datasets/')
        rje.mkDir(self, datpath)
        ## Check Seqs ##
        for p1 in rje.sortKeys(self.dict['PPI']):
            if 'Seq' not in self.dict['HPRD'][p1]:     #!# KeyError #!#
                # Formatted print() call: same text as the old "print a, b" statement but also valid in Python 3
                print('%s %s' % (p1, self.dict['HPRD'][p1]))
                self.deBug('No Seq for %s' % p1)
        ### All sequences ###
        self.obj['SeqList'].saveFasta()
        ### Output PPI Datasets ###
        for p1 in rje.sortKeys(self.dict['PPI']):
            mylist = []
            for p2 in self.dict['PPI'][p1]:
                if self.opt['AllIso']:
                    mylist += self.dict['HPRD'][p2]['Seq']
                else:
                    mylist.append(self.dict['HPRD'][p2]['Seq'])
            sfile = '%s%s_hprd.fas' % (datpath, self.dict['HPRD'][p1]['gene'])
            if mylist:
                self.obj['SeqList'].saveFasta(seqs=mylist, seqfile=sfile)
        self.log.printLog('#FAS', 'HPRD PPI fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.saveFasta()', printerror=True, quitchoice=False)
def ddi(self):  ### Domain-domain interactions
    '''
    Domain-domain interactions.

    For every domain pair in self.dict['DDI'], removes from self.dict['PPI'] each hub->spoke link where the hub
    is listed in self.dict['Domain'] for one domain of the pair and the spoke is listed for the other (both
    directions are screened). Hubs with an empty name or no interactors left are then popped from
    self.dict['PPI']. Returns without doing anything if self.dict['DDI'] or self.dict['Domain'] is empty.
    Errors are reported via self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        removed = [0]   # Single-item list so the nested helper can update the running total
        dtot = len(self.dict['DDI'])
        if not self.dict['DDI'] or not self.dict['Domain']:
            return

        def prune(hubdom, spokedom):
            '''Removes spokes listed for spokedom from the PPI lists of hubs listed for hubdom.'''
            for hub in self.dict['Domain'][hubdom]:
                if hub not in self.dict['PPI']:
                    continue
                for spoke in list(self.dict['PPI'][hub]):   # Iterate over a copy: list is edited in loop
                    if spoke in self.dict['Domain'][spokedom]:
                        removed[0] += 1
                        self.dict['PPI'][hub].remove(spoke)

        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for (di, dom) in enumerate(rje.sortKeys(self.dict['DDI'])):
            prog = (100.0 * di) / dtot
            self.progLog('\r#DDI','Screening domain-domain interactions: %.1f%%; %s removed' % (prog,rje.integerString(removed[0])))
            if dom not in self.dict['Domain']:
                self.printLog('#DOM','No sequences with "%s" domains' % dom)
                continue
            for partner in self.dict['DDI'][dom]:
                if partner not in self.dict['Domain']:
                    continue
                prune(dom, partner)
                prune(partner, dom)
        self.printLog('\r#DDI','Screening domain-domain interactions complete: %s removed.' % (rje.integerString(removed[0])))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if not hub or not self.dict['PPI'][hub]:
                self.dict['PPI'].pop(hub)
                self.printLog('#DDI','No %s interactions left after DDI removed' % hub,screen=False)
        self.printLog('#PPX','%s of %s PPI hubs remain after DDI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
    except:
        self.errorLog('Problem with SLiMPID.ddi()',quitchoice=True)
def dpi(self):  ### Domain-protein interactions
    '''
    Domain-protein interactions.

    For each domain in self.dict['Domain'], pools the self.dict['PPI'] interactors of all hubs listed for that
    domain and keeps those seen two or more times in the pool. These shared interactors are removed from the PPI
    lists of the domain's hubs (and the hub from the interactor's own list, if present), and their accession
    numbers (from self.dict['Seq'][name].info['AccNum']) are written to 'SLiMPID_DPI/<domain>.dpi.acc'. Names
    missing from self.dict['Seq'] are reported as "bad". Hubs with an empty name or no interactors left are then
    popped from self.dict['PPI']. Returns without doing anything if self.dict['Domain'] is empty.
    Errors are reported via self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.dict['Domain']:
            return
        outdir = 'SLiMPID_DPI'
        rje.mkDir(self, outdir)
        badname = []
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for dom in rje.sortKeys(self.dict['Domain']):
            ppi = self.dict['PPI']
            hubs = self.dict['Domain'][dom]
            pooled = []     # Interactors of all hubs with this domain, with redundancy
            for hub in hubs:
                if hub in ppi:
                    pooled += ppi[hub]
            # Must have 2+ domain interactions: count once (linear) rather than list.count() per item
            counts = {}
            for spoke in pooled:
                counts[spoke] = counts.get(spoke, 0) + 1
            shared = [spoke for spoke in pooled if counts[spoke] > 1]
            sharedset = set(shared)
            for hub in hubs:
                if hub not in ppi:
                    continue
                for spoke in ppi[hub][0:]:     # Copy: list is edited inside the loop
                    if spoke not in sharedset:
                        continue
                    ppi[hub].remove(spoke)
                    if spoke in ppi and hub in ppi[spoke]:
                        ppi[spoke].remove(hub)
            interactors = rje.sortUnique(shared, False, False)
            acc = []
            for name in interactors:
                if not name:
                    continue
                if name in self.dict['Seq']:
                    acc.append(self.dict['Seq'][name].info['AccNum'])
                elif name not in badname:
                    badname.append(name)
            # Context manager guarantees the file is flushed and closed before moving on
            with open('%s/%s.dpi.acc' % (outdir, dom), 'w') as accfile:
                accfile.write('\n'.join(acc))
            self.printLog('#DPI', '%s domain => %d interactors' % (dom, len(acc)))
        if badname:
            badname.sort()
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badname), '; '.join(badname)))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]:
                continue
            self.dict['PPI'].pop(hub)
            self.printLog('#DPI', 'No %s PPI left after DPI removed' % hub, screen=False)
        self.printLog('#PPX', '%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])), rje.integerString(hx)))
    except:
        self.errorLog('Problem with SLiMPID.dpi()', quitchoice=True)
def readPELM(self): ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
    '''
    Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.

    Loads the self.info['PELM'] file keyed on 'acc' and 'position', fills self.dict['PhosphoSites'] as
    {acc:{pos:{'aa':code}}} and logs a warning for any sequence, site or residue mismatch. One sequence object
    per accession is then added to a new SeqList (named from the matching UniProt entry where one is found, else
    '<acc>_UNK__<acc>'), optionally filtered, and saved to self.info['PELMFas']. The SeqList and UniProt objects
    are stored in self.obj['SeqList'] and self.obj['UniProt']. Errors are logged via self.log.errorLog().
    '''
    try:
        ### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        data = rje.dataDict(self,self.info['PELM'],mainkeys=['acc','position'])
        seqdict = {}    # Dictionary of Acc:Sequence
        ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pdict = self.dict['PhosphoSites']
        for dkey in data:
            ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (acc,pos) = dkey.split()
            pos = int(pos)
            site = pdict.setdefault(acc,{}).setdefault(pos,{})
            ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            entry = data[dkey]
            if acc not in seqdict:
                seqdict[acc] = entry['sequence']
            elif seqdict[acc] != entry['sequence']:
                self.log.printLog('#ERR','Warning. Sequence mismatch for %s' % acc)
            if 'aa' not in site:
                site['aa'] = entry['code']
            elif site['aa'] != entry['code']:
                self.log.printLog('#ERR','Warning. PhosphoSite mismatch for %s at pos %d: %s not %s' % (acc,pos,entry['code'],site['aa']))
            residue = seqdict[acc][(pos-1):pos]     # Position is 1-based in the file
            if entry['code'] != residue:
                self.log.printLog('#ERR','Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s' % (acc,pos,entry['code'],seqdict[acc][pos-1:pos]))
        ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        acclist = rje.sortKeys(seqdict)
        pelmuni = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt entry
        unidict = pelmuni.accDict(acclist)                      # Dictionary of {acc:UniProtEntry}
        pelmseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None'])    # SeqList object
        ## ~ [3b] Add one sequence for each AccNum and update seqdict ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
        for acc in acclist:     #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
            sequence = seqdict[acc]
            try:
                uniseq = unidict[acc.split('-')[0]].obj['Sequence']
                desc = uniseq.info['Description']
                name = '%s__%s %s' % (uniseq.info['ID'],acc,desc)
                if sequence != uniseq.info['Sequence']:
                    self.log.printLog('#WARNING','Sequence mismatch for UniProt entry %s' % acc)
            except:
                self.log.errorLog('Problem with %s' % acc)
                name = '%s_UNK__%s' % (acc,acc)     #!# Add sequences where UniProt missing #!#
            seqdict[acc] = pelmseq._addSeq(name,sequence)
        ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.opt['FilterSeq']:
            pelmseq.autoFilter()
            for acc in acclist:
                if seqdict[acc] not in pelmseq.seq:
                    seqdict.pop(acc)
            acclist = rje.sortKeys(seqdict)
        ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not os.path.exists(self.info['PELMFas']) or self.stat['Interactive'] < 0 or rje.yesNo('%s exists: overwrite?' % self.info['PELMFas']):
            pelmseq.saveFasta(seqfile=self.info['PELMFas'])
        self.obj['SeqList'] = pelmseq
        self.obj['UniProt'] = pelmuni
    except:
        self.log.errorLog('Problem during PhosphoSeq.readPELM')
def setup(self):    ### Main class setup method. Makes sumfile if necessary.
    '''
    Main class setup method. Makes sumfile if necessary.

    Sets the basefile and summary file name if not given. If the summary file exists (and Force is off) and the
    user agrees to reuse it, returns the result of the self.printLog() call. Otherwise each file in
    self.list['ResFiles'] is read with budapest.Budapest.readMascot(), its hits stored in self.dict['Searches'],
    and the sorted hit accessions written to '<basefile>.<mfile base>.protacc'. A SeqList is then loaded to map
    accessions to sequences (self.dict['Acc2Seq']) and one summary row is written per peptide.
    << Returns False if an error is caught; the normal completion path has no explicit return value.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gi_regex = r'gi\|(\d+)'     # Raw string: same pattern as before without invalid escape sequences
        self.debug(self.getStrLC('SumFile'))
        self.debug(self.getStr('SumFile'))
        if self.getStrLC('Basefile') in ['','none']:
            self.baseFile(rje.baseFile(self.info['SumFile']))
        if self.getStrLC('SumFile') in ['','none']:
            # NOTE(review): lower-case basefile() here versus baseFile() elsewhere - confirm both exist
            self.info['SumFile'] = '%s.tdt' % self.basefile()
        self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
        if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
            if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
        mapgi = False   # Only consulted by the disabled gi mapping step [2a] below
        ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for mfile in self.list['ResFiles']:
            bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
            bud.info['Name'] = mfile
            bud.readMascot()
            self.dict['Searches'][mfile] = bud.dict['Hits']
            protacclist = rje.sortKeys(bud.dict['Hits'])
            for protacc in protacclist:
                if rje.matchExp(gi_regex,protacc):
                    mapgi = True
            accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
            self.debug(accfile)
            # Context manager guarantees the accession list is flushed and closed
            with open(accfile,'w') as ACC:
                ACC.write('\n'.join(protacclist))
            self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
        ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #if mapgi:
        #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
        #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
        ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
        self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
        ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sumhead = 'search,prot_hit_num,prot_acc,prot_desc,pep_seq'.split(',')
        rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
        for mfile in rje.sortKeys(self.dict['Searches']):
            hits = self.dict['Searches'][mfile]
            for protacc in rje.sortKeys(hits):
                protname = hits[protacc]['prot_acc']
                protdesc = hits[protacc]['prot_desc']
                gimatch = rje.matchExp(gi_regex,protacc)    # Matched once and reused
                if gimatch:
                    gi = gimatch[0]
                    try:
                        protname = self.dict['Acc2Seq'][gi].shortName()
                        protdesc = self.dict['Acc2Seq'][gi].info['Description']
                    except:
                        protname = 'gi_UNK__%s' % gi   # gi number without a mapped sequence
                for pep in hits[protacc]['Peptides']:
                    data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                            'pep_seq':pep,'prot_hit_num':hits[protacc]['prot_hit_num']}
                    rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False    # Setup failed
def ddi(self):  ### Domain-domain interactions
    '''
    Domain-domain interactions.

    Screens self.dict['PPI'] against the domain pairs in self.dict['DDI']: a hub->spoke link is removed when the
    hub is listed in self.dict['Domain'] under one domain of a pair and the spoke under the other. Each pair is
    screened in both directions. Hubs that end up with no interactors (or have an empty name) are popped from
    self.dict['PPI']. Nothing is done if self.dict['DDI'] or self.dict['Domain'] is empty.
    Errors are reported via self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ddx = 0     # Number of interactions removed
        dtot = len(self.dict['DDI'])
        if not (self.dict['DDI'] and self.dict['Domain']):
            return
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        domkeys = rje.sortKeys(self.dict['DDI'])
        for dnum in range(len(domkeys)):
            dom = domkeys[dnum]
            self.progLog(
                '\r#DDI', 'Screening domain-domain interactions: %.1f%%; %s removed' %
                ((dnum * 100.0) / dtot, rje.integerString(ddx)))
            if dom not in self.dict['Domain']:
                self.printLog('#DOM', 'No sequences with "%s" domains' % dom)
                continue
            for idom in self.dict['DDI'][dom]:
                if idom not in self.dict['Domain']:
                    continue
                # Screen dom hubs against idom spokes, then idom hubs against dom spokes
                for (hubdom, spokedom) in ((dom, idom), (idom, dom)):
                    for hub in self.dict['Domain'][hubdom]:
                        if hub in self.dict['PPI']:
                            for spoke in self.dict['PPI'][hub][:]:
                                if spoke in self.dict['Domain'][spokedom]:
                                    ddx += 1
                                    self.dict['PPI'][hub].remove(spoke)
        self.printLog(
            '\r#DDI', 'Screening domain-domain interactions complete: %s removed.' %
            (rje.integerString(ddx)))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            keep = hub and self.dict['PPI'][hub]
            if keep:
                continue
            self.dict['PPI'].pop(hub)
            self.printLog('#DDI', 'No %s interactions left after DDI removed' % hub, screen=False)
        remaining = rje.integerString(len(self.dict['PPI']))
        self.printLog('#PPX', '%s of %s PPI hubs remain after DDI removed' % (remaining, rje.integerString(hx)))
    except:
        self.errorLog('Problem with SLiMPID.ddi()', quitchoice=True)
def run(self):  ### Main run method
    '''
    Main run method.

    Loads sequences (and, if self.info['OccFile'] is set, motif occurrence data grouped as
    self.dict['OccData'][seq][dataset] = [occurrence dictionaries], restricting sequences to those with
    occurrences). For each sequence the statistics named in self.list['PlotStat'] are calculated, windowed
    with self.plotWin() (except 'Cons_Rel', and only if PlotWin >= 0), converted with self.convertStat() and
    written out by self.output(): one file per dataset where the sequence has occurrence data, else a single
    file. Errors are reported via self.errorLog().
    '''
    try:
        ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.info['Basefile'].lower() in ['','none']:
            self.info['Basefile'] = ''
        elif self.info['Basefile'][-1] != '.':
            self.info['Basefile'] += '.'    # Basefile is used as a filename prefix
        self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
        self.list['PlotFT'] = ' '.join(self.list['PlotFT']).upper().split()
        if self.info['OccFile'].lower() not in ['','none']:
            self.info['Delimit'] = rje.delimitFromExt(filename=self.info['OccFile'])
            self.dict['OccData'] = {}
            occfields = ['Seq','Dataset','Pattern','Start_Pos','End_Pos']
            occdata = rje.dataDict(self,self.info['OccFile'],occfields[0:],occfields[0:])
            for key in rje.sortKeys(occdata):
                occ = occdata[key]
                seqname = occ.pop('Seq')
                seqocc = self.dict['OccData'].setdefault(seqname,{})
                dataset = occ.pop('Dataset')
                seqocc.setdefault(dataset,[]).append(occ)
            self.printLog('#OCC','Loaded data for %s occurrences in %s sequences' % (rje.integerString(len(occdata)),rje.integerString(len(self.dict['OccData']))))
            self.obj['SeqList'].autoFilter(['GoodSeq=%s' % ','.join(rje.sortKeys(self.dict['OccData']))])
        ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['PlotStat'] = ' '.join(self.list['PlotStat']).lower().split()
        plotstat = self.list['PlotStat']
        usecons = 'cons' in plotstat or 'rel' in plotstat
        if usecons:
            slimcalc = rje_slimcalc.SLiMCalc(self.log,self.cmd_list)
        seqdict = self.obj['SeqList'].seqNameDic()
        for name in rje.sortKeys(seqdict):
            if self.opt['OccOnly'] and name not in self.dict['OccData']:
                continue
            seq = seqdict[name]
            sequence = seq.getSequence(gaps=False)
            stats = seq.dict['PlotStat'] = {}
            if 'sa' in plotstat:
                stats['SA'] = rje_seq.surfaceAccessibility(sequence,returnlist=True)
            if 'hyd' in plotstat:
                stats['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence,returnlist=True)
            if 'dis' in plotstat:
                stats['Disorder'] = seq.disorder(returnlist=True)
            if usecons:
                slimcalc.relConListFromSeq(seq,slimcalc.stat['RelConWin'],store=True)
                try:
                    stats['Cons_Abs'] = seq.list.pop('Cons')
                    stats['Cons_Rel'] = seq.list.pop('RelCons')
                except:
                    self.printLog('#CONS','No conservation stats for %s' % name)
            self.printLog('#STAT','PlotStats calculated for %s' % name)
            for stat in stats:
                if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0:
                    stats[stat] = self.plotWin(stats[stat])
                stats[stat] = self.convertStat(stats[stat])
            self.printLog('#STAT','PlotStats converted for %s' % name)
            ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if name not in self.dict['OccData']:
                self.output(seq,'%s%s.plot.txt' % (self.info['Basefile'],seq.info['AccNum']))
                continue
            for dataset in self.dict['OccData'][name]:
                ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'],dataset,seq.info['AccNum'])
                self.output(seq,ofile,self.dict['OccData'][name][dataset])
        return
    except:
        self.errorLog(rje_zen.Zen().wisdom())
def setupCustomScores(callobj, statlist=None, scorelist=None, scoredict=None):  ### Sets up Custom Scores using existing statlist
    """
    Sets up Custom Scores using existing statlist.

    Each custom score is dropped (from scorelist and scoredict) if its name is already in statlist or if
    rje.formula() rejects its formula given the stats accepted so far; otherwise its name is appended to
    statlist so that later scores may use it. Lists/dictionaries that are passed in are modified in place.
    >> callobj:RJE_Object = calling object for Error Messages etc.
    >> statlist:list of stats that are allowed for custom score. Generally column headers for output.
    >> scorelist:list of Custom Score Names in order they were read in (may use prev. scores)
    >> scoredict:dictionary of Custom Scores: {Name:Formula}
    << (statlist,scorelist,scoredict):(list,list,dictionary) of acceptable Custom Scores ([Stats],[Names],{Name:Formula})
       The same tuple is returned (in whatever state was reached) if an error is caught.
    """
    # Fresh containers per call: mutable defaults would be shared (and appended to) across calls
    if statlist is None:
        statlist = []
    if scorelist is None:
        scorelist = []
    if scoredict is None:
        scoredict = {}
    try:
        ### Setup Custom Scores ###
        if not scorelist:
            scorelist = rje.sortKeys(scoredict)
        for new in scorelist[0:]:   # Copy: scorelist is edited inside the loop
            if new in statlist:
                callobj.log.errorLog('Score "%s" exists: custom score cannot be made.' % (new), printerror=False)
                scorelist.remove(new)
                scoredict.pop(new)
                continue
            if not rje.formula(callobj, formula=scoredict[new], varlist=statlist[0:], check=True, calculate=False):
                callobj.log.errorLog('Custom score "%s" cannot be made.' % (new), printerror=False)
                scorelist.remove(new)
                scoredict.pop(new)
                continue
            statlist.append(new)
        return (statlist, scorelist, scoredict)     ### Returns same things given ###
    except:
        callobj.log.errorLog("Problem during rje_scoring.setupCustomScores()", quitchoice=True)
        # Keep the documented 3-tuple shape so that callers unpacking the result do not break
        return (statlist, scorelist, scoredict)
def loadFeatures(self,ftfile):  ### Loads features from given file
    '''
    Loads features from given file.

    The file must have 'feature', 'start', 'end' and 'description' fields (case-insensitive; 'ft_start'/'ft_end'
    are used instead if either is present). The first field is taken as the ID. Features are appended to
    self.dict['Features'][id] as {'Type','Start','End','Desc'} dictionaries, with Start and End as integers.
    >> ftfile:str = name of delimited features file. Nothing is done if this is '' or 'none'.
    << Returns the result of self.printLog() if the file or a required field is missing, else None.
       Errors are reported via self.errorLog().
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if ftfile in ['','none']:
            return
        if not os.path.exists(ftfile):
            # File name is now substituted into the message (previously a literal "%s" was logged)
            return self.printLog('#ERR','Features file "%s" missing' % ftfile)
        delimit = rje.delimitFromExt(filename=ftfile)
        ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        with open(ftfile,'r') as FT:    # Closed as soon as the header line has been read
            headline = FT.readline()
        headers = rje.readDelimit(headline,delimit)
        mainkeys = [headers[0]]
        hmap = {}   # Maps lower-case header to header as it appears in the file
        for h in headers:
            hmap[h.lower()] = h
        pos = ''    # Leader for start/end positions
        if 'ft_start' in hmap or 'ft_end' in hmap:
            pos = 'ft_'
        for h in ['feature','%sstart' % pos,'%send' % pos,'description']:
            if h not in hmap:
                return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile))
            mainkeys.append(hmap[h])
        mainkeys.remove(hmap['description'])    # Description is data, not part of the key
        ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ftdata = rje.dataDict(self,ftfile,mainkeys,['description'],delimit,headers,lists=True)
        (mx,mtot,fx) = (0.0,len(ftdata),0)
        for mainkey in rje.sortKeys(ftdata):
            self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot))
            mx += 100.0
            (seqid,ft,start,end) = mainkey.split(delimit)
            if seqid == mainkeys[0]:
                continue    # Skip the header row
            if seqid not in self.dict['Features']:
                self.dict['Features'][seqid] = []
            for desc in ftdata[mainkey][hmap['description']]:
                fx += 1
                self.dict['Features'][seqid].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc})
        self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile))
    except:
        self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
def output(self,seq,outfile,occdata=None):  ### Output to file
    '''
    Output to file.

    Writes a plain text plot file of tab-separated lines: basic sequence details, one 'Plot' line per
    seq.dict['PlotStat'] entry, one 'Region' line per UniProt feature whose type is in self.list['PlotFT'] (if
    seq.obj['Entry'] is set) and one 'Motif' line per occurrence.
    >> seq:Sequence object to output
    >> outfile:str = name of output file
    >> occdata:list of occurrence dictionaries with 'Pattern', 'Start_Pos' and 'End_Pos' keys [None]
    << No return value. Nothing is written if the OccOnly option is set and there are no occurrences.
       Errors are reported via self.errorLog().
    '''
    try:
        ### ~ [1] ~ Basic Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['OccOnly'] and not occdata:
            return
        odata = ['Name\t%s' % (seq.shortName()),
                 'Sequence\t%s' % (seq.getSequence(gaps=False)),
                 'Output\t%s' % ('.'.join(outfile.split('.')[:-1])),     # Output file name minus extension
                 'RE\t%s' % (','.join(self.list['PlotRE'])),
                 'TrueELMs\tY',
                 'Description\t%s' % (seq.info['Description']),
                 '',]
        ### ~ [2] ~ PlotStats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for plot in rje.sortKeys(seq.dict['PlotStat']):
            odata.append('Plot\t%s\t%s' % (plot,', '.join(seq.dict['PlotStat'][plot])))
        if seq.dict['PlotStat']:
            odata.append('')
        ### ~ [3] ~ PlotFT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if seq.obj['Entry']:
            for ft in seq.obj['Entry'].list['Feature']:
                if ft['Type'] in self.list['PlotFT']:
                    odata.append('Region\t%s %s\t%s:%s' % (ft['Type'],ft['Desc'],ft['Start'],ft['End']))
            odata.append('')
        ### ~ [4] ~ MotifOcc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if occdata:
            for occ in occdata:
                odata.append('Motif\t%s\t%s:%s' % (occ['Pattern'],occ['Start_Pos'],occ['End_Pos']))
        ### ~ [5] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Context manager guarantees the file is flushed and closed before the success message is logged
        with open(outfile,'w') as OUT:
            OUT.write('\n'.join(odata))
        self.printLog('#PLOT','SeqPlot output saved as %s' % (outfile))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.

    Creates the database object, sets a default basefile ('<MutPileup>.vs.<WTPileup>.Q<QCut>') if none is given
    and parses the WT pileup if '<basefile>.WT.tdt' is missing (or force is on). The WT table is then read to
    build, per locus, a reference sequence (self.dict['RefSeq']) and a list of major alleles
    (self.dict['WTMajor']); positions missing from the table are padded with '?' and '-' respectively. Finally
    the mutant pileup is parsed if '<basefile>.Mut.tdt' is missing (or force is on).
    << True if setup was successful; False if an error was caught; None if '<basefile>.fdr.tdt' already exists
       and force is off.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DB'] = rje_db.Database(self.log,self.cmd_list+['tuplekeys=T'])
        if self.baseFile().lower() in ['','none']:
            self.baseFile('%s.vs.%s.Q%d' % (rje.baseFile(self.getStr('MutPileup'),True),rje.baseFile(self.getStr('WTPileup'),True),self.getInt('QCut')))
        if not self.force() and os.path.exists('%s.fdr.tdt' % self.baseFile()):
            return
        ### ~ [2] Look for/process WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.force() or not os.path.exists('%s.WT.tdt' % self.baseFile()):
            self.parsePileup('WT',self.getStr('WTPileup'))
        ### ~ [3] Generate Reference sequences and Major Alleles (by locus) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        refseq = {}
        rx = 0      # Total reference sequence length
        majors = {}
        locus = None
        wx = 0      # Number of lines read
        # Context manager closes the WT data file even if a line fails to parse
        with open('%s.WT.tdt' % self.baseFile(),'r') as WTDATA:
            for line in WTDATA:
                self.progLog('\r#WT','Reading WT data: Reference seq length = %s nt' % (rje.iStr(rx)),rand=0.01)
                data = rje.readDelimit(line)
                wx += 1
                if data[0] == 'Locus':
                    continue    # Header line
                if data[0] != locus:
                    locus = data[0]
                    refseq[locus] = ''
                    majors[locus] = []
                pos = int(data[1])
                while (pos - 1) > len(refseq[locus]):   # Pad positions missing from the table
                    refseq[locus] += '?'
                    rx += 1
                while (pos - 1) > len(majors[locus]):
                    majors[locus].append('-')
                refseq[locus] += data[2]
                majors[locus].append(data[5])
                rx += len(data[2])
        self.printLog('\r#WT','%s lines read from WT data: Reference seq length = %s nt' % (rje.iStr(wx),rje.iStr(rx)))
        for locus in rje.sortKeys(majors):
            if len(majors[locus]) != len(refseq[locus]):
                self.errorLog('%s WTMajor versus RefSeq length mismatch!' % locus,printerror=False)
                raise ValueError
        self.dict['WTMajor'] = majors
        self.dict['RefSeq'] = refseq
        ### ~ [3] Look for/process Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.force() or not os.path.exists('%s.Mut.tdt' % self.baseFile()):
            self.parsePileup('Mut',self.getStr('MutPileup'),True)
        return True     # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False    # Setup failed
def ppi(self):  ### Remaining protein-protein interactions
    '''
    Remaining protein-protein interactions.

    For each hub in self.dict['PPI'], writes the accession numbers (self.dict['Seq'][name].info['AccNum']) of its
    interactors to 'SLiMPID_PPI/<gene>.ppi.acc', where gene is self.dict['Gene'][hub]. Empty names are skipped
    and names missing from self.dict['Seq'] are reported as "bad". Returns without doing anything if
    self.dict['PPI'] is empty. Errors are reported via self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.dict['PPI']:
            return
        outdir = 'SLiMPID_PPI'
        rje.mkDir(self, outdir)
        badname = []
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for hub in rje.sortKeys(self.dict['PPI']):
            gene = self.dict['Gene'][hub]
            acc = []
            for name in self.dict['PPI'][hub]:
                if not name:
                    continue
                if name in self.dict['Seq']:
                    acc.append(self.dict['Seq'][name].info['AccNum'])
                elif name not in badname:
                    badname.append(name)
            # Context manager guarantees the file is flushed and closed before moving on
            with open('%s/%s.ppi.acc' % (outdir, gene), 'w') as accfile:
                accfile.write('\n'.join(acc))
            self.printLog('#PPI', '%s => %d individual interactors' % (gene, len(acc)))
        if badname:
            badname.sort()
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badname), '; '.join(badname)))
    except:
        # Message now names this method (it previously reported SLiMPID.setup())
        self.errorLog('Problem with SLiMPID.ppi()', quitchoice=True)
def processGenes(self,genelist):    ### Tries to extract data for genes in genelist
    '''
    Tries to extract data for genes in genelist.

    Calls self.parseCard() on each gene, counting successes and failures; a KeyboardInterrupt stops the parsing
    but not the method. self.dict['GeneCard'] is then tidied: an entry whose 'HPRD' value equals its own key is
    re-keyed with an 'HPRD' prefix, and any entry keyed by a symbol and marked '!FAILED!' is replaced by the
    entry of an alias that has that symbol.
    >> genelist:list of genes to parse
    '''
    ### ~ [1] Parse data from GeneCards (or existing data) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    self.deBug(self.list['Genes'])
    parsed = 0
    failed = 0
    try:
        for gene in genelist:
            if self.parseCard(gene):
                parsed += 1
            else:
                failed += 1
            self.log.printLog('\r#CARD','Parsing GeneCards for %d genes: %d parsed; %d failed.' % (len(genelist),parsed,failed),newline=False,log=False)
        self.log.printLog('\r#CARD','Parsing GeneCards for %d genes complete: %d parsed; %d failed.' % (len(genelist),parsed,failed))
    except KeyboardInterrupt:   # User stop: report progress and carry on to the tidy step
        self.log.printLog('\r#CARD','Parsing GeneCards for %d genes stopped: %d parsed; %d failed.' % (len(genelist),parsed,failed))
    ### ~ [2] Tidy for mixed success ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    cards = self.dict['GeneCard']
    gtot = len(cards)
    corrected = 0
    for (ax,alias) in enumerate(rje.sortKeys(cards)):
        self.log.printLog('\r#CARD','Checking and correcting partial successes: %.1f%%' % ((100.0 * ax) / gtot),newline=False,log=False)
        if 'HPRD' in cards[alias] and cards[alias]['HPRD'] == alias:
            newalias = 'HPRD' + alias
            cards[newalias] = cards.pop(alias)
            alias = newalias
        try:
            symbol = cards[alias]['Symbol']
        except:
            self.log.errorLog('Problem with alias "%s"' % alias)
            continue
        if symbol in cards and cards[symbol]['Symbol'] == '!FAILED!':
            cards[symbol] = cards[alias]
            corrected += 1
    self.log.printLog('\r#CARD','Checking and correcting partial successes: %d entries corrected.' % (corrected))
def makePPIDatasets(self):  ### Generate PPI datasets from pairwise data
    '''
    Generate PPI datasets from pairwise data.

    Saves one fasta file ('YeastPPI/<hub>.fas') per hub in self.dict['PPI'] that has at least three interactors
    with sequences in self.dict['SeqDict']. Hubs with fewer than three listed interactors are skipped without
    looking up sequences. Errors are reported via self.errorLog() and then re-raised.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.mkDir(self, 'YeastPPI/')
        seqdict = self.dict['SeqDict']
        ### ~ [2] Parse data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        htot = len(self.dict['PPI'])
        fx = 0      # Number of fasta files generated
        for (hi, hub) in enumerate(rje.sortKeys(self.dict['PPI'])):
            self.progLog('\r#FAS', 'Generating %s PPI fasta files: %.2f' % (rje.integerString(fx), (100.0 * hi) / htot))
            spokes = self.dict['PPI'][hub]
            if len(spokes) < 3:
                continue
            seqs = [seqdict[spoke] for spoke in spokes if spoke in seqdict]
            if len(seqs) < 3:
                continue
            fasfile = rje.makePath('YeastPPI/%s.fas' % hub, wholepath=True)
            self.obj['SeqList'].saveFasta(seqs, fasfile, log=False)
            fx += 1
        self.printLog('\r#FAS', 'Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(fx), rje.integerString(htot)))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def loadHHPID(self):    ### Load HHPID interactions
    '''
    Load HHPID interactions.

    Loads the self.getStr('HHPID') file as database table 'HHPID', drops four unwanted fields, renames the
    remaining fields, logs the number of entries per 'Interaction' type and then calls dropEntriesDirect() on
    'Interaction' for 'binds', 'complexes with' and 'interacts with' with inverse=True.
    << True if the table was loaded; the self.printLog() result if no HHPID file is given; None if an error was
       caught (reported via self.errorLog()).
    '''
    try:
        ### ~ [1] Setup HHPID Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getStr('HHPID').lower() in ['', 'none']:
            return self.printLog('#HHPID', 'No HHPID file to load')
        hdb = self.db().addTable(self.getStr('HHPID'), mainkeys='auto', datakeys='All', name='HHPID')
        unwanted = ['#Tax ID 1', 'Tax ID 2', 'product accession.version 2', 'last update timestamp']
        for field in unwanted:
            hdb.dropField(field)
        renames = [('Gene ID 1', 'EntrezHIV'),
                   ('product accession.version 1', 'AccHIV'),
                   ('product name 1', 'HIV'),
                   ('Interaction short phrase', 'Interaction'),
                   ('Gene ID 2', 'Entrez'),
                   ('product name 2', 'Description'),
                   ('PubMed ID (PMID) list', 'PMID')]
        for (oldname, newname) in renames:
            hdb.renameField(oldname, newname)
        for itype in rje.sortKeys(hdb.index('Interaction')):
            self.printLog('#HHPID', '%s => %s entries' % (itype, len(hdb.index('Interaction')[itype])))
        hdb.dropEntriesDirect('Interaction', ['binds', 'complexes with', 'interacts with'], inverse=True)
        return True
    except:
        self.errorLog('%s.loadHHPID error' % self)
def save(self):     ### Saves parsed REST output to files
    '''
    Saves parsed REST output to files.

    The 'Rest' setting selects what is saved: a key of self.dict['Output'] saves just that output; 'full' or
    'text' saves self.restFullOutput() to '<RestOutDir><RestBase>.rest'; any other non-empty value is reported
    as unrecognised and, if running interactively and the user agrees, the full output is saved instead. With no
    'Rest' setting, every output with an entry in self.dict['Outfile'] is saved (after rje.backup()), and a
    warning is given for each other output except 'intro'.
    << True if the full output was saved; False if the format was unrecognised and full output was declined
       (or not interactive); no explicit return value after saving individual outputs.
    '''
    rbase = '%s%s' % (self.getStr('RestOutDir'), rje.baseFile(self.getStr('RestBase'), strip_path=True, keepext=True))
    rje.mkDir(self, self.getStr('RestOutDir'))

    def _saveFull(label):
        '''Writes the full REST output to <rbase>.rest, logs it under the given label and returns True.'''
        outfile = '%s.rest' % rbase
        with open(outfile, 'w') as OUT:     # File is closed even if restFullOutput() raises
            OUT.write(self.restFullOutput())
        self.printLog('#OUT', '%s: %s' % (label, outfile))
        return True

    outputs = rje.sortKeys(self.dict['Output'])
    if self.getStrLC('Rest') in outputs:
        outputs = [self.getStrLC('Rest')]
    elif self.getStrLC('Rest') in ['full', 'text']:
        return _saveFull(self.getStrLC('Rest'))
    elif self.getStrLC('Rest'):
        self.printLog('#OUTFMT', 'REST output format "%s" not recognised.' % self.getStrLC('Rest'))
        if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'):
            return False
        return _saveFull('full')
    for rkey in outputs:
        if rkey in self.dict['Outfile']:
            rje.backup(self, self.dict['Outfile'][rkey])
            with open(self.dict['Outfile'][rkey], 'w') as OUT:
                OUT.write(self.dict['Output'][rkey])
            self.printLog('#OUT', '%s: %s' % (rkey, self.dict['Outfile'][rkey]))
        elif rkey not in ['intro']:
            self.warnLog('No outfile parsed/generated for %s output' % rkey)
def setupCustomScores(callobj,statlist=None,scorelist=None,scoredict=None):    ### Sets up Custom Scores using existing statlist
    '''
    Sets up Custom Scores using existing statlist.
    Scores whose name clashes with an existing stat, or whose formula fails the rje.formula() check, are removed from
    scorelist and scoredict. Accepted score names are appended to statlist so later scores may use them.
    Lists/dictionaries given by the caller are modified in place.
    >> callobj:RJE_Object = calling object for Error Messages etc.
    >> statlist:list [None] of stats that are allowed for custom score. Generally column headers for output.
    >> scorelist:list [None] of Custom Score Names in order they were read in (may use prev. scores)
    >> scoredict:dictionary [None] of Custom Scores: {Name:Formula}
    << (statlist,scorelist,scoredict):(list,list,dictionary) of acceptable Custom Scores ([Stats],[Names],{Name:Formula})
    '''
    ### Fresh containers per call: mutable defaults would be shared (and polluted) between calls ###
    if statlist is None: statlist = []
    if scorelist is None: scorelist = []
    if scoredict is None: scoredict = {}
    try:
        ### Setup Custom Scores ###
        if not scorelist: scorelist = rje.sortKeys(scoredict)
        for new in scorelist[0:]:   # Iterate over a copy because rejected names are removed from scorelist
            if new in statlist:
                callobj.log.errorLog('Score "%s" exists: custom score cannot be made.' % (new),printerror=False)
                scorelist.remove(new)
                scoredict.pop(new)
                continue
            if not rje.formula(callobj,formula=scoredict[new],varlist=statlist[0:],check=True,calculate=False):
                callobj.log.errorLog('Custom score "%s" cannot be made.' % (new),printerror=False)
                scorelist.remove(new)
                scoredict.pop(new)
                continue
            statlist.append(new)    # Accepted scores can be used by the formulae of later scores
        return (statlist,scorelist,scoredict)     ### Returns same things given ###
    except:
        callobj.log.errorLog('Problem during rje_scoring.setupCustomScores()',quitchoice=True)
        return (statlist,scorelist,scoredict)   # Same tuple shape as the normal return, in whatever state was reached
def complexFasta(self):     ### Outputs parsed complex datasets in Fasta format
    '''
    Outputs parsed complex datasets in Fasta format.
    One <complex>_hprd.fas file is saved in the HPRD_Complexes/ directory (under OutDir) for each complex in
    self.dict['Complex'] that yields at least one sequence. Errors are logged and then re-raised.
    '''
    try:
        ### ~ [1] Setup output directory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outpath = self.info['OutDir'] + rje.makePath('HPRD_Complexes/')
        rje.mkDir(self, outpath)
        ### ~ [2] Save one fasta file per complex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for cname in rje.sortKeys(self.dict['Complex']):
            cseqs = []
            for member in self.dict['Complex'][cname]:
                if self.opt['AllIso']:      # 'Seq' is added with += here, so it is treated as a list
                    cseqs.extend(self.dict['HPRD'][member]['Seq'])
                else:
                    cseqs.append(self.dict['HPRD'][member]['Seq'])
            fasfile = '%s%s_hprd.fas' % (outpath, cname)
            if not cseqs:
                continue    # Nothing to save for this complex
            self.obj['SeqList'].saveFasta(seqs=cseqs, seqfile=fasfile)
        self.log.printLog('#FAS', 'HPRD complex fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.complexFasta()', printerror=True, quitchoice=False)
        raise
def codonUsageEntropyBias(self):    ### Calculate bias in Codon Usage using Entropy-based measure
    '''
    Calculate bias in Codon Usage using Entropy-based measure.
    For every entry of the 'Codons' table, rje.entropyDict() values are summed over the amino acids of the 'Code'
    table index, for the observed entry ('Bias') and the matching 'Expected' table entry ('ExpBias'). 'WtBias' and
    'ExpWtBias' are the same sums with each amino acid weighted by its frequency in the observed entry.
    Results are added to a new 'Bias' table, which is saved to file. Errors are caught and logged.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        aacode = self.db('Code').index('AA')    # {amino acid:[codon keys]}, as used by the loops below
        cdb = self.db('Codons'); edb = self.db('Expected')
        ## ~ [1a] Setup bias table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        bdb = self.db().addEmptyTable('Bias',['Seq','Len','Bias','ExpBias','WtBias','ExpWtBias'],['Seq'])
        ### ~ [2] Calculate Frequencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        x = 0.0; etot = cdb.entryNum()
        for codentry in cdb.entries():
            self.progLog('\r#BIAS','Calculating Bias: %.2f%%' % (x/etot)); x += 100.0
            expentry = edb.data(codentry['Seq'])
            entry = {'Seq':codentry['Seq'],'Len':codentry['Len'],'Bias':0.0,'ExpBias':0.0,'WtBias':0.0,'ExpWtBias':0.0}
            ## ~ [2a] Amino acid totals from observed codon values, converted by rje.dictFreq() ~~~~~~~~~~~~~ ##
            aafreq = {}
            for aa in aacode:
                aafreq[aa] = 0.0
                for code in aacode[aa]: aafreq[aa] += codentry[code]
            rje.dictFreq(aafreq,total=False)
            ## ~ [2b] Sum per-amino-acid entropies, unweighted and weighted ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for aa in aacode:
                entry['Bias'] += rje.entropyDict(codentry,aacode[aa])
                entry['ExpBias'] += rje.entropyDict(expentry,aacode[aa])
                entry['WtBias'] += (aafreq[aa] * rje.entropyDict(codentry,aacode[aa]))
                entry['ExpWtBias'] += (aafreq[aa] * rje.entropyDict(expentry,aacode[aa]))
            bdb.addEntry(entry)
        self.printLog('\r#BIAS','Codon Usage entropy bias calculated for %s entries' % rje.iStr(etot))
        bdb.saveToFile()
    except: self.errorLog('%s.codonUsageEntropyBias error' % self)
def xmerProb(self,xmer,prefix=False):   ### Returns SCAP probability for given xmer from markov.dict tree
    '''
    Returns SCAP probability for given xmer, calculated from the prefix/suffix tree of self.obj['Markov'].
    >> xmer:str = Xmer of interest
    >> prefix:bool [False] = Use Prefix tree rather than suffix tree
    << num = 1.0 if the xmer is too short for a sorted tree, if an unknown letter is met, or if no parent node was
       found. A small arbitrary value is returned if a known letter is missing from the tree. Otherwise, the count
       of the final node divided by (parent count * parent 'e' value).
    '''
    ### ~ [1] ~ Choose tree ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    markov = self.obj['Markov']
    if markov.opt['Sorted'] and len(xmer) < markov.stat['MaxXmer']: return 1.0
    if prefix:
        subtree = markov.pretree()
        xmer = rje.strReverse(xmer)
    else: subtree = markov.suftree()
    if markov.opt['Sorted']: xmer = rje.strSort(xmer[:-1]) + xmer[-1]
    prex = 0
    self.deBug('%s :: %s' % (xmer,rje.sortKeys(subtree)))
    ### ~ [2] ~ Find subtree ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    for (x,letter) in enumerate(xmer):
        if letter in subtree:
            if 'e' not in subtree:  # Sum of squared child frequencies, stored on the node the first time it is needed
                subtree['e'] = 0.0
                for a in markov.list['Alphabet']:
                    if a in subtree:
                        fa = subtree[a]['='] / float(subtree['='])
                        subtree['e'] += (fa * fa)
            prex = subtree['='] * subtree['e']
            subtree = subtree[letter]
        elif letter not in markov.list['Alphabet'] or not prex: return 1.0
        else: return 0.5 / (0.05 * prex)    # Arbitrary small number!
        self.deBug('%s %d [%s] :: %d = %s' % (xmer,x,letter,prex,subtree))
    ### ~ [3] ~ Calculate SCAP value ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if prex: return subtree['='] / prex
    return 1.0
def processHHPID(self):     ### Process HHPID interactions
    '''
    Process HHPID interactions.
    If <basefile>.HHPIDMap.tdt already exists it is loaded as the 'HHPIDMap' table and returned. Otherwise the
    'HHPID' and 'GeneMap' tables are joined on their 'Entrez' fields, reduced to the mapping fields, compressed on
    HIV/Gene and saved; the sorted 'AccHIV' index keys are then written to <Basefile>.hivacc.
    << HHPIDMap table object, or False if an error was caught and logged.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if rje.checkForFile('%s.HHPIDMap.tdt' % self.basefile()):
            mdb = self.db().addTable('%s.HHPIDMap.tdt' % self.basefile(), ['HIV', 'Gene'], 'All', name='HHPIDMap')
            return mdb  # NOTE: the *.hivacc file is not (re)generated when an existing map is loaded
        hdb = self.db('HHPID')
        gdb = self.db('GeneMap')
        pdb = self.db('PPI')    # Not used below; call retained from the original code
        mdb = self.db().joinTables(name='HHPIDMap', join=[(hdb, 'Entrez'), (gdb, 'Entrez')], newkey=['#'],
                                   empties=False, keeptable=True)
        keepfields = ['#', 'AccHIV', 'EntrezHIV', 'HIV', 'Entrez', 'Gene', 'Symbol', 'UniProt', 'EnsEMBL', 'EnsLoci']
        for field in mdb.fields()[0:]:      # Copy of field list because fields are dropped during the loop
            if field not in keepfields: mdb.dropField(field)
        mdb.compress(['HIV', 'Gene'], default='str')
        mdb.dropField('#')
        mdb.saveToFile()
        ### ~ [2] Save viral accession numbers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        with open('%s.hivacc' % self.getStr('Basefile'), 'w') as ACCFILE:   # Closed even if the write fails
            ACCFILE.write('%s\n' % '\n'.join(rje.sortKeys(mdb.index('AccHIV'))))
        return mdb
    except:
        self.errorLog('%s.processHHPID error' % self)
        return False
def runJobs(self):  ### Runs all the jobs in self.list['SubJobs']                                         #V1.0
    '''
    Runs all the jobs in self.list['SubJobs'].
    Starts a job with nextJob() on each processor from 'KeepFree' up to nprocs(), then loops until
    self.dict['Running'] is empty. Each cycle rewrites the <logfile base>.pid file with the processor:PID pairs
    checked, calls endJob() for every processor whose job has finished, and sleeps for 'SubSleep' seconds.
    '''
    ### ~ [1] ~ Start first set of jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    for j in range(self.getInt('KeepFree'), self.nprocs()): self.nextJob(j)     # Skip first node(s)
    pidcheck = '%s.pid' % rje.baseFile(self.log.info['LogFile'])
    ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    while self.dict['Running']:
        with open(pidcheck, 'w') as PIDCHECK:   # Closed on leaving the block, even if endJob() raises
            for j in rje.sortKeys(self.dict['Running']):
                if not self.dict['Running'][j]:
                    self.dict['Running'].pop(j)
                    continue    # No more jobs
                try:
                    pid = self.dict['Running'][j]['PID']
                    PIDCHECK.write('%s: %s\n' % (j, pid))
                    if ('%s' % pid).split()[0] == 'WAIT': status = 1
                    else: status = os.waitpid(pid, os.WNOHANG)[0]  # First element is 0 while the child is still running
                except: status = 1      # Any failure to check the job is treated as the job having finished
                if status > 0: self.endJob(j)   # subjob on processor j has finished: can replace with processing
        time.sleep(self.getInt('SubSleep'))
def outputCards(self):  ### Outputs cards to delimited file
    '''
    Outputs cards to delimited file.
    Entries of self.dict['GeneCard'] are appended to the 'CardOut' file in sorted key order. With Restrict, only
    genes in self.list['Genes'] are output; with Purify, aliases (entries whose Symbol differs from their key and
    is not '!FAILED!') are skipped. With both, aliases in the gene list are first replaced by their symbols.
    '''
    ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    cards = self.dict['GeneCard']
    genelist = self.list['Genes']
    if self.opt['Purify'] and self.opt['Restrict']:
        for alias in genelist[0:]:      # Copy: genelist is edited inside the loop
            symbol = cards[alias]['Symbol']
            if symbol in [alias,'!FAILED!']: continue
            genelist.remove(alias)      # Replace with symbol
            if cards[alias]['Symbol'] not in genelist: genelist.append(cards[alias]['Symbol'])
    delimit = rje.delimitFromExt(filename=self.info['CardOut'])
    CARDOUT = open(self.info['CardOut'],'a')
    ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    noens = 0; noloci = 0; ox = 0
    for gene in rje.sortKeys(cards):
        if self.opt['Restrict'] and gene not in genelist: continue
        if self.opt['Purify'] and cards[gene]['Symbol'] not in [gene,'!FAILED!']: continue
        self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox))
        ox += 1
        card = cards[gene]
        card['Alias'] = gene
        card['Species'] = self.info['Species']
        rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,card)
        if card['Symbol'] != gene: continue     # Aliases are not counted in the missing-ID tallies
        if 'EnsEMBL' not in card or not card['EnsEMBL']: noens += 1
        if 'EnsLoci' not in card or not card['EnsLoci']: noloci += 1
    CARDOUT.close()
    genex = len(self.list['Genes'])
    self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (genex,self.info['CardOut']))
    self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
def addLinks(self,nested):  ### Adds href aname links to definitions.
    '''
    Adds href aname links to definitions.
    Walks the given (sub)dictionary recursively. Wherever a '=' key holds definition text, a marked-up version of
    that text is stored under the '+' key of the same dictionary: <url> tokens become external <a href> links,
    runs of words that trace a path through self.dict['Glossary'] ending at a '=' entry become internal
    <a href="#..."> links (or <scaps> text when HTMLStyle is 'tab'), and " _text_" is converted to <i>text</i>.
    >> nested:dict = (Sub)dictionary of the nested glossary; modified in place.
    Errors are caught and logged; nothing is returned.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Trailing characters that may be stripped from a word when matching it to a glossary term
        endstrip = [')','.',',',':',';','!']
        if self.getBool('Plurals'): endstrip.append('s')
        for term in rje.sortKeys(nested):
            if term == '=':
                linkdef = []    # Output words/links, joined with spaces at the end
                # '(' is padded with a space so it splits from the following word; reversed when joining below
                rawdef = string.split(string.replace(nested['='],'(','( '))
                while rawdef:   # Words are consumed from the front of rawdef
                    glossary = self.dict['Glossary']    # Each match attempt starts from the top of the glossary
                    ## ~ [1a] External URL of the form <url>, <url>text or <url>[link text] ~~~~~~~~~~~~~~~~~ ##
                    if self.getBool('HRef') and rje.matchExp('<(\S+)>',rawdef[0]):
                        safetynet = rawdef[0:]  # Copy to restore if URL parsing fails
                        url = rje.matchExp('<(\S+)>',rawdef[0])[0]
                        # Rewrite the first word into '[link text]extra' form (the closing ']' may be in a later word)
                        if rje.matchExp('<(\S+)>\[(\S+)',rawdef[0]): rawdef[0] = '[%s' % rje.matchExp('<(\S+)>\[(\S+)',rawdef[0])[1]
                        elif rje.matchExp('<(\S+)>(\S+)',rawdef[0]): rawdef[0] = '[%s]%s' % (url,rje.matchExp('<(\S+)>(\S+)',rawdef[0])[1])
                        else: rawdef[0] = '[%s]' % url
                        try:
                            # Merge following words into the first until the closing ']' is found
                            while ']' not in rawdef[0]: rawdef[0] = '%s %s' % (rawdef[0],rawdef.pop(1))
                            (linktext,linkextra) = rje.matchExp('\[(.+)\](\S*)',rawdef.pop(0))
                            if url[:3] not in ['htt','ftp']: url = 'http://%s' % url
                            linkdef.append('<a href="%s">%s</a>%s' % (url,linktext,linkextra))
                            continue
                        except:
                            self.errorLog('Problem parsing URL from "%s"' % nested['='])
                            rawdef = safetynet  # Fall through and treat the words as normal text
                    ## ~ [1b] Word that is not a glossary term (with or without a strippable last character) ~ ##
                    if rawdef[0].lower() not in glossary:
                        if rawdef[0].lower()[:-1] not in glossary or rawdef[0].lower()[-1] not in endstrip:
                            linkdef.append(rawdef.pop(0)); continue
                    ## ~ [1c] Follow consecutive words down the nested glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    akey = []; alink = []   # Glossary keys followed and the original words consumed
                    while rawdef and (rawdef[0].lower() in glossary or rawdef[0].lower()[:-1] in glossary):
                        # Priority: word that completes a term; word whose next word continues the term;
                        # word minus a strippable last character; word as found in the glossary.
                        if rawdef[0].lower() in glossary and '=' in glossary[rawdef[0].lower()]: rterm = rawdef[0].lower()
                        elif len(rawdef) > 1 and rawdef[0].lower() in glossary and (rawdef[1].lower() in glossary[rawdef[0].lower()] or rawdef[1].lower()[:-1] in glossary[rawdef[0].lower()]): rterm = rawdef[0].lower()
                        elif rawdef[0].lower()[-1] in endstrip and rawdef[0].lower()[:-1] in glossary: rterm = rawdef[0].lower()[:-1]
                        elif rawdef[0].lower() in glossary: rterm = rawdef[0].lower()
                        else: break
                        glossary = glossary[rterm]
                        akey.append(rterm)
                        alink.append(rawdef.pop(0))
                    akey = string.join(akey,'_')    # Anchor name used in the internal link
                    if '=' in glossary:     # The words consumed make up a complete glossary term
                        alink = string.join(alink)
                        if nested == glossary: linkdef.append(alink)    # Term matches the entry being processed: no self-link
                        elif self.getStr('HTMLStyle') != 'tab':
                            # A final endstrip character (other than 's') is kept outside the link
                            if alink[-1] in endstrip and alink[-1] != 's': linkdef.append('<a href="#%s">%s</a>%s' % (akey,alink[:-1],alink[-1]))
                            else: linkdef.append('<a href="#%s">%s</a>' % (akey,alink))
                        else:
                            if alink[-1] in endstrip and alink[-1] != 's': linkdef.append('<scaps>%s</scaps>%s' % (alink[:-1],alink[-1]))
                            else: linkdef.append('<scaps>%s</scaps>' % (alink))
                    else:   # Incomplete term: output the first word as text and return the rest for re-processing
                        linkdef.append(alink[0])
                        rawdef = alink[1:] + rawdef
                nested['+'] = string.replace(string.join(linkdef),'( ','(')
                ## ~ [1d] Convert _text_ (preceded by a space) to italics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                while rje.matchExp(' _([^_]+)_',nested['+']):
                    italics = rje.matchExp(' _([^_]+)_',nested['+'])[0]
                    nested['+'] = string.replace(nested['+'],' _%s_' % italics,' <i>%s</i>' % italics)
                #self.deBug(nested)
            elif term != '+': self.addLinks(nested[term])   # Recurse into sub-dictionaries; '+' holds generated text
    except: self.errorLog('%s.addLinks error' % self)
def saveMutations(self):    ### Outputs parsed mutations into a delimited file
    '''
    Outputs parsed mutations into a delimited file.
    Writes one tab-delimited row to omim_mutations.tdt for each gene/sub-ID pair in self.dict['Mutations'], with
    the mutation string split into wild-type residue, position and mutant residue. Errors are caught and logged.
    '''
    try:### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outfile = 'omim_mutations.tdt'
        headers = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease']
        rje.delimitedFileOutput(self,outfile,headers,'\t',rje_backup=True)
        ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mutdict = self.dict['Mutations']
        for gene in rje.sortKeys(mutdict):
            for subid in rje.sortKeys(mutdict[gene]):
                (disease,mutation) = mutdict[gene][subid]
                # Mutation is expected as three non-digit characters, a number, then three non-digit characters
                (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                omim = '; '.join(self.dict['Records'][gene])
                datadict = {'OMIM_ID':omim,'SubID':subid,'Gene':gene,'Pos':pos,'WildAA':wild,'MutAA':mut,
                            'Disease':disease}
                rje.delimitedFileOutput(self,outfile,headers,'\t',datadict)
        self.log.printLog('#OUT','OMIM Mutation output to %s complete' % outfile)
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
def fpi(self):  ### Family-protein interactions
    '''
    Family-protein interactions.
    For each PPI hub with a family of 2+ members, the interactor lists of all family members are pooled and those
    interactors occurring 2+ times are kept as family-protein interactions. These are removed (in both directions)
    from self.dict['PPI'] and the accession numbers of those found in self.dict['Seq'] are written to
    SLiMPID_FPI/<gene>.fpi.acc. Hubs left without interactors are then removed from self.dict['PPI'].
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): this checks 'Domain' (as dpi() does) rather than 'Fam' - confirm that is intended
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_FPI'
        rje.mkDir(self,outdir)
        fpi = {}            # Dictionary of {family:[interactors]}
        badname = []
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for qry in rje.sortKeys(self.dict['PPI']):
            try:
                fam = self.dict['Fam'][qry]
                if len(fam) < 2: continue
            except:
                self.errorLog('Problem with "%s" protein family' % qry)
                continue
            ## ~ [2a] Pool family interactors and keep those seen 2+ times ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            fpi[qry] = []
            for hub in fam:
                if hub not in self.dict['PPI']: continue
                fpi[qry] += self.dict['PPI'][hub]   # Add with redundancy
            for spoke in fpi[qry][0:]:
                if fpi[qry].count(spoke) == 1: fpi[qry].remove(spoke)   # Must have 2+ family interactions
            ## ~ [2b] Remove family interactions from the PPI data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for hub in fam:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:     # Copy: the list is edited inside the loop
                    if spoke in fpi[qry]:
                        self.dict['PPI'][hub].remove(spoke)
                        if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]:
                            self.dict['PPI'][spoke].remove(hub)
            fpi[qry] = rje.sortUnique(fpi[qry],False,False)
            ## ~ [2c] Output accession numbers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = []
            gene = self.dict['Gene'][qry]
            for name in fpi[qry]:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                elif name not in badname: badname.append(name)
            with open('%s/%s.fpi.acc' % (outdir,gene),'w') as ACCFILE:  # Closed even if the write fails
                ACCFILE.write('\n'.join(acc))
            self.printLog('#FPI','%s family => %d interactors' % (gene,len(acc)))
        if badname:
            badname.sort()
            self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),'; '.join(badname)))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)       # Blank hub name or no interactors left
            self.printLog('#FPI','No %s PPI left after FPI removed' % hub)
        self.printLog('#PPX','%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.fpi()',quitchoice=True)
def domainFasta(self):  ### Outputs parsed domain and domain PPI datasets in Fasta format
    '''
    Outputs parsed domain and domain PPI datasets in Fasta format.
    First writes two tab-delimited tables to OutDir: HPRD.domains.tdt (Domain, HPRD ID, Gene) and
    HPRD.domsource.tdt (Domain, Source). Then, for each domain, saves the sequences of all PPI partners of the
    domain-containing proteins to HPRD_Domain_Datasets/<domain>_hprd.fas. Errors are logged and re-raised.
    '''
    try:
        ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Domain','HPRD','Gene']
        dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
        rje.delimitedFileOutput(self,dfile,headers,'\t')
        sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
        shead = ['Domain','Source']
        rje.delimitedFileOutput(self,sfile,shead,'\t')
        dx = 0.0
        for domain in rje.sortKeys(self.dict['Domains']):
            self.log.printLog('\r#DOM','HPRD Domain output (%s): %.1f%%' % (dfile,dx/len(self.dict['Domains'])),newline=False,log=False)
            dx += 100.0
            for hid in self.dict['Domains'][domain]:
                datadict = {'Domain':domain,'HPRD':hid,'Gene':self.dict['HPRD'][hid]['gene']}
                rje.delimitedFileOutput(self,dfile,headers,'\t',datadict)
            for source in self.dict['DomainSource'][domain]:
                datadict = {'Domain':domain,'Source':source}
                rje.delimitedFileOutput(self,sfile,shead,'\t',datadict)
        self.log.printLog('\r#DOM','HPRD Domain output (%s): %s domains.' % (dfile,rje.integerString(len(self.dict['Domains']))))
        ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
        rje.mkDir(self,datpath)
        for domain in rje.sortKeys(self.dict['Domains']):
            ## Generate a list of all interactors with domain-containing proteins ##
            plist = []
            for p1 in self.dict['Domains'][domain]:
                if p1 not in self.dict['PPI']: continue
                for p2 in self.dict['PPI'][p1]:
                    if p2 not in plist: plist.append(p2)
            plist.sort()
            ## Generate Sequence list and output ##
            mylist = []
            for p in plist:
                if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']   # 'Seq' is treated as a list here
                else: mylist.append(self.dict['HPRD'][p]['Seq'])
            sfile = '%s%s_hprd.fas' % (datpath,domain)      # sfile is re-used for the fasta file name
            if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
            else: self.log.printLog('#DOM','No PPI partners for domain "%s"' % domain)
        self.log.printLog('\r#DOM','HPRD Domain fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.domainFasta()',printerror=True,quitchoice=False)
        raise
def loadFeatures(self,ftfile):  ### Loads features from given file
    '''
    Loads features from given delimited file into self.dict['Features'].
    The first column is used as the ID. The file must also have feature, start, end and description fields
    (start/end may instead be ft_start/ft_end); field names are matched case-insensitively. Each feature is added
    to self.dict['Features'][ID] as a {'Type','Start','End','Desc'} dictionary.
    >> ftfile:str = Name of features file. '' or 'none' means there is nothing to load.
    << None normally. If the file or a required field is missing, the printLog() return value is passed back.
    Other errors are caught and logged.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if ftfile in ['','none']: return
        if not os.path.exists(ftfile): return self.printLog('#ERR','Features file "%s" missing' % ftfile)
        delimit = rje.delimitFromExt(filename=ftfile)
        ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        with open(ftfile,'r') as FTFILE: headline = FTFILE.readline()  # Only the header line is needed here
        headers = rje.readDelimit(headline,delimit)
        mainkeys = [headers[0]]
        hmap = {}       # {lower case header:header as it appears in the file}
        for h in headers: hmap[h.lower()] = h
        pos = ''        # Leader for start/end positions
        if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
        for h in ['feature','%sstart' % pos,'%send' % pos,'description']:
            if h not in hmap: return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile))
            mainkeys.append(hmap[h])
        mainkeys.remove(hmap['description'])    # Description is data, not part of the key
        ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): datakeys is given as lower case 'description' but is read back below using the file's own
        # capitalisation - confirm that rje.dataDict() handles headers that are not lower case.
        ftdata = rje.dataDict(self,ftfile,mainkeys,['description'],delimit,headers,lists=True)
        (mx,mtot,fx) = (0.0,len(ftdata),0)
        for mainkey in rje.sortKeys(ftdata):
            self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot)); mx += 100.0
            (seqid,ft,start,end) = mainkey.split(delimit)   # Key is the four mainkeys values joined by delimit
            if seqid == mainkeys[0]: continue               # Skip the header row
            if seqid not in self.dict['Features']: self.dict['Features'][seqid] = []
            for desc in ftdata[mainkey][hmap['description']]:
                fx += 1
                self.dict['Features'][seqid].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc})
        self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile))
    except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
def makeChildren(self):     ### Goes through GO dictionary and adds 'child_terms' to dictionary
    '''
    Goes through GO dictionary and adds 'child_terms' to dictionary.
    Every entry of self.go() is given an empty 'child_terms' list. Each ID is then appended to the 'child_terms'
    of every parent listed under its self.list['ParentTerms'] keys, and the lists are sorted. Parent IDs missing
    from the GO dictionary are reported via errorLog(). Errors are logged and re-raised.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        go = self.go()
        for goid in rje.sortKeys(go):
            go[goid]['child_terms'] = []
        gx = 0.0
        gtot = len(go)*2    # Two passes over the GO dictionary
        ### ~ [2] ~ Add children ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for goid in rje.sortKeys(go):
            self.progLog('#GO','Making GO children: %.2f%%' % (gx/gtot))
            gx += 100.0
            for term in self.list['ParentTerms']:
                if term not in go[goid]: continue
                for parent in go[goid][term]:
                    if parent in go: go[parent]['child_terms'].append(goid)
                    else: self.errorLog('%s %s ID "%s" missing!' % (goid,term,parent),printerror=False)
        ### ~ [3] ~ Sort children ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for goid in rje.sortKeys(go):
            self.progLog('#GO','Making GO children: %.2f%%' % (gx/gtot))
            gx += 100.0
            go[goid]['child_terms'].sort()
        self.printLog('\r#GO','Making GO children complete.')
    except:
        self.errorLog('Major problem with GO.makeChildren()')
        raise
def dpi(self):  ### Domain-protein interactions
    '''
    Domain-protein interactions.
    For each domain, the interactor lists of all hubs containing that domain are pooled and those interactors
    occurring 2+ times are kept as domain-protein interactions. These are removed (in both directions) from
    self.dict['PPI'] and the accession numbers of those found in self.dict['Seq'] are written to
    SLiMPID_DPI/<domain>.dpi.acc. Hubs left without interactors are then removed from self.dict['PPI'].
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_DPI'
        rje.mkDir(self,outdir)
        dpi = {}            # Dictionary of {domain:[interactors]}
        badname = []
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for dom in rje.sortKeys(self.dict['Domain']):
            ## ~ [2a] Pool domain hub interactors and keep those seen 2+ times ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            dpi[dom] = []
            for hub in self.dict['Domain'][dom]:
                if hub in self.dict['PPI']: dpi[dom] += self.dict['PPI'][hub]   # Add with redundancy
            for spoke in dpi[dom][0:]:
                if dpi[dom].count(spoke) == 1: dpi[dom].remove(spoke)   # Must have 2+ domain interactions
            ## ~ [2b] Remove domain interactions from the PPI data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for hub in self.dict['Domain'][dom]:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:     # Copy: the list is edited inside the loop
                    if spoke in dpi[dom]:
                        self.dict['PPI'][hub].remove(spoke)
                        if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]:
                            self.dict['PPI'][spoke].remove(hub)
            dpi[dom] = rje.sortUnique(dpi[dom],False,False)
            ## ~ [2c] Output accession numbers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acc = []
            for name in dpi[dom]:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                elif name not in badname: badname.append(name)
            with open('%s/%s.dpi.acc' % (outdir,dom),'w') as ACCFILE:   # Closed even if the write fails
                ACCFILE.write('\n'.join(acc))
            self.printLog('#DPI','%s domain => %d interactors' % (dom,len(acc)))
        if badname:
            badname.sort()
            self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),'; '.join(badname)))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)       # Blank hub name or no interactors left
            self.printLog('#DPI','No %s PPI left after DPI removed' % hub,screen=False)
        self.printLog('#PPX','%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.dpi()',quitchoice=True)
def setup(self):    ### Sets up headers and reads in existing data if present
    '''
    Sets up headers and reads in existing data if present.
    Builds the output header list ('Alias', 'Species', gc_headers, plus any extra headers found in the alternative
    source files), calls readHGNC(), and loads each file in self.list['AltSource'] into self.dict['GeneCard'].
    With Update, an existing 'CardOut' file is added as a final source and its genes are added to
    self.list['Genes']. The final header list is stored in self.list['Headers'].
    Errors are logged and re-raised.
    '''
    try:
        ### ~ Setup Basic Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #X#headers = ['Alias','Species','Symbol','HGNC','Entrez','UniProt','EnsEMBL','HPRD','OMIM','EnsLoci','Desc']
        headers = ['Alias','Species'] + gc_headers  # All other headers added from altsource list
        ### ~ Read in data from existing files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.readHGNC()
        if self.opt['Update'] and os.path.exists(self.info['CardOut']): self.list['AltSource'].append(self.info['CardOut'])
        for altsource in self.list['AltSource']:
            sourcefile = rje.makePath(altsource,True)
            if not os.path.exists(sourcefile):
                self.log.errorLog('Alternative source "%s" missing!' % sourcefile,printerror=False,quitchoice=True)
                continue
            update = rje.dataDict(self,sourcefile,getheaders=True,ignore=['#'])
            # The 'Headers' entry is removed from the data so only gene keys remain in update
            for h in update.pop('Headers'):
                if h not in headers: headers.append(h)
            self.log.printLog('#DATA','Read GeneCards data for %d genes.' % (len(update)))
            for gene in rje.sortKeys(update):   # Each source will overwrite data from the file before
                ## ~ Convert to Upper Case for consistency ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if gene != gene.upper() and gene.upper() in update: continue    # Only use upper case one!
                elif gene != gene.upper():
                    update[gene.upper()] = update.pop(gene)     # Re-key the entry under its upper case name
                    gene = gene.upper()
                if gene == '!FAILED!': continue
                ## ~ Update main dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Genes read from an existing CardOut file are added to the gene list when updating
                if self.opt['Update'] and altsource == self.info['CardOut'] and gene not in self.list['Genes']: self.list['Genes'].append(gene)
                if gene in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][gene],update[gene])
                else: self.dict['GeneCard'][gene] = update[gene]    # Stored by reference (same dictionary object)
                ## ~ Temp Debugging ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if gene in self.list['TestGenes']:
                    print gene
                    print update[gene]
                    self.deBug(self.dict['GeneCard'][gene])
                ## ~ Check Aliases etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'Symbol' in self.dict['GeneCard'][gene]: self.dict['GeneCard'][gene]['Symbol'] = self.dict['GeneCard'][gene]['Symbol'].upper()
                # Data is also stored under the gene symbol; existing symbol data is only supplemented, not overwritten
                if 'Symbol' in update[gene] and update[gene]['Symbol'] != '!FAILED!':
                    symbol = update[gene]['Symbol']
                    if symbol in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][symbol],update[gene],overwrite=False,replaceblanks=True)
                    else: self.dict['GeneCard'][symbol] = update[gene]
                self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])),newline=False,log=False)
                # Flag gene names containing whitespace directly on screen
                if len(string.split(gene)) > 1: print '!!!', gene, '!!!'
        ### ~ Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])))
        self.list['Headers'] = headers[0:]
        if self.opt['Update']: self.opt['Append'] = False
        #x#if 'TASP1' in self.dict['GeneCard']: self.deBug(self.dict['GeneCard']['TASP1'])
        #x#else: self.deBug(rje.sortKeys(self.dict['GeneCard']))
    except:
        self.log.errorLog('Problem during GeneCards.setup()')
        raise
def setup(self,gtext=''):   ### Main class setup method. gtext will over-ride input file.
    '''
    Main class setup method. gtext will over-ride input file.
    Reads glossary terms and definitions, either from gtext (one term per line, split on the 'TermSplit'
    delimiter) or from the 'InFile' file, and stores them as a nested dictionary in self.dict['Glossary']. The
    nested dictionary has one level per lower case word of the term, with the definition under the '=' key.
    If self.list['Terms'] is already set, the glossary is restricted to those terms; otherwise self.list['Terms']
    is set to the terms read (input order with KeepOrder, else sorted).
    >> gtext:str [''] = Glossary text. If blank, the input file is read instead.
    << True if setup was successful, else False.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['HTML'] = rje_html.HTML(self.log,self.cmd_list)
        ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('InFile')))
        if self.getStr('OutFile').lower() in ['','none']: self.str['OutFile'] = '%s.html' % self.basefile()
        ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        interms = []    # Terms in the order they were read
        if gtext:
            delimit = self.getStr('TermSplit')
            named = {'tab':'\t','space':' ','comma':',','period (.)':'.','colon':':'}  # Delimiters given by name
            delimit = named.get(delimit.lower(),delimit)
            glossary = {}
            for line in gtext.split('\n'):
                splitline = line.split(delimit)
                if delimit == '.' and (splitline[-1] in ['',' ']): splitline = splitline[:-1]  # Ignore final full stop
                if not splitline: continue
                (term,definition) = (splitline[0],delimit.join(splitline[1:]))
                if term == 'Term' and not glossary: continue    # Skip header line
                if term:
                    glossary[term] = {'Definition':definition}
                    interms.append(term)
        else:
            infile = self.getStr('InFile')
            glossary = None     # Remains None if the file needs to be read as text
            try:
                if not self.getBool('KeepOrder'):
                    with open(infile,'r') as INFILE: istable = INFILE.readline()[:4] == 'Term'
                    if istable: glossary = rje.dataDict(self,infile,mainkeys=['Term'],datakeys=['Term','Definition'])
            except: self.errorLog('Problem reading input as dataDict(). Will try as text.')
            if glossary is None:
                with open(infile,'r') as INFILE: gtext = INFILE.read()
                if not gtext:   # Empty text would send setup() back to the file again without end
                    self.errorLog('No glossary text read from "%s"' % infile,printerror=False)
                    return False
                return self.setup(gtext)
        if self.list['Terms']:
            for term in list(glossary):     # Iterate over a copy of the keys: entries are removed during the loop
                if term not in self.list['Terms']: glossary.pop(term)
        elif self.getBool('KeepOrder'): self.list['Terms'] = interms
        else: self.list['Terms'] = rje.sortKeys(glossary)
        for term in glossary: glossary[term] = glossary[term]['Definition']
        ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nested = {}
        for term in glossary:
            tdict = nested
            for word in term.lower().split():   # One dictionary level per word of the term
                if word not in tdict: tdict[word] = {}
                tdict = tdict[word]
            tdict['='] = glossary[term]
        self.dict['Glossary'] = nested
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def sgd2sp(self):   ### Reformats yeast sequence names and outputs new data for GOPHER
    '''
    Reformats yeast sequence names and outputs new data for GOPHER.
    UniProt entries are read from <Basefile>.dat (or extracted for the 'XRef' table UniProt accessions and saved
    there). Each YEAST sequence of the 'SeqList' object is renamed using its 'XRef' entries: where a UniProt entry
    with identical sequence is found, the first word of the name becomes <UniprotID>__<AccNum> and the mapping is
    stored in self.dict['Rename']. Sequences are saved to *.ygob.fas (all) and *.yeast.fas (YEAST only) unless
    those files already exist. Errors are logged and re-raised.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = self.obj['SeqList']
        uniprot = rje_uniprot.UniProt(self.log,self.cmd_list+['datout=None'])
        xref = self.db('XRef')
        self.dict['Rename'] = {}
        ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        datfile = '%s.dat' % self.info['Basefile']
        if os.path.exists(datfile) and not self.opt['Force']:
            uniprot.readUniProt(datfile,clear=True,cleardata=False)
        else:
            uniprot.readUniProt(clear=True,acclist=rje.sortKeys(xref.index('UniProt')),cleardata=False)
            uniprot.saveUniProt(datfile)
        ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniseq = {}     # {AccNum:sequence object}
        for entry in uniprot.entries():
            entryseq = entry.obj['Sequence']
            uniseq[entryseq.info['AccNum']] = entryseq
        self.printLog('\r#USEQ','%s UniProt Sequences extracted (%s Ensembl AccNum)' % (rje.iStr(len(uniseq)), rje.iStr(len(xref.index('UniProt')))))
        ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        yeastseqs = []  # List of YEAST sequence objects
        sx = 0.0
        stot = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#SEQ','Reformatting sequence names: %.2f%%' % (sx/stot))
            sx += 100.0
            if seq.info['SpecCode'] != 'YEAST': continue
            yeastseqs.append(seq)
            sgd = seq.info['AccNum']
            newname = seq.info['Name']
            try:
                for xentry in xref.indexEntries('EnsG',sgd):
                    acc = xentry['UniProt']
                    if not acc:     # No UniProt accession: annotate and try the next entry
                        newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (seq.info['Name'],xentry['Gene'],xentry['EnsG'],xentry['SGD'])
                        continue
                    newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (seq.info['Name'],xentry['Gene'],xentry['EnsG'],xentry['SGD'],acc)
                    if acc not in uniseq:
                        self.printLog('\r#UNIERR','Unable to find UniProt sequence %s (%s)' % (acc,sgd))
                        continue
                    if uniseq[acc].info['Sequence'] != seq.info['Sequence']:
                        self.printLog('\r#SEQERR','%s sequence <> %s sequence' % (sgd,acc))
                        continue
                    words = newname.split()
                    words[0] = '%s__%s' % (xentry['UniprotID'],acc)    # Replace the first word of the name
                    newname = ' '.join(words)
                    self.dict['Rename'][sgd] = acc
                    break   # First UniProt entry with matching sequence is used
            except:
                self.errorLog('%s problem' % sgd)
            seq.info['Name'] = newname
            seq.extractDetails(gnspacc=True)
        self.printLog('\r#SEQ','Reformatting sequence names complete.')
        ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        ygobfas = '%s.ygob.fas' % self.info['Basefile']
        if not rje.exists(ygobfas): seqlist.saveFasta(seqfile=ygobfas)
        yeastfas = '%s.yeast.fas' % self.info['Basefile']
        if not rje.exists(yeastfas): seqlist.saveFasta(seqs=yeastseqs,seqfile=yeastfas)
        self.list['YeastSeq'] = seqlist.accList(yeastseqs)
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def summaryScores(self,rankdb=None,sumstr='taxasum',minsum='MinSum'):  ### Generates summary scores from rank table.
    '''
    Generates summary scores from rank table.
    >> rankdb:Table [None] = Table of rank assignments to summarise. Uses self.db('taxamap') if not given.
    >> sumstr:str ['taxasum'] = Name of the summary table that is created, ranked and saved.
    >> minsum:str ['MinSum'] = Name of the numeric setting (read with self.getNum) giving the minimum summed
        count for a taxon. Taxa with a smaller count are pooled into "Other".
    << No explicit return value. Errors are logged rather than raised.

    For each of genus/family/order/class/phylum, every rankdb entry contributes 1/len(taxa) to each taxon
    listed (split on '|') and boot/len(taxa) to the bootstrap-weighted total. Taxa named in
    self.list['TaxFilter'] are skipped.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        if not rankdb: rankdb = self.db('taxamap')
        sumdb = db.addEmptyTable(sumstr,['rank','taxon','count','bootwt','meanboot','perc','wtperc'],['rank','taxon'])
        ranks = ['genus','family','order','class','phylum']
        ### ~ [2] Normalise to reduced levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for rank in ranks:
            self.printLog('\r#RANK','Normalising %s data.' % rank)
            taxsum = {}; ranksum = 0.0      # Summed counts for taxa and rank total
            taxwt = {}; wtsum = 0.0         # Bootstrap-weighted summed counts for taxa and rank total
            bootsum = {}; bootx = {}        # Sum and count of bootstrap values for mean boot numbers
            for entry in rankdb.entries():
                taxa = entry[rank].split('|')
                for taxon in taxa:
                    if taxon in self.list['TaxFilter']: continue
                    if taxon not in taxsum:
                        taxsum[taxon] = 0.0; taxwt[taxon] = 0.0
                        bootsum[taxon] = 0.0; bootx[taxon] = 0
                    taxsum[taxon] += 1.0 / len(taxa)
                    ranksum += 1.0 / len(taxa)
                    taxweight = entry['boot']
                    bootsum[taxon] += entry['boot']; bootx[taxon] += 1
                    taxwt[taxon] += taxweight / len(taxa)
                    wtsum += taxweight / len(taxa)
            ## Pool low-count taxa into "Other". Iterates a sorted copy of the keys, so popping is safe.
            otherx = 0
            mincount = self.getNum(minsum)
            for taxon in rje.sortKeys(taxsum):
                if taxon == 'Other': continue
                if taxsum[taxon] < mincount:
                    if 'Other' not in taxsum:
                        taxsum['Other'] = 0.0
                        taxwt['Other'] = 0.0
                        bootsum['Other'] = 0.0
                        bootx['Other'] = 0.0
                    taxsum['Other'] += taxsum.pop(taxon)
                    taxwt['Other'] += taxwt.pop(taxon)
                    bootsum['Other'] += bootsum.pop(taxon)
                    bootx['Other'] += bootx.pop(taxon)
                    otherx += 1
            self.printLog('#MINSUM','%s %s taxa converted to "Other" (count < minsum=%s)' % (rje.iStr(otherx),rank,mincount))
            for taxon in taxsum:
                # ranksum cannot be zero here: it grows by the same amount as taxsum. wtsum CAN be zero when
                # every bootstrap value is 0, so report 0% instead of failing the whole summary.
                if wtsum: wtperc = 100.0*taxwt[taxon]/wtsum
                else: wtperc = 0.0
                sumdb.addEntry({'rank':rank,'taxon':taxon,'count':rje.dp(taxsum[taxon],1),
                                'perc':rje.sf(100.0*taxsum[taxon]/ranksum),
                                'bootwt':rje.dp(taxwt[taxon],1),'meanboot':rje.dp(bootsum[taxon]/bootx[taxon],3),
                                'wtperc':rje.sf(wtperc)})
        ## ~ [2a] Rank taxa by counts such that highest is Rank 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        sumdb.rankFieldByIndex('rank','count',rev=True,absolute=True,lowest=True)
        sumdb.rankFieldByIndex('rank','bootwt',rev=True,absolute=True,lowest=True)
        ## ~ [2b] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        sumdb.saveToFile()
    except: self.errorLog('%s.summaryScores error' % self.prog())
def pileUpFDR(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''
    Calculates statistics of genetic differences from parsed PileUp Tables.

    Reads the p-value in the last column of each row of <basefile>.pdiff.tdt and keeps rows with p <= 0.05.
    Kept rows are written to <basefile>.fdr.tdt in ascending p-value order with an extra 'p.FDR' column:
        FDR = p * npos / (number of kept rows with p-value <= p)
    where npos is the number of characters in self.dict['RefSeq'] sequences that are not '?'.
    Reading stops at the first row whose last field is not a number.
    Nothing is done if the output file exists and self.force() is not set.
    << Returns None in all cases. Errors are logged rather than raised.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        fdrfile = '%s.fdr.tdt' % self.baseFile()
        if not self.force() and os.path.exists(fdrfile): return
        sigpval = {}    # {pval:[file offsets of the rows with that p-value]}
        npos = 0        # Number of reference positions that are not '?'
        for locus in rje.sortKeys(self.dict['RefSeq']):
            npos += len(self.dict['RefSeq'][locus]) - self.dict['RefSeq'][locus].count('?')
        ### ~ [1] Parse out stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r')
        try:    # try/finally guarantees both handles are closed even if parsing or writing fails
            headers = SAMSIG.readline().split() + ['p.FDR']
            fpos = SAMSIG.tell(); fline = SAMSIG.readline(); px = 0
            while fline:
                self.progLog('\r#SIG','Reading Pvalues: %s p <= 0.05...' % rje.iStr(px))
                try: pval = float(fline.split()[-1])
                except (ValueError,IndexError): break   # Blank or non-numeric row: stop reading
                if pval <= 0.05:
                    if pval not in sigpval: sigpval[pval] = []
                    sigpval[pval].append(fpos); px += 1
                # Remember where the next row starts so it can be re-read during output
                fpos = SAMSIG.tell(); fline = SAMSIG.readline()
            self.printLog('\r#SIG','Reading Pvalues complete: %s p <= 0.05.' % rje.iStr(px))
            ### ~ [2] Calculate FDR and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            SAMFDR = open(fdrfile,'w')
            try:
                rje.writeDelimit(SAMFDR, headers)
                px = 0; sx = 0.0; stot = len(sigpval)
                for pval in rje.sortKeys(sigpval):
                    self.progLog('\r#FDR','Calculating FDR: %.2f%%' % (sx/stot)); sx += 100.0
                    px += len(sigpval[pval])    # Cumulative number of rows with p-value <= pval
                    if pval: fdr = (pval * npos) / px
                    else: fdr = 0.0
                    for fpos in sigpval[pval]:
                        SAMSIG.seek(fpos)
                        rje.writeDelimit(SAMFDR,rje.readDelimit(SAMSIG.readline())+[rje.expectString(fdr)])
            finally: SAMFDR.close()
        finally: SAMSIG.close()
        self.printLog('\r#FDR','%s FDR lines output to %s' % (rje.iStr(px),fdrfile))
    except: self.errorLog('%s.pileUpFDR() error' % (self)); return None
def powerGO(self,numbers,sig=0.01,samples='all',total='Total',countkey='counts',ignore=[]): ### Special GO power calculation for GO slim set
    '''
    Special GO power calculation for GO slim set.
    >> numbers:dictionary of {Sample:Count}
    >> sig:float [0.01] = Desired significance level to achieve. Currently uncorrected. Add Bonf/FDR with time.
    >> samples:str ['all'] = Whether sig must be achievable for 'any' or 'all' samples.
    >> total:str ['Total'] = Sample containing Total counts to compare against
    >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
    - self.go(id)[countkey] = {Sample:count}
    >> ignore:list of Samples to ignore from calculation (only read, never modified)
    << returns a list of GO IDs that meet criteria. Returns [] if an error is logged.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        N = numbers[total]                  # Total count for calculating expectations/probabilities
        excluded = list(ignore) + [total]   # Loop-invariant, so built once rather than once per sample
        nlist = []                          # List of counts for subsamples to be assessed
        for sample in numbers:
            if sample not in excluded: nlist.append(numbers[sample])
        nlist = rje.sortUnique(nlist,xreplace=False,num=True)
        ### ~ [2] ~ Generate Power Range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        powered = set()     # Acceptable Total counts for subset. A set, as it is only used for membership tests
        nx = 0.0
        for i in range(1,N+1):  # Look at all possible levels of occurrence
            self.progLog('#POW','Calculating GO term power: %.1f%%' % (nx/N))
            nx += 100.0
            ok = 0
            p = float(i) / N    # Probability of each gene having this term
            for n in nlist:     # Look at each subset
                k1 = min(i,n)   # Want to look at largest possible count for sample-term pairing
                k2 = max(0,n-(N-i)) # Also want to look at the likelihood of under-representation
                if rje.binomial(k1,n,p,callobj=self) <= sig: ok += 1
                elif (1 - rje.binomial(k2+1,n,p,callobj=self)) <= sig: ok += 1
                #!# Add under-representation too!
                #!# if ok and samples == 'any': break
            if (ok and samples == 'any') or ok == len(nlist): powered.add(i)
        self.printLog('\r#POW','Calculation of GO term power complete.',log=False)
        self.deBug(nlist)
        ### ~ [3] ~ Generate GO Slim ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        terms = []
        (ix,itot) = (0.0,len(self.go()))
        for goid in rje.sortKeys(self.go()):
            self.progLog('#POW','Assessing terms for power: %.1f%% (%s terms)' % (ix/itot,rje.iLen(terms)))
            ix += 100.0
            if self.go(goid)[countkey][total] in powered: terms.append(goid)
        self.printLog('\r#POW','Assessed terms for statistical power, p <= %s: %s GO terms' % (sig,rje.iLen(terms)))
        #!# Add correction terms #!#
        self.deBug(terms)
        return terms
    except:
        self.errorLog('Major problem with GO.powerGO()')
        return []
def saveFasta(self):    ### Outputs parsed PPI datasets in Fasta format
    '''
    Outputs parsed PPI datasets in Fasta format.

    Saves the full sequence list, then one '<gene>_hprd.fas' file per PPI hub (inside the 'HPRD_Datasets/'
    directory under self.info['OutDir']) holding the sequences of that hub's interactors. Hubs with no
    interactor sequences produce no file. Errors are logged rather than raised.
    '''
    try:
        ### Setup ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Datasets/')
        rje.mkDir(self,datpath)
        ## Check Seqs ##
        for hub in rje.sortKeys(self.dict['PPI']):
            if 'Seq' in self.dict['HPRD'][hub]: continue     #!# KeyError #!#
            # Same text as the Python 2 statement "print hub, data"
            print('%s %s' % (hub, self.dict['HPRD'][hub]))
            self.deBug('No Seq for %s' % hub)
        ### All sequences ###
        self.obj['SeqList'].saveFasta()
        ### Output PPI Datasets ###
        for hub in rje.sortKeys(self.dict['PPI']):
            spokeseqs = []
            for spoke in self.dict['PPI'][hub]:
                # With AllIso, 'Seq' holds several sequences per protein; otherwise a single one
                if self.opt['AllIso']: spokeseqs.extend(self.dict['HPRD'][spoke]['Seq'])
                else: spokeseqs.append(self.dict['HPRD'][spoke]['Seq'])
            sfile = '%s%s_hprd.fas' % (datpath,self.dict['HPRD'][hub]['gene'])
            if not spokeseqs: continue
            self.obj['SeqList'].saveFasta(seqs=spokeseqs,seqfile=sfile)
        self.log.printLog('#FAS','HPRD PPI fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.saveFasta()',printerror=True,quitchoice=False)
def saveMutations(self):    ### Outputs parsed mutations into a delimited file
    '''
    Outputs parsed mutations into a delimited file.

    Writes one tab-delimited row per gene/sub-ID in self.dict['Mutations'] to 'omim_mutations.tdt'. Each
    mutation string is split into wild-type residue, position and mutant residue. Errors are logged rather
    than raised.
    '''
    try:
        ### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outfile = 'omim_mutations.tdt'
        headers = ['OMIM_ID', 'SubID', 'Gene', 'Pos', 'WildAA', 'MutAA', 'Disease']
        rje.delimitedFileOutput(self, outfile, headers, '\t', rje_backup=True)
        ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for gene in rje.sortKeys(self.dict['Mutations']):
            genemuts = self.dict['Mutations'][gene]
            for subid in rje.sortKeys(genemuts):
                disease, mutation = genemuts[subid]
                # Three non-digit characters, the position, then three non-digit characters
                wild, pos, mut = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)', mutation)
                omim_ids = '; '.join(self.dict['Records'][gene])
                datadict = {'OMIM_ID': omim_ids, 'SubID': subid, 'Gene': gene, 'Pos': pos,
                            'WildAA': wild, 'MutAA': mut, 'Disease': disease}
                rje.delimitedFileOutput(self, outfile, headers, '\t', datadict)
        self.log.printLog('#OUT', 'OMIM Mutation output to %s complete' % outfile)
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
def test(self):     ### Development method
    '''
    Development method.

    Reads GO data and the gene-to-GO mapping, then writes one row per gene/GO pair (gene, GO ID, GO type
    and GO name) to 'test.go.tdt'.
    '''
    self.readGO()
    self.mapEnsGO()
    gofile = 'test.go.tdt'
    gohead = ['EnsG','GO_ID','GO_Type','GO_Desc']
    rje.delimitedFileOutput(self,gofile,gohead,rje_backup=True)
    gtot = len(self.dict['EnsGO'])
    gx = 0.0
    for gene in rje.sortKeys(self.dict['EnsGO']):
        self.progLog('\r#ENSGO','Compiling %s: %.2f%%' % (gofile,gx/gtot))
        gx += 100.0
        for goid in self.dict['EnsGO'][gene]:
            godata = {'EnsG':gene,
                      'GO_ID':goid,
                      'GO_Type':self.dict['GO'][goid]['type'],
                      'GO_Desc':self.dict['GO'][goid]['name']}
            rje.delimitedFileOutput(self,gofile,gohead,datadict=godata)
    self.printLog('\r#ENSGO','Compiling %s all done: %s genes.' % (gofile,rje.integerString(gtot)))
def addEnsLoci(self):   ### Adds EnsLoci data to Gene Information
    '''
    Adds EnsLoci data to Gene Information.

    Does nothing when self.dict['EnsLoci'] is empty. Otherwise, for every gene in self.dict['GeneCard']:
    a missing or empty 'EnsEMBL' value is filled from self.dict['UniEns'] via the gene's 'UniProt' value
    where possible, and genes whose 'EnsEMBL' value is in self.dict['EnsLoci'] get 'EnsLoci' and 'EnsDesc'.
    '''
    if not self.dict['EnsLoci']: return
    added = 0   # Number of genes given EnsLoci data
    for gene in rje.sortKeys(self.dict['GeneCard']):
        card = self.dict['GeneCard'][gene]
        has_ens = 'EnsEMBL' in card and card['EnsEMBL']
        if not has_ens:
            # Try to map through UniProt
            if 'UniProt' in card and card['UniProt'] in self.dict['UniEns']:
                card['EnsEMBL'] = self.dict['UniEns'][card['UniProt']]
        # EnsEMBL genes might be missing as they might be pseudogenes etc., so no error is raised for them
        if 'EnsEMBL' in card and card['EnsEMBL'] in self.dict['EnsLoci']:
            added += 1
            ensg = card['EnsEMBL']
            card['EnsLoci'] = self.dict['EnsLoci'][ensg]
            card['EnsDesc'] = self.dict['EnsDesc'][ensg]
        self.log.printLog('\r#ENS','Adding EnsLoci data: %d of %d genes' % (added,len(self.dict['GeneCard'])),newline=False,log=False)
    self.log.printLog('\r#ENS','Added EnsLoci data for %d of %d genes' % (added,len(self.dict['GeneCard'])))
def makeGOGenes(self,gokey='EnsGO'):    ### Makes a dictionary of {GO:[Genes]}
    '''
    Makes a dictionary of {GO:[Genes]}.
    >> gokey:str ['EnsGO'] = Key of the {gene:[GO IDs]} mapping in self.dict. Gene lists are stored under the
        same key in each self.dict['GO'][GO ID] entry, so keys already used by GO entries are rejected.
    << No return value. Problems (including a rejected or missing key) are logged rather than raised.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        reserved = ['name','is_a','part_of','type','child_terms']
        if gokey in reserved:
            self.errorLog('Cannot have "%s" as GOGenes key - reserved for GO' % gokey)
            raise ValueError
        if gokey not in self.dict:
            self.errorLog('"%s" mappings missing!' % gokey)
            raise ValueError
        ### ~ [2] ~ Process GO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gx = 0.0
        gtot = len(self.dict[gokey])
        ix = 0      # Number of GO IDs that have been given a gene list
        for gene in rje.sortKeys(self.dict[gokey]):
            self.progLog('\r#GENES','Making GO Gene lists: %.1f%%' % (gx/gtot))
            gx += 100.0
            for go in self.dict[gokey][gene]:
                goterm = self.dict['GO'][go]
                if gokey not in goterm:
                    goterm[gokey] = []
                    ix += 1
                goterm[gokey].append(gene)
        self.printLog('\r#GENES','Making GO Gene lists complete: %s GO IDs with genes' % rje.integerString(ix))
    except:
        self.errorLog('Major problem with GO.makeGOGenes()')
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.

    Builds and saves three tables in self.db() from the sequences in self.obj['SeqList']:
    - 'Code': codon -> amino acid, from rje_sequence.genetic_code, indexed on 'AA'.
    - 'Codons': per-sequence codon values plus 'Len' (their sum); blanks filled with 0.
    - 'NT': per-sequence values for C/A/G/U overall and per codon position ('<nt>|<1-3>'), plus 'Len'.
    << Returns False if an error was logged. There is no explicit return on success (None).
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = self.obj['SeqList']
        # Default the basefile to one derived from the sequence list name when none has been set
        if self.getStr('Basefile').lower() in ['','none']: self.str['Basefile'] = rje.baseFile(seqlist.getStr('Name'))
        self.obj['DB'].setInfo({'Basefile':self.str['Basefile']})
        ## ~ [1a] Genetic Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        cdb = self.db().addEmptyTable('Code',['Codon','AA'],['Codon'])
        for codon in rje_sequence.genetic_code: cdb.addEntry({'Codon':codon,'AA':rje_sequence.genetic_code[codon]})
        cdb.index('AA')
        ### ~ [2] Calculate Codon Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        codons = rje.sortKeys(rje_sequence.genetic_code)
        db = self.db().addEmptyTable('Codons',['Seq','Len']+codons,['Seq'])
        sx = 0.0; seqx = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#COD','Calculating codon usage: %.2f%%' % (sx/seqx)); sx += 100.0
            # NOTE(review): presumably returns {codon: count} for the sequence - confirm in rje_sequence
            entry = rje_sequence.codons(seq.getSequence(),{})
            #self.deBug(entry); self.deBug(entry.values())
            entry['Len'] = sum(entry.values())      # Must be summed before the non-numeric 'Seq' key is added
            entry['Seq'] = seq.getStr('AccNum')
            db.addEntry(entry)
        self.printLog('\r#COD','Codon usage calculated for %s sequences' % rje.iStr(seqx))
        db.fillBlanks(blank=0,fillempty=True)
        db.saveToFile()
        ### ~ [3] Calculate NT Count Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nt = ['C','A','G','U']
        # Add one field per nucleotide per codon position, e.g. 'C|1'
        for i in [1,2,3]:
            for n in ['C','A','G','U']: nt.append('%s|%d' % (n,i))
        ndb = self.db().addEmptyTable('NT',['Seq','Len']+nt,['Seq'])
        sx = 0.0; seqx = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#NT','Calculating NT Counts: %.2f%%' % (sx/seqx)); sx += 100.0
            # T is converted to U before counting, so the sequence is counted as RNA
            entry = rje_sequence.aaFreq(string.replace(seq.getSequence(),'T','U'),{'C':0,'A':0,'G':0,'U':0},False)
            entry['Len'] = sum(entry.values())
            entry['Seq'] = seq.getStr('AccNum')
            centry = db.data(entry['Seq'])      # Codon table entry for the same sequence
            for i in [1,2,3]:
                for n in ['C','A','G','U']: entry['%s|%d' % (n,i)] = 0
            # Positional values are derived from the codon table: each codon adds its value to the
            # nucleotide found at each of its three positions
            for codon in codons:
                for i in [1,2,3]:
                    n = codon[i-1]
                    entry['%s|%d' % (n,i)] += centry[codon]
            ndb.addEntry(entry)
        self.printLog('\r#NT','NT Counts calculated for %s sequences' % rje.iStr(seqx))
        ndb.saveToFile()
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def loadHHPID(self):    ### Load HHPID interactions
    '''
    Load HHPID interactions.

    Loads the file named by the 'HHPID' setting into a table called 'HHPID', drops unused fields, renames
    the rest, logs the number of entries per interaction type and then keeps only the 'binds',
    'complexes with' and 'interacts with' interactions.
    << Returns True on success, the result of self.printLog() when no file is set, and None if an error
       was logged.
    '''
    try:### ~ [1] Setup HHPID Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getStr('HHPID').lower() in ['','none']:
            return self.printLog('#HHPID','No HHPID file to load')
        hdb = self.db().addTable(self.getStr('HHPID'),mainkeys='auto',datakeys='All',name='HHPID')
        dropfields = ['#Tax ID 1','Tax ID 2','product accession.version 2','last update timestamp']
        for field in dropfields:
            hdb.dropField(field)
        # (old field name, new field name), applied in this order
        renames = [('Gene ID 1','EntrezHIV'),
                   ('product accession.version 1','AccHIV'),
                   ('product name 1','HIV'),
                   ('Interaction short phrase','Interaction'),
                   ('Gene ID 2','Entrez'),
                   ('product name 2','Description'),
                   ('PubMed ID (PMID) list','PMID')]
        for (oldname,newname) in renames:
            hdb.renameField(oldname,newname)
        for itype in rje.sortKeys(hdb.index('Interaction')):
            self.printLog('#HHPID','%s => %s entries' % (itype, len(hdb.index('Interaction')[itype])))
        keeptypes = ['binds','complexes with','interacts with']
        hdb.dropEntriesDirect('Interaction',keeptypes,inverse=True)
        return True
    except:
        self.errorLog('%s.loadHHPID error' % self)
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.

    Loads pairwise PPI data from a hard-coded file, filters out interactions that look like complex
    membership, and loads the sequence list from a hard-coded fasta file.

    Side effects: sets self.dict['PPI'] ({hub:[spokes]}), self.dict['Gene2Seq'], self.dict['Seq2Gene'],
    self.dict['SeqObj'] and self.obj['SeqList'].
    << Returns True if setup completed, False if an error was logged.
    '''
    try:### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
        self.progLog('\r#PPI','Loading pairwise data...')
        pairwise = rje.dataDict(self,ppipairwise,['Hub','Spoke'],['Spoke','SpokeSeq','Evidence'])
        gene2seq = {}; seq2gene = {}
        fullppi = {}; px = 0.0; ptot = len(pairwise); ppix = 0      # fullppi = {hub:{spoke:evidence}}
        # Keys are 'hub<TAB>spoke'. Iterating a sorted key list allows entries to be popped as they are used.
        for pair in rje.sortKeys(pairwise):
            self.progLog('\r#PPI','Processing full pairwise PPI: %.2f%%' % (px/ptot)); px += 100.0
            [hub,spoke] = string.split(pair,'\t')
            if spoke not in gene2seq:
                sseq = pairwise[pair]['SpokeSeq']
                # seq2gene is keyed on the part of the sequence name before '__'
                gene2seq[spoke] = sseq; seq2gene[string.split(sseq,'__')[0]] = spoke
            if hub not in fullppi: fullppi[hub] = {}
            if spoke not in fullppi[hub]: fullppi[hub][spoke] = pairwise.pop(pair)['Evidence']; ppix += 1
        # NOTE(review): ppix is halved for reporting, presumably because each PPI is listed in both
        # directions - confirm against the pairwise file
        self.printLog('\r#PPI','Processed full pairwise PPI: %s genes; %s ppi.' % (rje.integerString(len(fullppi)),rje.integerString(ppix/2)))
        ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
        goodppi = self.loadFromFile(goodppifile,chomplines=True)    # Evidence types that are always accepted
        self.dict['PPI'] = {}
        px = 0.0; ptot = len(fullppi); fppix = ppix; ppix = 0
        for hub in fullppi:
            self.progLog('\r#PPI','Filtering complexes: %.2f%% (%s hubs; %s ppi)' % (px/ptot,rje.integerString(len(self.dict['PPI'])),rje.integerString(ppix))); px +=100.0
            self.dict['PPI'][hub] = []
            for spoke in fullppi[hub]:
                # Accept the spoke outright if its evidence string contains an accepted type
                goodspoke = False
                for ptype in goodppi:
                    if rje.matchExp(':(%s)($|\|)' % ptype, fullppi[hub][spoke]): goodspoke = True; break
                if goodspoke: self.dict['PPI'][hub].append(spoke); continue
                # Otherwise reject it if any other spoke of this hub also interacts with it
                goodspoke = True
                for spoke2 in fullppi[hub]:
                    if spoke2 in [hub,spoke]: continue
                    if spoke2 in fullppi[spoke]: goodspoke = False; break
                if goodspoke: self.dict['PPI'][hub].append(spoke)
            ppix += len(self.dict['PPI'][hub])
            if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)     # Drop hubs left with no spokes
        self.printLog('\r#PPI','Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' % (rje.integerString(len(fullppi)),rje.integerString(len(self.dict['PPI'])),rje.integerString(fppix/2),rje.integerString(ppix/2)))
        ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
        # 'autoload=T' comes after self.cmd_list and the other settings come before it
        scmd = ['accnr=F','seqnr=F','seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
        seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log,scmd)
        self.dict['SeqObj'] = seqlist.seqNameDic('Max')
        self.dict['Gene2Seq'] = gene2seq; self.dict['Seq2Gene'] = seq2gene
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def readSLiMSearch(self):   ### Reads SLiMSearch results into data dictionary
    '''
    Reads SLiMSearch results into data dictionary.

    Needs both '<ResFile>.summary.csv' and '<ResFile>.csv'. If either is missing, the result of
    self.errorLog() is returned. Motifs whose summary 'N_Occ' is at least self.stat['MinOcc'] are passed
    to self.readSLiMSearchOcc(). Other errors are logged rather than raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        resfile = self.info['ResFile']
        sumfile = '%s.summary.csv' % resfile
        occfile = '%s.csv' % resfile
        if not os.path.exists(sumfile):
            return self.errorLog('No Summary file "%s"!' % sumfile,printerror=False)
        if not os.path.exists(occfile):
            return self.errorLog('No Occurrence file "%s"!' % occfile,printerror=False)
        ### ~ [2] Read Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        esum = rje.dataDict(self,sumfile,mainkeys=['Motif'],datakeys='All',getheaders=False)
        # Motifs with enough occurrences, in sorted key order. N_Occ is parsed as a base-10 integer.
        occmotifs = [motif for motif in rje.sortKeys(esum)
                     if int(esum[motif]['N_Occ'],10) >= self.stat['MinOcc']]
        ### ~ [3] Read Occurrences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#MOTIF','%d motifs with N_Occ >= MinOcc (%d)' % (len(occmotifs),self.stat['MinOcc']))
        self.readSLiMSearchOcc(occmotifs)
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
def expectedCodonUsage(self):   ### Calculate expected codon usage from Frequency data
    '''
    Calculate expected codon usage from Frequency data.

    Uses the 'Code', 'Codons' and 'NT' tables from self.db() and builds/saves three tables:
    - 'NTPos': a copy of 'NT' reshaped to one entry per position and summed over sequences.
    - 'Freq': per-sequence codon usage relative to the other codons for the same amino acid, combined with
      the sequence's nucleotide data.
    - 'Expected': a copy of 'Codons' in which each value is replaced by the amino acid's total spread over
      its codons in proportion to the product of the nucleotide values of the 'NTPos' entry keyed '3'.
    Errors are logged rather than raised. No return value.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        aacode = self.db('Code').index('AA')    # Used below as {amino acid: [codons]}
        nt = ['C','A','G','U']; codons = rje.sortKeys(rje_sequence.genetic_code)
        cdb = self.db('Codons'); ndb = self.db('NT')
        nsumdb = self.db().copyTable(ndb,'NTPos',replace=True)
        nsumdb.dropField('Len')
        # Give the overall fields the same '<nt>|<pos>' naming as the positional ones before reshaping
        for n in ['C','A','G','U']: nsumdb.renameField(n,'%s|All' % n)
        nsumdb.reshapeLong('Pos',reshape=['C','A','G','U'])
        nsumdb.compress(['Pos'],{'Pos':'str','Seq':'str'},default='sum')
        nsumdb.dropField('Seq'); nsumdb.addField('Total')
        for entry in nsumdb.entries():
            # Take the non-nucleotide keys out while rje.dictFreq() processes the entry in place
            pos = entry.pop('Pos'); entry.pop('Total')
            rje.dictFreq(entry)     # NOTE(review): presumably converts values to frequencies and adds 'Total' - confirm
            entry['Pos'] = pos
        nsumdb.saveToFile()
        nexentry = nsumdb.data('3')     # Entry keyed '3'; its nucleotide values drive the expected values below
        fdb = self.db().addEmptyTable('Freq',['Seq','Len']+nt+codons+['Total'],['Seq'])
        edb = self.db().copyTable(cdb,'Expected',replace=True)
        ### ~ [2] Calculate Frequencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        x = 0.0; etot = cdb.entryNum()
        for oldentry in cdb.entries():
            self.progLog('\r#FREQ','Calculating Frequencies: %.2f%%' % (x/etot)); x += 100.0
            entry = rje.combineDict({},oldentry)    # Work on a copy so the 'Codons' table is not altered
            seq = entry['Seq']; entry['Total'] = entry.pop('Len')
            exentry = edb.data(seq)     # 'Expected' table entry, updated in place
            ntentry = rje.combineDict({},ndb.data()[seq])
            ntentry.pop('Seq'); ntentry.pop('Len')
            rje.dictFreq(ntentry)
            ntentry['Len'] = ntentry.pop('Total')   # The 'Total' key left by dictFreq is stored as 'Len'
            for aa in aacode:
                ax = 0.0; ex = 0.0      # Observed total and summed nucleotide-product for this amino acid
                for codon in aacode[aa]:
                    ax += entry[codon]
                    exentry[codon] = nexentry[codon[0]] * nexentry[codon[1]] * nexentry[codon[2]]
                    ex += exentry[codon]
                for codon in aacode[aa]:
                    # Relative usage: 1.0 means the codon is used as often as an even split would give
                    if ax: entry[codon] = len(aacode[aa]) * entry[codon] / ax
                    else: entry[codon] = 0.0
                    # NOTE(review): raises ZeroDivisionError (caught below) if ex is 0 - confirm this cannot occur
                    exentry[codon] = ax * (exentry[codon] / ex)
            fdb.addEntry(rje.combineDict(entry,ntentry))
        self.printLog('\r#Freq','Frequencies calculated for %s entries' % rje.iStr(etot))
        fdb.saveToFile(); edb.saveToFile()
    except: self.errorLog('%s.expectedCodonUsage error' % self)
def reduceGO(self):     ### Reduce GO terms to those with enough sequences
    '''
    Reduce GO terms to those with enough sequences.

    Removes every GO term whose gene list (under the 'EnsGO' key) is shorter than self.stat['MinOcc'] or
    longer than self.stat['MaxGenes']. The term is also removed from each of its genes, and genes left with
    no terms are dropped. A problem with one term is logged and the remaining terms are still processed.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        go = self.obj['GO']
        minocc = self.stat['MinOcc']
        maxocc = self.stat['MaxGenes']
        gokey = 'EnsGO'
        ### ~ [2] ~ Reduce GO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for goid in rje.sortKeys(go.dict['GO']):
            try:
                idgenes = go.getGOGenes(goid,gokey)
                self.deBug('%s: %s' % (goid,idgenes))
                if minocc <= len(idgenes) <= maxocc: continue   # Enough genes, and not too many: keep
                go.dict['GO'].pop(goid)
                for gene in idgenes:
                    genegos = go.dict[gokey][gene]
                    genegos.remove(goid)
                    if not genegos: go.dict[gokey].pop(gene)
            except:
                self.errorLog('GOER.reduceGO(%s) problem' % goid)
    except:
        self.errorLog(rje_zen.Zen().wisdom())
def topTerms(self,slimx=20,parents=False,total='Total',countkey='counts'):  ### Selects top terms for GO slim set
    '''
    Selects top terms for GO slim set.
    >> slimx:int [20] = Desired min. number of terms for each GO domain.
    >> parents:bool [False] = Whether parents and children both allowed in list
    >> total:str ['Total'] = Sample containing Total counts for assessment
    >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
    - self.go(id)[countkey] = {Sample:count}
    << returns a list of GO IDs that meet criteria. Returns [] if an error is logged.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        terms = []                              # Combined list of selected terms
        dom = {'cc':{},'bp':{},'mf':{}}         # Dictionary of {domain:{count:[IDs]}}
        for goid in self.go():
            # 'count' is deliberately the same variable as in the selection loop below: the log line after
            # that loop reports whichever value was assigned last.
            count = self.go(goid)[countkey][total]
            domain = self.go(goid)['type']
            if count in dom[domain]: dom[domain][count].append(goid)
            else: dom[domain][count] = [goid]
        ### ~ [2] ~ Generate Top Terms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.deBug(dom)
        for domain in dom:
            dterms = []                         # Terms for this domain only
            dkeys = rje.sortKeys(dom[domain])   # Counts, low to high
            dkeys.reverse()                     # Counts, high to low
            dx = 0.0
            dtot = len(dkeys)
            while dkeys and len(dterms) < slimx:    # Add the next-highest count until there are enough terms
                self.deBug('%s: %s' % (domain,dterms))
                self.progLog('#TOP','Generating top %d %s terms: %.1f%%' % (slimx,domain,dx/dtot))
                dx += 100.0
                count = dkeys.pop(0)
                dterms.extend(dom[domain][count])
                if not parents:
                    # Remove any selected term that is a parent of another selected term. The scan runs over
                    # a snapshot, so terms removed along the way are skipped when reached.
                    for goid in list(dterms):
                        if goid not in dterms: continue     # Previously-removed parent
                        for par in self.parents(goid):
                            if par in dterms: dterms.remove(par)
            self.printLog('\r#TOP','Identified %s top %s terms: >= %s genes' % (rje.iLen(dterms),domain,rje.iStr(count)))
            terms += dterms
        self.deBug(terms)
        return terms
    except:
        self.errorLog('Major problem with GO.topTerms()')
        return []
def output(self, seq, outfile, occdata=[]):    ### Output to file
    '''
    Output to file.
    >> seq:Sequence object providing shortName(), getSequence(), info['Description'], dict['PlotStat'] and
        obj['Entry'].
    >> outfile:str = File to write. The 'Output' line holds this name without its last '.'-separated part.
    >> occdata:list [] = Motif occurrence dictionaries with 'Pattern', 'Start_Pos' and 'End_Pos' keys.
        Only read, never modified.
    << No return value. Nothing is written if self.opt['OccOnly'] is set and occdata is empty.
       Errors are logged rather than raised.
    '''
    try:
        ### ~ [1] ~ Basic Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['OccOnly'] and not occdata: return
        odata = [
            'Name\t%s' % (seq.shortName()),
            'Sequence\t%s' % (seq.getSequence(gaps=False)),
            'Output\t%s' % ('.'.join(outfile.split('.')[:-1])),
            'RE\t%s' % (','.join(self.list['PlotRE'])),
            'TrueELMs\tY',
            'Description\t%s' % (seq.info['Description']),
            '',
        ]
        ### ~ [2] ~ PlotStats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for plot in rje.sortKeys(seq.dict['PlotStat']):
            odata.append('Plot\t%s\t%s' % (plot, ', '.join(seq.dict['PlotStat'][plot])))
        if seq.dict['PlotStat']: odata.append('')
        ### ~ [3] ~ PlotFT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if seq.obj['Entry']:
            for ft in seq.obj['Entry'].list['Feature']:
                if ft['Type'] in self.list['PlotFT']:
                    odata.append('Region\t%s %s\t%s:%s' % (ft['Type'], ft['Desc'], ft['Start'], ft['End']))
            odata.append('')
        ### ~ [4] ~ MotifOcc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if occdata:
            for occ in occdata:
                odata.append('Motif\t%s\t%s:%s' % (occ['Pattern'], occ['Start_Pos'], occ['End_Pos']))
        ### ~ [5] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Close the handle explicitly so the data is flushed before the "saved" message is logged
        OUT = open(outfile, 'w')
        try: OUT.write('\n'.join(odata))
        finally: OUT.close()
        self.printLog('#PLOT', 'SeqPlot output saved as %s' % (outfile))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
def checkSeqList(self):     ### Check sequence integrity
    '''
    Check sequence integrity.

    Lists the IDs in self.dict['PPI'] and in self.list['Pillars'] that are not keys of self.dict['SeqDict'].
    The missing IDs are written one per line to 'yeast.ppi.missing.txt' and 'yeast.pillar.missing.txt'.
    Raises: any error is logged and then re-raised.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqdict = self.dict['SeqDict']
        ### ~ [2] Check PPI data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        missing = []
        self.progLog('\r#MISS', 'Checking PPI IDs: %d missing' % len(missing))
        for p in rje.sortKeys(self.dict['PPI']):
            if p not in seqdict: missing.append(p)
            self.progLog('\r#MISS', 'Checking PPI IDs: %d missing' % len(missing))
        self.printLog('\r#MISS', 'Checking PPI IDs complete: %d missing' % len(missing))
        # Handles are closed explicitly so the lists are fully on disk when the method returns
        MISSFILE = open('yeast.ppi.missing.txt', 'w')
        try: MISSFILE.write('\n'.join(missing))
        finally: MISSFILE.close()
        ### ~ [3] Check Pillar data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        missing = []
        self.progLog('\r#MISS', 'Checking Pillar IDs: %d missing' % len(missing))
        for pillar in self.list['Pillars']:
            for p in pillar:
                if p not in seqdict: missing.append(p)
            self.progLog('\r#MISS', 'Checking Pillar IDs: %d missing' % len(missing))
        self.printLog('\r#MISS', 'Checking Pillar IDs complete: %d missing' % len(missing))
        MISSFILE = open('yeast.pillar.missing.txt', 'w')
        try: MISSFILE.write('\n'.join(missing))
        finally: MISSFILE.close()
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible