def readSLiMSearchOcc(self,motifs=[]):  ### Reads SLiMSearch results into data dictionary
    '''
    Reads SLiMSearch occurrence results into self.dict['Occ'].
    >> motifs:list [] = Motif names to keep; occurrences of any other motif are skipped.
    Populates self.dict['Occ'] = {motif:{type:{gene:[occdata]}}}, where type is
    'ELM' for real motifs, or 'Rev'/'Scr' for reversed/scrambled control motifs.
    '''
    try:### ~ [1] Read ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not motifs:
            # Without target motifs every occurrence would be filtered out below,
            # so set the empty result dictionary and return rather than parse the file.
            self.printLog('#OCC','Cannot process occurrences for No motifs!')
            self.dict['Occ'] = {}
            return
        occfile = '%s.csv' % self.info['ResFile']
        delimit = rje.delimitFromExt(filename=occfile)
        data = rje.dataDict(self,occfile,mainkeys=['Motif','Seq','Start_Pos','End_Pos'],
                            datakeys=string.split('Seq,Desc,Start_Pos,End_Pos,Cons,HomNum,GlobID,LocID,Hyd,SA',','))
        self.dict['Occ'] = {}
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (mx,ox,otot) = (0,0.0,len(data))    # mx = motifs kept; ox/otot = progress %
        for occ in data:
            self.progLog('\r#OCC','Processing occurrences (%d motifs): %.2f%%' % (mx,ox/otot)); ox += 100.0
            #x#self.deBug('%s vs MinHom %d' % (data[occ],self.stat['MinHom']))
            if string.atoi(data[occ]['HomNum']) < self.stat['MinHom']: continue     # Too few homologues
            (motif,seq,start,end) = string.split(occ,delimit)
            if motif not in motifs: continue
            try:
                # Occurrences are only kept for genes with a GO annotation mapping.
                gene = rje.matchExp('gene:(\S+)\]',data[occ]['Desc'])[0]
                self.deBug('%s:%s' % (gene,self.ensGO(gene)))
                if not self.ensGO(gene): continue
            except: continue    # No gene parsed from description
            # Reversed/scrambled control motifs are filed under the parent motif name.
            if motif[-3:] == 'rev': (motif,mtype) = (motif[:-4],'Rev')
            elif motif[-5:] == 'scram': (motif,mtype) = (motif[:-6],'Scr')
            else: mtype = 'ELM'
            if motif not in self.dict['Occ']: self.dict['Occ'][motif] = {}; mx += 1
            if mtype not in self.dict['Occ'][motif]: self.dict['Occ'][motif][mtype] = {}
            if gene not in self.dict['Occ'][motif][mtype]: self.dict['Occ'][motif][mtype][gene] = []
            self.dict['Occ'][motif][mtype][gene].append(data[occ])
        self.printLog('\r#OCC','Processed %s occurrences: %d motifs with GO-links' % (rje.integerString(otot),mx))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def run(self,setup=True):   ### Main Run Method
    '''
    Main Run Method.
    >> setup:bool [True] = Sets up headers and reads in existing data if present.
    '''
    try:
        ### ~ Setup & Read existing data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if setup: self.setup()
        headers = self.list['Headers']      # NB. appending below mutates self.list['Headers'] in place
        delimit = rje.delimitFromExt(filename=self.info['CardOut'])
        # EnsLoci columns are only output when an EnsLoci file is available.
        if os.path.exists(self.info['EnsLoci']):
            for field in ['EnsLoci','EnsDesc']:
                if field not in headers: headers.append(field)
        rje.delimitedFileOutput(self,self.info['CardOut'],headers,delimit,rje_backup=True)
        ### ~ Read EnsLoci for incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.ensLoci()
        ### ~ Parse data from GeneCards website and/or previously read aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.processGenes(self.list['Genes'])
        self.interactiveUpdate()
        ### ~ Add EnsEMBL EnsLoci data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.addEnsLoci()
        ### ~ Output GeneCards data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.outputCards()
    except:
        self.log.errorLog('Apocalyptic error with GeneCards.run()')
        raise
def outputCards(self):  ### Outputs cards to delimited file
    '''
    Outputs cards to delimited file (appended to self.info['CardOut']).
    Optionally purifies/restricts the gene list to primary symbols first.
    '''
    ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    genelist = self.list['Genes']
    if self.opt['Purify'] and self.opt['Restrict']:
        # Swap aliases for their primary symbols. Iterates over a copy (genelist[0:])
        # because genelist itself is mutated inside the loop.
        for gene in genelist[0:]:
            if self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']: # Replace with symbol
                genelist.remove(gene)
                if self.dict['GeneCard'][gene]['Symbol'] not in genelist: genelist.append(self.dict['GeneCard'][gene]['Symbol'])
    delimit = rje.delimitFromExt(filename=self.info['CardOut'])
    CARDOUT = open(self.info['CardOut'],'a')    # Appending: header assumed written earlier (see run())
    ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    (noens,noloci,ox) = (0,0,0)     # Counts of genes without EnsEMBL gene / EnsLoci data; output counter
    for gene in rje.sortKeys(self.dict['GeneCard']):
        if self.opt['Restrict'] and gene not in genelist: continue
        elif self.opt['Purify'] and self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']: continue
        self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox)); ox += 1
        self.dict['GeneCard'][gene]['Alias'] = gene
        self.dict['GeneCard'][gene]['Species'] = self.info['Species']
        rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,self.dict['GeneCard'][gene])
        if self.dict['GeneCard'][gene]['Symbol'] == gene:   # Not an alias
            # Only primary symbols contribute to the missing-EnsEMBL/EnsLoci tallies.
            if 'EnsEMBL' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsEMBL']: noens += 1
            if 'EnsLoci' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsLoci']: noloci += 1
    CARDOUT.close()
    # NOTE(review): reports len(self.list['Genes']) rather than ox, the number of genes
    # actually output after Restrict/Purify filtering - confirm which count is intended.
    self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (len(self.list['Genes']),self.info['CardOut']))
    self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
def loadFeatures(self,ftfile):  ### Loads features from given file
    '''
    Loads features from given delimited file into self.dict['Features'].
    >> ftfile:str = Feature file; first field is the sequence ID, plus feature,
       (ft_)start, (ft_)end and description fields (case-insensitive headers).
    self.dict['Features'][id] is a list of {'Type','Start','End','Desc'} dictionaries.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if ftfile in ['','none']: return
        # Fixed missing format argument: message previously printed a literal "%s".
        if not os.path.exists(ftfile): return self.printLog('#ERR','Features file "%s" missing' % ftfile)
        delimit = rje.delimitFromExt(filename=ftfile)
        ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        headers = rje.readDelimit(open(ftfile,'r').readline(),delimit)
        mainkeys = [headers[0]]     # First field is the sequence identifier
        hmap = {}                   # Case-insensitive header lookup: lower case -> actual header
        for h in headers: hmap[h.lower()] = h
        pos = ''                    # Leader for start/end positions
        if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
        for h in ['feature','%sstart' % pos,'%send' % pos,'description']:
            if h not in hmap: return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile))
            mainkeys.append(hmap[h])
        mainkeys.remove(hmap['description'])    # Description is data, not part of the key
        ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ftdata = rje.dataDict(self,ftfile,mainkeys,['description'],delimit,headers,lists=True)
        (mx,mtot,fx) = (0.0,len(ftdata),0)      # Progress % numerator/denominator; feature count
        for mainkey in rje.sortKeys(ftdata):
            self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot)); mx += 100.0
            (seqid,ft,start,end) = string.split(mainkey,delimit)
            if seqid == mainkeys[0]: continue   # Skip re-read header row
            if seqid not in self.dict['Features']: self.dict['Features'][seqid] = []
            for desc in ftdata[mainkey][hmap['description']]:
                fx += 1
                self.dict['Features'][seqid].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc})
        self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile))
    except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
def loadFeatures(self, ftfile):  ### Loads features from given file
    '''
    Loads features from given delimited file into self.dict['Features'].
    >> ftfile:str = Feature file; first field is the sequence ID, plus feature,
       (ft_)start, (ft_)end and description fields (case-insensitive headers).
    self.dict['Features'][id] is a list of {'Type','Start','End','Desc'} dictionaries.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if ftfile in ['', 'none']: return
        # Fixed missing format argument: message previously printed a literal "%s".
        if not os.path.exists(ftfile):
            return self.printLog('#ERR', 'Features file "%s" missing' % ftfile)
        delimit = rje.delimitFromExt(filename=ftfile)
        ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        headers = rje.readDelimit(open(ftfile, 'r').readline(), delimit)
        mainkeys = [headers[0]]     # First field is the sequence identifier
        hmap = {}                   # Case-insensitive header lookup: lower case -> actual header
        for h in headers: hmap[h.lower()] = h
        pos = ''                    # Leader for start/end positions
        if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
        for h in ['feature', '%sstart' % pos, '%send' % pos, 'description']:
            if h not in hmap:
                return self.printLog('#ERR', 'No %s field detected in "%s" features file' % (h, ftfile))
            mainkeys.append(hmap[h])
        mainkeys.remove(hmap['description'])    # Description is data, not part of the key
        ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ftdata = rje.dataDict(self, ftfile, mainkeys, ['description'], delimit, headers, lists=True)
        (mx, mtot, fx) = (0.0, len(ftdata), 0)  # Progress % numerator/denominator; feature count
        for mainkey in rje.sortKeys(ftdata):
            self.progLog('\r#FT', 'Loading features from %s: %.2f%%' % (ftfile, mx / mtot))
            mx += 100.0
            (seqid, ft, start, end) = string.split(mainkey, delimit)
            if seqid == mainkeys[0]: continue   # Skip re-read header row
            if seqid not in self.dict['Features']: self.dict['Features'][seqid] = []
            for desc in ftdata[mainkey][hmap['description']]:
                fx += 1
                self.dict['Features'][seqid].append({'Type': ft, 'Start': int(start), 'End': int(end), 'Desc': desc})
        self.printLog('\r#FT', 'Loaded %s features for %s IDs from %s' %
                      (rje.integerString(fx), rje.integerString(len(self.dict['Features'])), ftfile))
    except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
def run(self):  ### Main run method
    '''
    Main run method: loads sequences (and optional occurrence data), calculates
    per-residue plot statistics and outputs one plot file per sequence/dataset.
    '''
    try:### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Basefile is used as a filename prefix; ensure it ends with '.' when set.
        if self.info['Basefile'].lower() in ['','none']: self.info['Basefile'] = ''
        elif self.info['Basefile'][-1] != '.': self.info['Basefile'] += '.'
        self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
        self.list['PlotFT'] = string.split(string.join(self.list['PlotFT']).upper())
        if self.info['OccFile'].lower() not in ['','none']:
            self.info['Delimit'] = rje.delimitFromExt(filename=self.info['OccFile'])
            # self.dict['OccData'] = {seq:{dataset:[occurrence dictionaries]}}
            self.dict['OccData'] = {}
            occdata = rje.dataDict(self,self.info['OccFile'],['Seq','Dataset','Pattern','Start_Pos','End_Pos'],['Seq','Dataset','Pattern','Start_Pos','End_Pos'])
            for key in rje.sortKeys(occdata):
                seq = occdata[key].pop('Seq')
                if seq not in self.dict['OccData']: self.dict['OccData'][seq] = {}
                dataset = occdata[key].pop('Dataset')
                if dataset not in self.dict['OccData'][seq]: self.dict['OccData'][seq][dataset] = []
                self.dict['OccData'][seq][dataset].append(occdata[key])
            self.printLog('#OCC','Loaded data for %s occurrences in %s sequences' % (rje.integerString(len(occdata)),rje.integerString(len(self.dict['OccData']))))
            # Restrict the sequence list to sequences with occurrence data.
            self.obj['SeqList'].autoFilter(['GoodSeq=%s' % string.join(rje.sortKeys(self.dict['OccData']),',')])
        ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['PlotStat'] = string.split(string.join(self.list['PlotStat']).lower())
        # slimcalc is only constructed when conservation stats are requested.
        if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']: slimcalc = rje_slimcalc.SLiMCalc(self.log,self.cmd_list)
        seqdict = self.obj['SeqList'].seqNameDic()
        for name in rje.sortKeys(seqdict):
            if self.opt['OccOnly'] and not name in self.dict['OccData']: continue
            seq = seqdict[name]
            sequence = seq.getSequence(gaps=False)
            seq.dict['PlotStat'] = {}   # stat name -> list of per-residue values
            if 'sa' in self.list['PlotStat']: seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence,returnlist=True)
            if 'hyd' in self.list['PlotStat']: seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence,returnlist=True)
            if 'dis' in self.list['PlotStat']: seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
            if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']:
                slimcalc.relConListFromSeq(seq,slimcalc.stat['RelConWin'],store=True)
                try:
                    seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                    seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                except: self.printLog('#CONS','No conservation stats for %s' % name)
            self.printLog('#STAT','PlotStats calculated for %s' % name)
            for stat in seq.dict['PlotStat']:
                # Window-smooth all stats except relative conservation, then rescale for plotting.
                if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0: seq.dict['PlotStat'][stat] = self.plotWin(seq.dict['PlotStat'][stat])
                seq.dict['PlotStat'][stat] = self.convertStat(seq.dict['PlotStat'][stat])
            self.printLog('#STAT','PlotStats converted for %s' % name)
            ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if name in self.dict['OccData']:
                # One plot file per dataset with occurrences for this sequence.
                for dataset in self.dict['OccData'][name]:
                    ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'],dataset,seq.info['AccNum'])
                    self.output(seq,ofile,self.dict['OccData'][name][dataset])
            else: self.output(seq,'%s%s.plot.txt' % (self.info['Basefile'],seq.info['AccNum']))
        return
    except: self.errorLog(rje_zen.Zen().wisdom())
def restOutput(self,outfmt=None,maxparsesize=0,asjson=False):  ### Returns rest output for outfmt
    '''
    Returns rest output for outfmt.
    >> outfmt:str [None] = REST output key; defaults to self.getStrLC('Rest').
    >> maxparsesize:int [0] = Max file size (bytes) to read back; 0 = no limit.
    >> asjson:bool [False] = Whether to wrap returned text as JSON.
    '''
    if not outfmt: outfmt = self.getStrLC('Rest')
    # NOTE(review): the two jsonText() calls below discard their return value -
    # presumably these should be `return self.jsonText(...)`; confirm intent.
    if not outfmt: self.jsonText('No REST output',asjson)
    if outfmt in self.dict['Output']:
        # Output entry may be a filename (first line) rather than literal text.
        rfile = string.split(self.dict['Output'][outfmt],'\n')[0]
        if rje.exists(rfile):
            fext = string.split(rfile,'.')[-1]
            if fext in ['png']:
                self.debug(rfile)
                self.jsonText(rfile,asjson)
            nbytes = os.path.getsize(rfile)
            if nbytes > maxparsesize > 0:   # Too large to parse
                otext = '%s is too large to return (%s > %s)' % (os.path.basename(rfile),rje.humanByteSize(nbytes),rje.humanByteSize(maxparsesize))
                try: jobid = self.dict['Output']['jobid']
                except: jobid = None
                # Point the caller at the retrieval URL for the full output.
                resturl = '%sretrieve&jobid=%s&rest=%s[&password=X]' % (self.getStr('RestURL'),jobid,outfmt)
                if not jobid or outfmt == self.getStrLC('Rest'): return self.jsonText('ERROR: %s' % (otext),asjson)
                else: return self.jsonText('%s in full output. Try %s.' % (otext,resturl),asjson)
            else:
                delimit = rje.delimitFromExt(filename=rfile,write=False)
                if asjson and delimit in [',','\t']:
                    # Delimited files are returned as a JSON list of row lists.
                    jtext = []
                    for rline in open(rfile,'r').readlines(): jtext.append(json.dumps(rje.readDelimit(rline,delimit)))
                    return '[%s]' % string.join(jtext,',\n ')
                #!# Add json parsing of fasta files?
                else:
                    outtxt = open(rfile,'r').read()
                    if not outtxt.endswith('\n'): outtxt += '\n'
                    return self.jsonText(outtxt,asjson)
        elif asjson and outfmt in self.dict['Outfile']:
            pass #!# Sort out json formatting here based on file extension!
        return self.dict['Output'][outfmt]
    elif outfmt in ['parse','format']:
        intro = '<pre>%s</pre>\n\n' % self.restOutput('intro')
        return self.jsonText(intro,asjson)
    elif outfmt in ['default','full']: return self.jsonText(self.restFullOutput(maxparsesize),asjson)
    elif outfmt in ['restkeys','outputs']: return string.join(self.list['RestKeys']+[''],'\n')
    return self.jsonText('No %s output generated.' % outfmt,asjson)
def mapPhosByBLAST(self,fasfile):   ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
    '''
    BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).
    >> fasfile:str = Fasta file of query sequences.
    Results are appended to self.info['PhosRes'] and alignments saved under PhosALN/.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F']
        qseqlist = rje_seq.SeqList(self.log,scmd)
        qdict = qseqlist.seqNameDic()   # Query name -> Sequence object
        ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        basefile = rje.baseFile(fasfile)
        if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile
        headers = ['Name','Pos','AA','PELM','PELMPos','Evidence']
        delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes']))
        rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True)
        ppath = rje.makePath('PhosALN')
        rje.mkDir(self,ppath)
        ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F'])
        pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile})
        pblast.setStat({'HitAln':pblast.stat['OneLine']})
        pblast.opt['Complexity Filter'] = False
        pblast.formatDB(force=False)
        ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
        # Similarity cutoffs are given as fractions or percentages; normalise to 0-100.
        for g in ['ID','Hom']:
            if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
            self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g])
        ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pblast.blast(use_existing=True,log=True)    # BLAST
        pblast.readBLAST(gablam=True)               # Read in
        while pblast.search:
            ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            search = pblast.search.pop(0)
            qseq = qdict[search.info['Name']]
            idlist = []     # Hits treated as identical (same sequence or >= IDSim)
            qlen = qseq.aaLen()
            hitdict = search.hitSeq(self.obj['SeqList'])
            aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F'])
            aln.seq = [qseq]
            pdict = {}          # Dictionary of {hseq:[poslist]}
            rdict = {qseq:0}    # Dictionary of {hseq:res}
            for hit in search.hit[0:]:
                hseq = hitdict[hit]
                pdict[hseq] = []
                for pos in rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos)
                if hit.info['Name'] == search.info['Name']:
                    # Self-hit: sanity-check the sequences match, then treat as identity.
                    if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False):
                        self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name'])
                    idlist.append(qseq)
                    pdict[qseq] = pdict.pop(hseq)
                    continue
                gdict = hit.globalFromLocal(qlen)
                qvh = float(100 * gdict['Query'][gkey]) / float(qlen)   # % query identity vs hit
                if qvh < self.stat['HomSim']:
                    pdict.pop(hseq)     # Too dissimilar to use for homology mapping
                    continue
                aln.seq.append(hseq)
                if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']:
                    idlist.append(hseq)
                    rdict[hseq] = 0
            aln.muscleAln()   #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
            aln._addSeq('PhosAln','-' * qseq.seqLen())  # Extra row to mark mapped sites
            aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])
            ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # NOTE(review): leftover Python 2 debug print - consider removing or routing via deBug().
            print '>>>\n', aln.seq, pdict.keys(), rdict.keys()
            for a in range(qseq.seqLen()):      # Walk alignment columns, tracking ungapped residue counts
                if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1
                for hseq in pdict:
                    if hseq.info['Sequence'][a] == '-': continue
                    if hseq != qseq: rdict[hseq] += 1
                    if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][a] == hseq.info['Sequence'][a]:  # Phosphosite
                        pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qseq.info['Sequence'][a],
                                 'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'}
                        if hseq == qseq: pdata['Evidence'] = 'Self'
                        elif hseq in idlist: pdata['Evidence'] = 'ID'
                        rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata)
                        self.addPhos(aln.seq[-1],a,pdata['Evidence'])
            ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            aln.saveFasta()
        # Align hits for each > X %ID
        # Map phosphosites onto alignment and output
        return
    except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
def run(self):  ### Main run method
    '''
    Main run method: loads sequences (and optional occurrence data), calculates
    per-residue plot statistics and outputs one plot file per sequence/dataset.
    '''
    try:
        ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Basefile is used as a filename prefix; ensure it ends with '.' when set.
        if self.info['Basefile'].lower() in ['', 'none']: self.info['Basefile'] = ''
        elif self.info['Basefile'][-1] != '.': self.info['Basefile'] += '.'
        self.obj['SeqList'] = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
        self.list['PlotFT'] = string.split(string.join(self.list['PlotFT']).upper())
        if self.info['OccFile'].lower() not in ['', 'none']:
            self.info['Delimit'] = rje.delimitFromExt(filename=self.info['OccFile'])
            # self.dict['OccData'] = {seq:{dataset:[occurrence dictionaries]}}
            self.dict['OccData'] = {}
            occdata = rje.dataDict(self, self.info['OccFile'],
                                   ['Seq', 'Dataset', 'Pattern', 'Start_Pos', 'End_Pos'],
                                   ['Seq', 'Dataset', 'Pattern', 'Start_Pos', 'End_Pos'])
            for key in rje.sortKeys(occdata):
                seq = occdata[key].pop('Seq')
                if seq not in self.dict['OccData']: self.dict['OccData'][seq] = {}
                dataset = occdata[key].pop('Dataset')
                if dataset not in self.dict['OccData'][seq]: self.dict['OccData'][seq][dataset] = []
                self.dict['OccData'][seq][dataset].append(occdata[key])
            self.printLog('#OCC', 'Loaded data for %s occurrences in %s sequences' %
                          (rje.integerString(len(occdata)), rje.integerString(len(self.dict['OccData']))))
            # Restrict the sequence list to sequences with occurrence data.
            self.obj['SeqList'].autoFilter(['GoodSeq=%s' % string.join(rje.sortKeys(self.dict['OccData']), ',')])
        ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['PlotStat'] = string.split(string.join(self.list['PlotStat']).lower())
        # slimcalc is only constructed when conservation stats are requested.
        if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']:
            slimcalc = rje_slimcalc.SLiMCalc(self.log, self.cmd_list)
        seqdict = self.obj['SeqList'].seqNameDic()
        for name in rje.sortKeys(seqdict):
            if self.opt['OccOnly'] and not name in self.dict['OccData']: continue
            seq = seqdict[name]
            sequence = seq.getSequence(gaps=False)
            seq.dict['PlotStat'] = {}   # stat name -> list of per-residue values
            if 'sa' in self.list['PlotStat']:
                seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence, returnlist=True)
            if 'hyd' in self.list['PlotStat']:
                seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence, returnlist=True)
            if 'dis' in self.list['PlotStat']:
                seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
            if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']:
                slimcalc.relConListFromSeq(seq, slimcalc.stat['RelConWin'], store=True)
                try:
                    seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                    seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                except:
                    self.printLog('#CONS', 'No conservation stats for %s' % name)
            self.printLog('#STAT', 'PlotStats calculated for %s' % name)
            for stat in seq.dict['PlotStat']:
                # Window-smooth all stats except relative conservation, then rescale for plotting.
                if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0:
                    seq.dict['PlotStat'][stat] = self.plotWin(seq.dict['PlotStat'][stat])
                seq.dict['PlotStat'][stat] = self.convertStat(seq.dict['PlotStat'][stat])
            self.printLog('#STAT', 'PlotStats converted for %s' % name)
            ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if name in self.dict['OccData']:
                # One plot file per dataset with occurrences for this sequence.
                for dataset in self.dict['OccData'][name]:
                    ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'], dataset, seq.info['AccNum'])
                    self.output(seq, ofile, self.dict['OccData'][name][dataset])
            else:
                self.output(seq, '%s%s.plot.txt' % (self.info['Basefile'], seq.info['AccNum']))
        return
    except: self.errorLog(rje_zen.Zen().wisdom())
def mapPhosByBLAST( self, fasfile ):  ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
    '''
    BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).
    >> fasfile:str = Fasta file of query sequences.
    NOTE(review): this copy appears truncated - it ends inside the hit-processing loop
    (after rdict[hseq] = 0) with no alignment/mapping/output steps and no except clause
    for the opening try. Compare with the complete version of this method earlier in
    the file and confirm which copy is authoritative.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        scmd = self.cmd_list + [
            'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F'
        ]
        qseqlist = rje_seq.SeqList(self.log, scmd)
        qdict = qseqlist.seqNameDic()   # Query name -> Sequence object
        ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        basefile = rje.baseFile(fasfile)
        if self.info['PhosRes'].lower() in ['', 'none']:
            self.info['PhosRes'] = '%s.phosres.tdt' % basefile
        headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence']
        delimit = rje.getDelimit(
            self.cmd_list,
            rje.delimitFromExt(filename=self.info['PhosRes']))
        rje.delimitedFileOutput(self, self.info['PhosRes'], headers, delimit, rje_backup=True)
        ppath = rje.makePath('PhosALN')
        rje.mkDir(self, ppath)
        ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        pblast = rje_blast.BLASTRun(self.log, self.cmd_list + ['formatdb=F'])
        pblast.setInfo({
            'Name': '%s.p.blast' % rje.baseFile(fasfile),
            'DBase': self.info['PELMFas'],
            'InFile': fasfile
        })
        pblast.setStat({'HitAln': pblast.stat['OneLine']})
        pblast.opt['Complexity Filter'] = False
        pblast.formatDB(force=False)
        ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
        # Similarity cutoffs are given as fractions or percentages; normalise to 0-100.
        for g in ['ID', 'Hom']:
            if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
            self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g])
        ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pblast.blast(use_existing=True, log=True)   # BLAST
        pblast.readBLAST(gablam=True)               # Read in
        while pblast.search:
            ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            search = pblast.search.pop(0)
            qseq = qdict[search.info['Name']]
            idlist = []     # Hits treated as identical (same sequence or >= IDSim)
            qlen = qseq.aaLen()
            hitdict = search.hitSeq(self.obj['SeqList'])
            aln = rje_seq.SeqList(
                self.log, self.cmd_list + ['autoload=F', 'autofilter=F'])
            aln.seq = [qseq]
            pdict = {}  # Dictionary of {hseq:[poslist]}
            rdict = {qseq: 0}  # Dictionary of {hseq:res}
            for hit in search.hit[0:]:
                hseq = hitdict[hit]
                pdict[hseq] = []
                for pos in rje.sortKeys(
                        self.dict['PhosphoSites'][hseq.info['AccNum']]):
                    pdict[hseq].append(pos)
                if hit.info['Name'] == search.info['Name']:
                    # Self-hit: sanity-check the sequences match, then treat as identity.
                    if qseq.getSequence(case=False,
                                        gaps=False) != hseq.getSequence(
                                            case=False, gaps=False):
                        self.log.errorLog(
                            'Major problem: Search/Hit sequence mismatch for same sequence "%s"'
                            % hit.info['Name'])
                    idlist.append(qseq)
                    pdict[qseq] = pdict.pop(hseq)
                    continue
                gdict = hit.globalFromLocal(qlen)
                qvh = float(100 * gdict['Query'][gkey]) / float(qlen)   # % query identity vs hit
                if qvh < self.stat['HomSim']:
                    pdict.pop(hseq)     # Too dissimilar to use for homology mapping
                    continue
                aln.seq.append(hseq)
                if (qseq.sameSpec(hseq) or not self.opt['UseSpec']
                    ) and qvh >= self.stat['IDSim']:
                    idlist.append(hseq)
                    rdict[hseq] = 0
def picsi(self):    ### Cleans up cross-species search results
    '''
    Cleans up cross-species search results: reads a summary file of search hits,
    collapses equivalent peptides (I/L and Q/K substitutions, sub-peptides),
    classifies hits (UNIQUE/NR/REDUNDANT/REJECT) and outputs *.clean.tdt.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datafile = self.info['SumFile']
        delimit = rje.delimitFromExt(filename=self.info['SumFile'])
        data = {}           # search:{hit:{???}}
        pep2prot = {}       # search:{peptide:[hits]}
        id2prot = {}        # search:{id:hit}
        prot2desc = {}      # hit:description
        fullpeplist = {}    # search:[all converted peptides]
        pepcon = {}         # search:{pep:longer pep} conversion map
        speclist = []       # List of species codes
        ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True)
        for ikey in rje.sortKeys(indata):
            (search,hitid) = string.split(ikey,delimit)
            prot = indata[ikey]['prot_acc'][0]
            desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','')
            if desc[3:7] == 'Name': desc = desc[9:]     # Strip "RecName: " style leader
            prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc))
            indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'],'|')
            # I/L and Q/K are indistinguishable by mass: collapse before comparing peptides.
            pepconv = string.replace(indata[ikey]['pep_seq'],'I','L')
            pepconv = string.replace(pepconv,'Q','K')
            peplist = rje.sortUnique(string.split(pepconv,'|'))
            indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'],'|')),'|')
            if search not in data:
                data[search] = {}
                pep2prot[search] = {}
                id2prot[search] = {}
                fullpeplist[search] = []
                pepcon[search] = {}
            fullpeplist[search] += peplist
            id2prot[search][hitid] = prot
            spec = string.split(prot,'_')[1]
            if spec not in speclist: speclist.append(spec)
            data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':hitid,'desc':desc,'spec':spec,
                                  'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                  'pep_rem':0}
            try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
            except: data[search][prot]['accnum'] = string.split(prot,'__')[-1]
            for pep in peplist:
                if pep not in pep2prot[search]: pep2prot[search][pep] = []
                pep2prot[search][pep].append(prot)
        ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for search in fullpeplist:
            fullpeplist[search] = rje.sortUnique(fullpeplist[search])
            # Map each peptide that is a substring of a longer peptide onto that longer peptide.
            for pep in fullpeplist[search][0:]:
                for pep2 in fullpeplist[search]:
                    if pep != pep2 and pep in pep2:
                        pepcon[search][pep] = pep2
                        fullpeplist[search].remove(pep)
                        break
            for pep in pepcon[search]:
                # Follow conversion chains to their terminal peptide.
                # Fixed: previously indexed pepcon[pep] (top-level dict keyed by search,
                # not peptide), which could never resolve chains correctly.
                while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
            self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search))
            #self.deBug(pepcon[search])
            #self.deBug(rje.sortKeys(pep2prot[search]))
            pp = 0; pm = 0      # Converted peptides added / removed
            for prot in data[search]:
                for pep in data[search][prot]['conpep'][0:]:
                    if pep in pepcon[search]:
                        newpep = pepcon[search][pep]
                        if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1
                        # Fixed: removal counter previously incremented by 0, so the log
                        # below always reported 0 removals.
                        data[search][prot]['conpep'].remove(pep); pm += 1
                        if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                        if pep in pep2prot[search]: pep2prot[search].pop(pep)
                data[search][prot]['pep_con'] = len(data[search][prot]['conpep'])
            self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm))
        ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for search in pep2prot:
            ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Peptides hitting a query-species protein are removed from other-species hits.
            remx = 0
            for prot in data[search]:
                if data[search][prot]['spec'] != self.info['QrySpec']: continue
                for pep in data[search][prot]['conpep']:
                    for prot2 in pep2prot[search][pep][0:]:
                        if data[search][prot2]['spec'] == self.info['QrySpec']: continue
                        pep2prot[search][pep].remove(prot2)
                        data[search][prot2]['conpep'].remove(pep)
                        data[search][prot2]['pep_rem'] += 1; remx += 1
            self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec']))
            ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Proteins supported by fewer than two peptides are dropped from pep2prot.
            for prot in data[search]:
                if len(data[search][prot]['conpep']) < 2:
                    for pep in data[search][prot]['conpep']:
                        #if pep in pep2prot[search] and prot in pep2prot[search][pep]:
                        pep2prot[search][pep].remove(prot)
            ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ux = 0
            for pep in pep2prot[search]:
                if len(pep2prot[search][pep]) == 1:
                    data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
            self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search))
            ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
            rx = 0
            for prot in data[search]:
                data[search][prot]['pep_red'] = 0   # Redundant peptides found in proteins with unique peptides
                data[search][prot]['pep_nr'] = 0    # Redundant peptides found only in proteins without unique peptides
                for pep in data[search][prot]['conpep']:
                    if pep2prot[search][pep] == [prot]: continue
                    upep = False
                    for prot2 in pep2prot[search][pep]:
                        if data[search][prot2]['pep_uniq']: upep = True; break
                    if upep: data[search][prot]['pep_red'] += 1     # Redundant peptide found in unique protein
                    else: data[search][prot]['pep_nr'] += 1         # Redundant peptide NOT found in unique protein
                if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1
                elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE'
                elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR'
                else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1
                summary[data[search][prot]['class']] += 1
            self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search))
            for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x))
        ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        speclist.sort()
        species = {}
        for spec in speclist:
            try:
                # Look up the full species name for each code from the species table.
                grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read()
                species[spec] = string.split(grep,':')[-4]
                self.printLog('#SPEC','%s = %s' % (spec,species[spec]))
            except: species[spec] = '?'
        ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
        headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
        if self.dict['Acc2Seq']: headers.insert(3,'cluster')
        rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True)
        for search in rje.sortKeys(data):
            if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search])
            for prot in rje.sortKeys(data[search]):
                # Prefer a species name parsed from a GenBank-style description; fall back to table lookup.
                if rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc']):
                    data[search][prot]['species'] = rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])[1]
                else: data[search][prot]['species'] = species[data[search][prot]['spec']]
                rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot])
    except: self.errorLog('Errg')