def sgd2sp(self):  ### Reformats yeast sequence names and outputs new data for GOPHER
    '''
    Reformats yeast sequence names and outputs new data for GOPHER.

    Works on the sequences of self.obj['SeqList'] and the 'XRef' table returned by self.db('XRef'):
    - UniProt entries are read from Basefile.dat if it exists (and Force is off); otherwise they are extracted
      for the accession numbers indexed in the XRef 'UniProt' field and saved to Basefile.dat.
    - Every sequence with SpecCode 'YEAST' gets '[Gene:.. EnsG:.. SGD:.. AccNum:..]' appended to its name, using
      XRef entries whose 'EnsG' matches the sequence AccNum. When a linked UniProt sequence is identical, the first
      word of the name becomes 'UniprotID__AccNum' and self.dict['Rename'][sgd] is set to that accession.
    - Basefile.ygob.fas (all sequences) and Basefile.yeast.fas (yeast only) are saved unless already present.
    - self.list['YeastSeq'] is set from the SeqList accList() of the yeast sequences.
    Errors are logged and then re-raised.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = self.obj['SeqList']
        uniprot = rje_uniprot.UniProt(self.log, self.cmd_list + ['datout=None'])
        xref = self.db('XRef')
        self.dict['Rename'] = {}
        ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        basefile = self.info['Basefile']
        datfile = '%s.dat' % basefile
        reuse_dat = os.path.exists(datfile) and not self.opt['Force']
        if reuse_dat:
            uniprot.readUniProt(datfile, clear=True, cleardata=False)
        else:
            uniprot.readUniProt(clear=True, acclist=rje.sortKeys(xref.index('UniProt')), cleardata=False)
            uniprot.saveUniProt(datfile)
        ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniseq = {}     # {UniProt AccNum: Sequence object}
        for entry in uniprot.entries():
            entryseq = entry.obj['Sequence']
            uniseq[entryseq.info['AccNum']] = entryseq
        self.printLog('\r#USEQ', '%s UniProt Sequences extracted (%s Ensembl AccNum)'
                      % (rje.iStr(len(uniseq)), rje.iStr(len(xref.index('UniProt')))))
        ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        yeast = []      # List of YEAST sequence objects
        progress = 0.0  # Running percentage numerator: goes up by 100 per sequence
        total = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#SEQ', 'Reformatting sequence names: %.2f%%' % (progress / total))
            progress += 100.0
            if seq.info['SpecCode'] != 'YEAST':
                continue
            yeast.append(seq)
            sgd = seq.info['AccNum']
            oldname = seq.info['Name']
            newname = oldname
            try:
                for link in xref.indexEntries('EnsG', sgd):
                    acc = link['UniProt']
                    # Annotated name is kept even if no matching UniProt sequence is found below
                    newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (
                        oldname, link['Gene'], link['EnsG'], link['SGD'], acc or '-')
                    if not acc:
                        continue
                    if acc not in uniseq:
                        self.printLog('\r#UNIERR', 'Unable to find UniProt sequence %s (%s)' % (acc, sgd))
                        continue
                    if uniseq[acc].info['Sequence'] != seq.info['Sequence']:
                        self.printLog('\r#SEQERR', '%s sequence <> %s sequence' % (sgd, acc))
                        continue
                    # Identical sequence: swap the first word of the name for UniprotID__AccNum and stop looking
                    words = newname.split()
                    words[0] = '%s__%s' % (link['UniprotID'], acc)
                    newname = ' '.join(words)
                    self.dict['Rename'][sgd] = acc
                    break
            except:
                self.errorLog('%s problem' % sgd)
            seq.info['Name'] = newname
            seq.extractDetails(gnspacc=True)
        self.printLog('\r#SEQ', 'Reformatting sequence names complete.')
        ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        ygobfas = '%s.ygob.fas' % basefile
        if not rje.exists(ygobfas):
            seqlist.saveFasta(seqfile=ygobfas)
        yeastfas = '%s.yeast.fas' % basefile
        if not rje.exists(yeastfas):
            seqlist.saveFasta(seqs=yeast, seqfile=yeastfas)
        self.list['YeastSeq'] = seqlist.accList(yeast)
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise  # Delete this if method error not terrible
def setup(self):  ### Main class setup method.
    '''
    Main class setup method.

    Looks up the program named by the 'Name' setting, logs its details and full command list, and builds the
    matching program object in self.obj['Prog'] (with self.obj['ProgInfo'] holding its info object).
    If the program is not recognised and the run is interactive (self.i() >= 0), the user may view the help and/or
    enter another program name, in which case setup() is called again.
    << Returns the program object if setup is successful, else False (including on CTRL+C or any logged error).
    '''
    try:
        ### ~ [1] ~ Setup Program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['Prog'] = None
        prog = self.getStrLC('Name')
        if prog in mod:
            info = self.obj['ProgInfo'] = mod[prog].makeInfo()
            self.printLog('#PROG', '%s V%s: %s' % (info.program, info.version, info.description))
            progcmd = rje.getCmdList([], info=info) + self.cmd_list + ['newlog=F']
            out = rje.Out(cmd_list=progcmd)
            out.printIntro(info)
            if self.getBool('Help'):
                progcmd = mod[prog].cmdHelp(info, out, ['help'] + progcmd)
            tidycmd = rje.tidyArgs(progcmd, nopath=self.getStrLC('Rest') and not self.dev(), purgelist=purgelist)
            self.printLog('#CMD', 'Full %s CmdList: %s' % (info.program, rje.argString(tidycmd)), screen=False)
            ## ~ [1a] ~ Make self.obj['Prog'] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # (accepted names, class getter). Getters are lambdas so that only the selected program's module is
            # actually touched, exactly as with a chain of if/elif tests.
            progtable = (
                (('seqlist', 'rje_seqlist'), lambda: rje_seqlist.SeqList),
                (('uniprot', 'rje_uniprot'), lambda: rje_uniprot.UniProt),
                (('taxonomy', 'rje_taxonomy'), lambda: rje_taxonomy.Taxonomy),
                (('tree', 'rje_tree'), lambda: rje_tree.Tree),
                (('xref', 'rje_xref'), lambda: rje_xref.XRef),
                (('seq', 'rje_seq'), lambda: rje_seq.SeqList),
                (('mitab', 'rje_mitab'), lambda: rje_mitab.MITAB),
                (('dbase', 'database'), lambda: rje_dbase.DatabaseController),
                (('pydocs',), lambda: rje_pydocs.PyDoc),
                (('ensembl', 'rje_ensembl'), lambda: rje_ensembl.EnsEMBL),
                (('genbank', 'rje_genbank'), lambda: rje_genbank.GenBank),
                (('extatic',), lambda: extatic.ExTATIC),
                (('revert',), lambda: revert.REVERT),
                (('fiesta',), lambda: fiesta.FIESTA),
                (('gablam',), lambda: gablam.GABLAM),
                (('gopher',), lambda: gopher.Gopher),
                (('haqesac',), lambda: haqesac.HAQESAC),
                (('multihaq',), lambda: multihaq.MultiHAQ),
                (('pingu',), lambda: pingu.PINGU),
                (('pacbio',), lambda: rje_pacbio.PacBio),
                (('rje_zen', 'zen'), lambda: rje_zen.Zen),
            )
            for (prognames, getclass) in progtable:
                if prog in prognames:
                    self.obj['Prog'] = getclass()(self.log, progcmd)
                    break
        ### ~ [2] ~ Failure to recognise program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.obj['Prog']:
            return self.obj['Prog']  # Setup successful
        self.printLog('#ERR', 'Program "%s" not recognised.' % self.getStr('Name'))
        if self.i() < 0:
            return False  # Not interactive: cannot ask for another name
        if rje.yesNo('Show SeqSuite help with program options?'):
            extracmd = cmdHelp(cmd_list=['help'])[1:]
            if extracmd:
                self.cmd_list += extracmd
                self._cmdList()
                # Extra commands may have changed the program name: if so, try again straight away
                if prog != self.getStrLC('Name'):
                    return self.setup()
        self.setStr({'Name': rje.choice('Give program name (Blank or CTRL+C to quit)')})
        if self.getStrLC('Name'):
            return self.setup()
        return False
    except KeyboardInterrupt:
        return False
    except SystemExit:
        raise
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False  # Setup failed
def addRealUniProt(self, seq, udata, ftlist):  ### Updates feature list ft using real UniProt where possible and makes NR
    '''
    Updates feature list ft using real UniProt where possible and makes NR.

    If self.list['UniReal'] is not empty, UniProt entries are read for the sequence's short name, ID and AccNum.
    Provided 'uniprot' is in self.list['UniFake'], each entry then contributes:
    - Data for every key listed in self.list['UniReal'] (real data is placed in front of any existing udata).
    - Features whose sequence matches the same positions in seq. If self.opt['FudgeFT'] is set, a non-matching
      feature is shifted left/right by the smallest offset that gives an exact sequence match (and dropped if
      there is none).
    Finally, exact duplicate features are removed from ftlist, keeping the last copy of each. This step is always
    performed, even when self.list['UniReal'] is empty and no UniProt data is read.
    >> seq:Sequence object = target of UniFake
    >> udata:UniProt Entry Data dictionary *Modified in place*
    >> ftlist:list of feature dictionaries to add to (and make NR) *Modified in place*
    << None. Errors are reported through self.errorLog() and not raised.
    '''
    try:
        ### ~ [1] ~ Map and Add Features from actual UniProt entry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.list['UniReal']:
            realuni = rje_uniprot.UniProt(self.log, self.cmd_list + ['datout=None'])
            realuni.readUniProt(clear=True, acclist=[seq.shortName(), seq.info['ID'], seq.info['AccNum']],
                                cleardata=False)
            if not realuni.list['Entry']:
                self.printLog('#UNI', 'No Real AccNum for %s' % seq.shortName())
            sequence = seq.info['Sequence'][0:]
            seqlen = len(sequence)
            for entry in realuni.list['Entry']:
                if 'uniprot' not in self.list['UniFake']:
                    break
                ## ~ [1a] ~ Entry data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                realdata = entry.dict['Data']
                for key in self.list['UniReal']:  # ['AC','GN','RC','RX','CC','DR','PE','KW']:
                    if key not in realdata:
                        continue
                    if key in udata:
                        udata[key] = realdata[key] + udata[key]
                    else:
                        udata[key] = realdata[key][0:]
                ## ~ [1b] ~ Entry features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for ft in entry.list['Feature'][0:]:
                    ft_start = ft['Start']
                    ft_end = ft['End']
                    ft_seq = entry.obj['Sequence'].info['Sequence'][ft_start - 1:ft_end]
                    if ft_seq == sequence[ft_start - 1:ft_end]:  # Same sequence at same position: use as is
                        ftlist.append(ft)
                        continue
                    if not self.opt['FudgeFT']:
                        continue
                    # Try increasing offsets, upstream then downstream, until a match or both run off the ends.
                    # fudge ends as the signed offset to apply, or 0 if no match was found.
                    fudge = 1
                    while fudge:
                        if ft_start - fudge < 1 and ft_end + fudge > seqlen:
                            fudge = 0
                            break
                        if ft_start - fudge >= 1 and ft_seq == sequence[ft_start - 1 - fudge:ft_end - fudge]:
                            fudge = -fudge
                            break
                        if ft_end + fudge <= seqlen and ft_seq == sequence[ft_start - 1 + fudge:ft_end + fudge]:
                            break
                        fudge += 1
                    if fudge:
                        ft['Start'] = ft_start + fudge
                        ft['End'] = ft_end + fudge
                        ftlist.append(ft)
        ### ~ [2] ~ Make FT list NR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # A feature is dropped if an equal one occurs later, i.e. the last copy is kept. Slice assignment keeps the
        # caller's list object. Features are dictionaries (unhashable), so a set cannot be used here.
        ftlist[:] = [ft for (pos, ft) in enumerate(ftlist) if ft not in ftlist[pos + 1:]]
    except:
        self.errorLog('UniFake.addRealUniProt error [%s]' % seq.shortName())
def uniFake(self, seqs=None, store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
    '''
    Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.

    For each sequence, features are gathered from self.dict['Features'] and, depending on the contents of
    self.list['UniFake'], from disorder prediction, a PFam HMM search, TMHMM and SignalP. Real UniProt data is then
    merged in by self.addRealUniProt() and the result converted into a UniProt entry. Temporary files named
    'tmpXXXXX.AccNum*' are made in the working directory and deleted after each sequence.
    >> seqs:list [None] = Sequence objects to process. If given (and not empty) this also replaces seqlist.seq.
    >> store:bool [False] = If True, entries are kept in the UniProt object (also put in seqlist.obj['UniProt'])
        and saved to DatOut in one go at the end. If False, each entry is appended to DatOut once made.
    << None. Errors are reported through errorLog() and not raised; a failure for one sequence does not stop
        processing of the rest.
    '''
    ens_regex = r'\[acc:(\S+) pep:(\S+) gene:(\S+)\]'   # EnsEMBL-style details within sequence name
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        unifake = ' '.join(self.list['UniFake']).lower().split()    # Lower case list of analyses to perform
        # NOTE(review): the 'disorder' tests below use the raw self.list['UniFake'] rather than this lower case
        # version, so may be case-sensitive where 'pfam', 'tmhmm' and 'signalp' are not - confirm if intended.
        seqlist = self.obj['SeqList']
        if seqs:
            seqlist.seq = seqs
        else:
            seqs = seqlist.seq
        (sx, seqnum) = (0, seqlist.seqNum())
        ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniprot = rje_uniprot.UniProt(self.log, self.cmd_list)  # UniProt object for saving data
        if self.info['DatOut'].lower() in ['', 'none']:
            self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
        datfile = self.info['DatOut']
        if os.path.exists(datfile):
            rje.backup(self, datfile)
        if store:
            seqlist.obj['UniProt'] = uniprot
        ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'pfam' in unifake:
            hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T'])
            hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
            if os.path.exists(hmmfile):
                rje.backup(self, hmmfile)
            hmm.list['HMM'] = [self.info['PFam']]
            hmm.opt['HMMPFam'] = True
        else:
            hmm = None
        ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'signalp' in unifake:
            tm = rje_tm.TM(self.log, self.cmd_list)
        else:
            tm = None
        ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in seqs:
            sx += 1
            name = seq.shortName()
            self.printLog('#SEQ', 'Processing %s (%s aa) %s...'
                          % (seq.shortName(), rje.integerString(seq.aaLen()), seq.info['Description'][:50]))
            utmp = None  # Temp file prefix. Stays None if processing fails before it is made (see cleanup).
            try:
                ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                utmp = 'tmp%s.%s' % (rje.randomString(5), seq.info['AccNum'])
                fas = open('%s.fas' % utmp, 'w')
                try:
                    fas.write('>%s\n%s\n' % (seq.shortName(), seq.info['Sequence']))
                finally:
                    fas.close()  # Must be closed (flushed) before external programs read it
                udata = {'CC': ['-!- Features generated using unifake.py'], 'AC': []}
                if seq.info['SpecCode'] in ['Unknown', 'UNK']:
                    seq.info['SpecCode'] = self.info['SPCode']
                #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]
                #!# Check how well this works. Add spectable? #!#
                ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                details = None
                if self.opt['EnsDat']:
                    details = rje.matchExp(ens_regex, seq.info['Name'])
                if details:  # (acc, pep, gene)
                    self.addAlias(seq.info['AccNum'], details[0])
                    self.addAlias(seq.info['AccNum'], details[1])
                    self.addAlias(seq.info['AccNum'], details[2])
                    udata['GN'] = [details[2]]
                for seqid in [seq.shortName(), seq.info['AccNum']]:
                    if seqid in self.dict['Aliases']:
                        udata['AC'].append('%s;' % '; '.join(self.dict['Aliases'][seqid]))
                ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ft = []  # List of features for sequence
                for seqid in [seq.shortName(), seq.info['AccNum'], seq.info['ID']]:
                    if seqid in self.dict['Features']:
                        ft += self.dict['Features'][seqid]
                ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'disorder' in self.list['UniFake']:
                    try:
                        seq.disorder()
                        dis = seq.obj['Disorder']
                        iupred = dis.info['Disorder'].lower() == 'iupred'   # Adds the score cut-off to Desc
                        for region in dis.list['RegionDisorder']:
                            ft.append({'Type': 'DISORDER',
                                       'Desc': 'Predicted disorder: %s' % dis.info['Disorder'],
                                       'Start': region[0], 'End': region[1]})
                            if iupred:
                                ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'], dis.stat['IUCut'])
                        for region in dis.list['RegionFold']:
                            ft.append({'Type': 'ORDER',
                                       'Desc': 'Predicted order: %s' % dis.info['Disorder'],
                                       'Start': region[0], 'End': region[1]})
                            if iupred:
                                ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'], dis.stat['IUCut'])
                    except:
                        self.log.errorLog('UniFake disorder problem for %s.' % name)
                ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if hmm:
                    try:
                        hmm.setInfo({'SearchDB': '%s.fas' % utmp,
                                     'HMMOut': '%s.hmm.out' % utmp})  # This will be made for each sequence
                        hmm.search = []
                        hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'],
                                                            outfile=hmm.info['HMMOut'])]  # Used in hmmTable
                        hmm.hmmTable(outfile=hmmfile, append=True)
                        if 'disorder' in self.list['UniFake']:
                            disorder = seq.obj['Disorder'].list['ResidueDisorder']  # individual (IUPRed) residue results
                        else:
                            disorder = []
                        if hmm.search:
                            udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)'
                                               % (self.info['PFam'],
                                                  time.ctime(os.path.getmtime(self.info['PFam']))))
                        else:
                            udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                            out = {'Type': '!ERROR!', 'Name': name}
                            rje.delimitedFileOutput(self, hmmfile,
                                                    ['Type', 'Name', 'Start', 'End', 'Eval', 'Score'],
                                                    datadict=out)
                        for search in hmm.search:
                            for hit in search.hit:
                                for aln in hit.aln:
                                    pfamft = {'Start': aln.stat['SbjStart'], 'End': aln.stat['SbjEnd'],
                                              'Type': 'PFAM',
                                              'Desc': '%s PFam HMM Eval: %.2e; Score: %.1f'
                                                      % (search.info['Name'], aln.stat['Expect'],
                                                         aln.stat['BitScore'])}
                                    if disorder:
                                        # Mean per-residue disorder over the hit: below DisDom => 'DOMAIN'
                                        region = disorder[aln.stat['SbjStart'] - 1:aln.stat['SbjEnd']]
                                        hmmdisorder = float(sum(region)) / len(region)
                                        pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'], hmmdisorder)
                                        if hmmdisorder < self.stat['DisDom']:
                                            pfamft['Type'] = 'DOMAIN'
                                    ft.append(pfamft)
                    except:
                        self.log.errorLog('UniFake PFam HMM problem for %s.' % name)
                ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'tmhmm' in unifake:
                    try:
                        # NOTE(review): shell command built from the TMHMM setting and sequence AccNum - these
                        # are assumed to be trusted and shell-safe.
                        pipe = os.popen('%s %s.fas -short' % (self.info['TMHMM'], utmp))
                        try:
                            tmdat = pipe.readlines()
                        finally:
                            pipe.close()
                        domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                        for tmdom in domlist:
                            ft.append(tmdom)
                            ft[-1]['Desc'] = 'TMHMM topology prediction'
                            ft[-1]['Start'] = int(ft[-1]['Start'])
                            ft[-1]['End'] = int(ft[-1]['End'])
                        if len(domlist) > 1:
                            udata['CC'].append('TMHMM: %d TM domains; N-Term %s'
                                               % ((len(domlist) - 1) // 2, domlist[0]['Type']))
                        else:
                            udata['CC'].append('TMHMM: 0 TM domains')
                    except:
                        self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'signalp' in unifake:
                    try:
                        # NOTE(review): shell command built from settings, as for TMHMM above.
                        os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'], utmp, utmp))
                        tm.signalp = {}
                        tm.parseSignalP('%s.signalp' % utmp)
                        sigp = tm.signalp.pop(seq.shortName())
                        cpos = 0  # Predicted cleavage position: 0 = no signal peptide predicted
                        if sigp['nn_ymax?'] == 'Y':
                            cpos = int(sigp['nn_ymaxpos'])
                            desc = 'SignalP NN prediction'
                        if sigp['hmm_cmax?'] == 'Y':
                            hmm_c = int(sigp['hmm_cmaxpos'])
                            if cpos == 0:
                                cpos = hmm_c
                                desc = 'SignalP HMM prediction'
                            elif hmm_c < cpos:  # Both methods predict: use the shorter signal peptide
                                cpos = hmm_c
                                desc = 'SignalP HMM prediction (NN also Y)'
                            else:
                                desc += ' (HMM also Y)'
                        if cpos > 0:
                            ft.append({'Type': 'SIGNALP', 'Desc': desc, 'Start': 1, 'End': cpos})
                    except:
                        self.log.errorLog('UniFake SignalP problem for %s.' % name)
                ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.addRealUniProt(seq, udata, ft)
                self.deBug(ft)
                if not store:
                    uniprot.list['Entry'] = []  # Only the current entry is held when saving as we go
                if uniprot.addFromSeq(seq, data=udata, ft=ft):  ### Converts into UniProtEntry object
                    if not store:
                        uniprot.saveUniProt(datfile, append=True)
                    #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
            except:
                self.log.errorLog('Problem during UniFake(%s)' % name)
            ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if utmp:
                for tmp in glob.glob('%s*' % utmp):
                    os.unlink(tmp)
            self.printLog('#UNIFAKE', '|---------- %s run <<<|>>> %s to go -----------|'
                          % (rje.integerString(sx), rje.integerString(seqnum - sx)), log=False)
        ### ~ [3] ~ Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if store:
            uniprot.saveUniProt(datfile, append=False)
        if self.opt['CleanUp']:
            for tmp in glob.glob('TMHMM*'):
                if os.path.isdir(tmp):
                    os.rmdir(tmp)
    except:
        self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
def setup(self):  ### Loads data into attributes.
    '''
    Loads data into attributes.

    - self.obj['UniProt'] is made and read. If it has entries, self.dict['Entry'] and self.dict['Seq'] are filled
      per sequence short name, and self.dict['Domain'] maps the first word of each feature description to the
      sequences having it, for feature types listed in self.list['DomFT']. Otherwise, sequences are loaded from a
      SeqList and self.dict['Entry'] values are None.
    - self.dict['PPI'] and self.dict['Gene'] are filled from the delimited file self.info['PPI'], using its
      'EnsLoci' and (comma-separated) 'PPI' fields.
    - self.dict['DDI'] is reset and, if self.info['DDI'] is set, filled with reciprocal 'Name1'/'Name2' pairs.
    - self.dict['Fam'] is reset and, if self.info['Fam'] is set, filled with the 'Hit' list for each sequence
      (matched by short name or AccNum), always including the sequence itself.
    << None. Errors are reported through self.errorLog() with the option to quit.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ UniProt Object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniprot = self.obj['UniProt'] = rje_uniprot.UniProt(self.log, self.cmd_list)
        uniprot.readUniProt()
        if uniprot.entryNum() > 0:  ### UniProt data loaded. Populate seqlist and domain dictionary.
            seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=F'])
            for entry in uniprot.list['Entry']:
                seq = entry.obj['Sequence']
                seqlist.seq.append(seq)
                name = seq.shortName()
                self.dict['Entry'][name] = entry
                self.dict['Seq'][name] = seq
                for ft in entry.list['Feature']:
                    if ft['Type'] not in self.list['DomFT']:
                        continue
                    try:
                        dom = ft['Desc'].split()[0]     # Domain name = first word of feature description
                        if dom not in self.dict['Domain']:
                            self.dict['Domain'][dom] = []
                        if name not in self.dict['Domain'][dom]:
                            self.dict['Domain'][dom].append(name)
                    except:
                        self.errorLog('Trouble with %s feature %s' % (name, ft))
        ## ~ [1b] ~ SeqList only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            seqlist = rje_seq.SeqList(self.log, self.cmd_list)
            for seq in seqlist.seq:
                name = seq.shortName()
                self.dict['Entry'][name] = None
                self.dict['Seq'][name] = seq
            #!# Consider adding loading domains from a table #!#
        ## ~ [1c] ~ Add PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Bare lookup only: raises KeyError (caught below) if 'PPI' has not been set up before this method runs.
        # NOTE(review): unlike 'DDI' and 'Fam', this is not reset here - possibly meant as an assignment; confirm.
        self.dict['PPI']  # Dictionary of ShortName-centred
        ppi = rje.dataDict(self, self.info['PPI'])
        for hub in ppi:
            ens = ppi[hub]['EnsLoci']
            if ens == '-':
                continue
            if ens not in self.dict['PPI']:
                self.dict['PPI'][ens] = []
            self.dict['Gene'][ens] = hub
            for gene in ppi[hub]['PPI'].split(','):
                # NOTE(review): a partner without its own row in the PPI data raises KeyError here, which
                # ends setup via the handler below - confirm that this is the desired behaviour.
                partner = ppi[gene]['EnsLoci']
                if partner == '-':
                    continue
                if partner not in self.dict['PPI'][ens]:
                    self.dict['PPI'][ens].append(partner)
        ## ~ [1d] ~ Add DDI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['DDI'] = {}
        if self.info['DDI'].lower() not in ['', 'none']:
            data = rje.dataDict(self, self.info['DDI'], mainkeys=['Name1'], datakeys=['Name2'],
                                headers=['Pfam1', 'Pfam2', 'Name1', 'Name2', 'Acc1', 'Acc2', 'Code1', 'Code2'],
                                lists=True)
            ## ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
            (dx, dtot) = (0.0, len(data))
            self.deBug(data)
            try:
                rje.sortKeys(data)
            except:
                self.errorLog('F**k', quitchoice=True)
            for p1 in rje.sortKeys(data):
                self.progLog('\r#DDI', 'Parsing DDI from iPFam: %.1f%%' % (dx / dtot))
                dx += 100.0     # Percentage numerator: without this the progress is stuck at 0.0%
                if p1 not in self.dict['DDI']:
                    self.dict['DDI'][p1] = []
                for p2 in data[p1]['Name2']:
                    # Interactions are stored in both directions
                    if p2 not in self.dict['DDI']:
                        self.dict['DDI'][p2] = []
                    if p2 not in self.dict['DDI'][p1]:
                        self.dict['DDI'][p1].append(p2)
                    if p1 not in self.dict['DDI'][p2]:
                        self.dict['DDI'][p2].append(p1)
            self.printLog('\r#DDI', 'Parsing DDI from iPFam: %s domains' % (rje.integerString(dtot)))
        ## ~ [1e] ~ Family data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['Fam'] = {}
        if self.info['Fam'].lower() not in ['', 'none']:
            data = rje.dataDict(self, self.info['Fam'], mainkeys=['Qry'], datakeys=['Hit'], lists=True)
            for qry in self.dict['Seq']:
                self.dict['Fam'][qry] = []
                acc = self.dict['Seq'][qry].info['AccNum']
                if qry in data:
                    self.dict['Fam'][qry] = data[qry]['Hit']
                elif acc in data:
                    self.dict['Fam'][qry] = data[acc]['Hit']
                if qry not in self.dict['Fam'][qry]:    # A sequence is always in its own family
                    self.dict['Fam'][qry].append(qry)
    except:
        self.errorLog('Problem with SLiMPID.setup()', quitchoice=True)
def readPELM(self):  ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
    '''
    Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.

    The delimited file self.info['PELM'] is read with 'acc' and 'position' as keys. For each site,
    self.dict['PhosphoSites'][acc][pos]['aa'] is set from the 'code' field, with warnings logged if the sequence,
    the site residue or the residue at that position of the 'sequence' field are inconsistent.
    One sequence per accession is then added to a new SeqList, named using the UniProt entry for the accession
    (minus any '-' suffix) where this can be found and 'acc_UNK__acc' otherwise. Sequences are optionally filtered
    (self.opt['FilterSeq']) and saved to self.info['PELMFas'].
    Sets self.obj['SeqList'] and self.obj['UniProt'].
    << None. Errors are reported through errorLog() and not raised.
    '''
    try:
        ### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        data = rje.dataDict(self, self.info['PELM'], mainkeys=['acc', 'position'])
        seqdict = {}  # Dictionary of Acc:Sequence
        ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pdict = self.dict['PhosphoSites']
        for dkey in data:
            row = data[dkey]
            ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (acc, pos) = dkey.split()
            pos = int(pos)
            site = pdict.setdefault(acc, {}).setdefault(pos, {})
            ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            rowseq = row['sequence']
            if acc not in seqdict:
                seqdict[acc] = rowseq
            elif seqdict[acc] != rowseq:
                self.log.printLog('#ERR', 'Warning. Sequence mismatch for %s' % acc)
            code = row['code']
            if 'aa' not in site:
                site['aa'] = code
            elif site['aa'] != code:
                self.log.printLog('#ERR', 'Warning. PhosphoSite mismatch for %s at pos %d: %s not %s'
                                  % (acc, pos, code, site['aa']))
            seqaa = seqdict[acc][pos - 1:pos]   # Residue at (1-based) site position in first sequence read for acc
            if code != seqaa:
                self.log.printLog('#ERR', 'Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s'
                                  % (acc, pos, code, seqaa))
        ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        acclist = rje.sortKeys(seqdict)
        pelmuni = rje_uniprot.UniProt(self.log, self.cmd_list)  # UniProt entry
        unidict = pelmuni.accDict(acclist)  # Dictionary of {acc:UniProtEntry}
        pelmseq = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=None'])  # SeqList object
        ## ~ [3b] Add one sequence for each AccNum and update seqdict ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
        for acc in acclist:  #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
            sequence = seqdict[acc]
            try:
                uniseq = unidict[acc.split('-')[0]].obj['Sequence']
                name = '%s__%s %s' % (uniseq.info['ID'], acc, uniseq.info['Description'])
                if sequence != uniseq.info['Sequence']:
                    self.log.printLog('#WARNING', 'Sequence mismatch for UniProt entry %s' % acc)
            except:
                self.log.errorLog('Problem with %s' % acc)
                name = '%s_UNK__%s' % (acc, acc)  #!# Add sequences where UniProt missing #!#
            seqdict[acc] = pelmseq._addSeq(name, sequence)  # seqdict value changes from sequence string here
        ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.opt['FilterSeq']:
            pelmseq.autoFilter()
            for acc in acclist:
                if seqdict[acc] not in pelmseq.seq:  # Removed by filter
                    del seqdict[acc]
            acclist = rje.sortKeys(seqdict)
        ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        fasfile = self.info['PELMFas']
        # Only ask about overwriting when the file exists and the run is interactive
        savefas = (not os.path.exists(fasfile)
                   or self.stat['Interactive'] < 0
                   or rje.yesNo('%s exists: overwrite?' % fasfile))
        if savefas:
            pelmseq.saveFasta(seqfile=fasfile)
        self.obj['SeqList'] = pelmseq
        self.obj['UniProt'] = pelmuni
    except:
        self.log.errorLog('Problem during PhosphoSeq.readPELM')