def makePPI(self):  ### Generates files for Human-HIV PPI analysis
    '''
    Generates fasta files for Human-HIV PPI analysis.

    Loads sequences from the 'HIVSeq' file and walks the 'AccHIV' index of the 'HHPIDMap' table. For each HIV
    accession, a directory named after the upper-case HIV gene is made and every indexed entry that has an existing
    '<PPIDir><Symbol>.ppi.fas' file gets a '<GENE>/<gene>.<Symbol>.ppi.fas' copy that starts with the HIV sequence.
    Entries without a human PPI file are logged as errors and skipped.
    << False if no sequences are loaded or an exception is logged; None otherwise.
    '''
    try:
        ### ~ [1] Setup ~ ###
        seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%s' % self.getStr('HIVSeq'), 'autoload=T'])
        if not seqlist.seqs(): return False
        seqmap = seqlist.seqNameDic('Max')
        mdb = self.db('HHPIDMap')
        ### ~ [2] Process ~ ###
        for hivacc in mdb.index('AccHIV'):
            # Only the text before the first '.' of the table accession is used as the sequence dictionary key
            accnum = hivacc.split('.')[0]
            hivseq = seqmap[accnum]
            # Short HIV name = text before the first '_' of the sequence short name
            hivgene = hivseq.shortName().split('_')[0].upper()
            rje.mkDir(self, '%s/' % hivgene, log=True)
            etot = len(mdb.index('AccHIV')[hivacc])
            hivname = hivgene   # Replaced by the table 'HIV' value of the last entry processed
            for entry in mdb.indexEntries('AccHIV', hivacc):
                hivname = entry['HIV']
                self.progLog('\r#PPI', 'Generating human-HIV PPI fasta files for %s %s PPI' % (rje.iStr(etot), hivgene))
                pfile = self.getStr('PPIDir') + entry['Symbol'] + '.ppi.fas'
                if not rje.exists(pfile):
                    self.errorLog('Cannot find human PPI file for %s interactor "%s"' % (entry['HIV'], entry['Symbol']), printerror=False)
                    continue
                # Read the human file first so that both handles are always closed, even on error
                with open(pfile, 'r') as PPI: humanfas = PPI.read()
                outfile = '%s/%s.%s.ppi.fas' % (hivgene, hivgene.lower(), entry['Symbol'])
                with open(outfile, 'w') as FAS:
                    FAS.write('>%s\n%s\n' % (hivseq.info['Name'], hivseq.getSequence()))
                    FAS.write(humanfas)
            self.printLog('\r#PPI', 'Generated human-HIV PPI fasta files for %s %s (%s) PPI.' % (rje.iStr(etot), hivname, hivgene))
    except:
        self.errorLog('%s.makePPI error' % self)
        return False
def save(self):  ### Saves parsed REST output to files
    '''
    Saves parsed REST output to files.

    The lower-case 'Rest' setting selects what is written:
    - a key of self.dict['Output']: only that output is saved to its self.dict['Outfile'] file;
    - 'full' or 'text': self.restFullOutput() is saved to '<RestOutDir><RestBase>.rest';
    - any other non-empty value: logged as unrecognised, then the full output is saved if the user agrees;
    - empty: every parsed output with a known outfile is saved (after backing up the existing file).
    << True when the full output is written, False when the fallback is declined or self.i() < 0,
       None after saving individual outputs.
    '''
    rbase = '%s%s' % (self.getStr('RestOutDir'), rje.baseFile(self.getStr('RestBase'), strip_path=True, keepext=True))
    rje.mkDir(self, self.getStr('RestOutDir'))
    outputs = rje.sortKeys(self.dict['Output'])
    rest = self.getStrLC('Rest')
    if rest in outputs: outputs = [rest]
    elif rest:
        # Every remaining non-empty format either is, or falls back to, the full REST output
        label = rest
        if rest not in ['full', 'text']:
            self.printLog('#OUTFMT', 'REST output format "%s" not recognised.' % rest)
            if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'): return False
            label = 'full'
        outfile = '%s.rest' % rbase
        with open(outfile, 'w') as OUT: OUT.write(self.restFullOutput())
        self.printLog('#OUT', '%s: %s' % (label, outfile))
        return True
    for rkey in outputs:
        if rkey in self.dict['Outfile']:
            outfile = self.dict['Outfile'][rkey]
            rje.backup(self, outfile)
            with open(outfile, 'w') as OUT: OUT.write(self.dict['Output'][rkey])
            self.printLog('#OUT', '%s: %s' % (rkey, outfile))
        elif rkey not in ['intro']: self.warnLog('No outfile parsed/generated for %s output' % rkey)
def ppi(self):  ### Remaining protein-protein interactions
    '''
    Outputs the remaining protein-protein interactions, one accession list per hub.

    For every hub in self.dict['PPI'], the accession numbers of its interactors found in self.dict['Seq'] are
    written (newline-separated) to 'SLiMPID_PPI/<gene>.ppi.acc'. Empty names are ignored; names missing from
    self.dict['Seq'] are collected and reported once as "bad" protein names.
    << None. Does nothing if self.dict['PPI'] is empty. Errors are passed to self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~ ###
        if not self.dict['PPI']: return
        outdir = 'SLiMPID_PPI'
        rje.mkDir(self, outdir)
        badname = set()     # Set gives constant-time duplicate checks; sorted for output below
        ### ~ [2] Process ~ ###
        for hub in rje.sortKeys(self.dict['PPI']):
            gene = self.dict['Gene'][hub]
            acc = []
            for name in self.dict['PPI'][hub]:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                else: badname.add(name)
            with open('%s/%s.ppi.acc' % (outdir, gene), 'w') as ACC: ACC.write('\n'.join(acc))
            self.printLog('#PPI', '%s => %d individual interactors' % (gene, len(acc)))
        if badname:
            badlist = sorted(badname)
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badlist), '; '.join(badlist)))
    except: self.errorLog('Problem with SLiMPID.ppi()', quitchoice=True)
def complexFasta(self):  ### Outputs parsed complex datasets in Fasta format
    '''
    Writes one fasta file per parsed complex.

    Files are named '<OutDir>HPRD_Complexes/<complex>_hprd.fas' and hold the 'Seq' data of each complex member
    taken from self.dict['HPRD']. With the 'AllIso' option the 'Seq' value is added as a list of sequences,
    otherwise as a single item. Complexes yielding no sequences are not saved.
    Errors are logged and then re-raised.
    '''
    try:
        ### Setup ###
        outpath = self.info['OutDir'] + rje.makePath('HPRD_Complexes/')
        rje.mkDir(self, outpath)
        ### Output complex datasets ###
        for cname in rje.sortKeys(self.dict['Complex']):
            members = []
            for hid in self.dict['Complex'][cname]:
                if not self.opt['AllIso']:
                    members.append(self.dict['HPRD'][hid]['Seq'])
                    continue
                members += self.dict['HPRD'][hid]['Seq']
            fasfile = '%s%s_hprd.fas' % (outpath, cname)
            if not members: continue
            self.obj['SeqList'].saveFasta(seqs=members, seqfile=fasfile)
        self.log.printLog('#FAS', 'HPRD complex fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.complexFasta()', printerror=True, quitchoice=False)
        raise
def saveFasta(self):  ### Outputs parsed PPI datasets in Fasta format
    '''
    Writes the full sequence list plus one fasta file per PPI hub.

    Hub files are named '<OutDir>HPRD_Datasets/<gene>_hprd.fas' and hold the 'Seq' data of every interactor
    listed for the hub in self.dict['PPI']. With the 'AllIso' option the 'Seq' value is added as a list,
    otherwise as a single item. Hubs without a 'Seq' entry are printed and sent to self.deBug() first.
    Errors are logged through self.log.errorLog() and not re-raised.
    '''
    try:
        ### Setup ###
        outpath = self.info['OutDir'] + rje.makePath('HPRD_Datasets/')
        rje.mkDir(self, outpath)
        ## Check Seqs ##
        for hub in rje.sortKeys(self.dict['PPI']):
            hubdata = self.dict['HPRD'][hub]
            if 'Seq' in hubdata: continue
            print('%s %s' % (hub, hubdata))     #!# KeyError #!#
            self.deBug('No Seq for %s' % hub)
        ### All sequences ###
        self.obj['SeqList'].saveFasta()
        ### Output PPI Datasets ###
        for hub in rje.sortKeys(self.dict['PPI']):
            partners = []
            for spoke in self.dict['PPI'][hub]:
                if not self.opt['AllIso']:
                    partners.append(self.dict['HPRD'][spoke]['Seq'])
                    continue
                partners += self.dict['HPRD'][spoke]['Seq']
            fasfile = '%s%s_hprd.fas' % (outpath, self.dict['HPRD'][hub]['gene'])
            if not partners: continue
            self.obj['SeqList'].saveFasta(seqs=partners, seqfile=fasfile)
        self.log.printLog('#FAS', 'HPRD PPI fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.saveFasta()', printerror=True, quitchoice=False)
def makePPIDatasets(self):  ### Generate PPI datasets from pairwise data
    '''
    Generates one fasta file per PPI hub from the pairwise data.

    A 'YeastPPI/<hub>.fas' file is saved for each hub in self.dict['PPI'] that has at least three spokes, of
    which at least three are present in self.dict['SeqDict']. Errors are logged and then re-raised.
    '''
    try:
        ### ~ [1] Setup ~ ###
        rje.mkDir(self, 'YeastPPI/')
        seqdict = self.dict['SeqDict']
        ### ~ [2] Parse data ~ ###
        progress = 0.0
        hubtot = len(self.dict['PPI'])
        filex = 0
        for hub in rje.sortKeys(self.dict['PPI']):
            self.progLog('\r#FAS', 'Generating %s PPI fasta files: %.2f' % (rje.integerString(filex), progress / hubtot))
            progress += 100.0
            if len(self.dict['PPI'][hub]) < 3: continue
            hubseqs = [seqdict[spoke] for spoke in self.dict['PPI'][hub] if spoke in seqdict]
            if len(hubseqs) < 3: continue
            fasfile = rje.makePath('YeastPPI/%s.fas' % hub, wholepath=True)
            self.obj['SeqList'].saveFasta(hubseqs, fasfile, log=False)
            filex += 1
        self.printLog('\r#FAS', 'Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(filex), rje.integerString(hubtot)))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def dpi(self):  ### Domain-protein interactions
    '''
    Extracts domain-protein interactions (DPI) and removes them from the PPI data.

    For each domain in self.dict['Domain'], the interactors of all its hubs are pooled and only those seen at
    least twice in the pool are kept. These spokes are removed from the self.dict['PPI'] lists of the domain hubs
    (and the hub from the spoke's own list) and their accession numbers are written to
    'SLiMPID_DPI/<domain>.dpi.acc'. Hubs left with no interactors are then dropped from self.dict['PPI'].
    << None. Does nothing if self.dict['Domain'] is empty. Errors are passed to self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~ ###
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_DPI'
        rje.mkDir(self, outdir)
        badname = set()
        ### ~ [2] Process ~ ###
        for dom in rje.sortKeys(self.dict['Domain']):
            pooled = []
            for hub in self.dict['Domain'][dom]:
                if hub in self.dict['PPI']: pooled += self.dict['PPI'][hub]    # Add with redundancy
            # Must have 2+ domain interactions: single pass count instead of repeated list.count()/remove()
            counts = {}
            for spoke in pooled: counts[spoke] = counts.get(spoke, 0) + 1
            domspokes = [spoke for spoke in pooled if counts[spoke] > 1]
            shared = set(domspokes)
            for hub in self.dict['Domain'][dom]:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:    # Iterate over a copy: the list is edited in the loop
                    if spoke not in shared: continue
                    self.dict['PPI'][hub].remove(spoke)
                    if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
            domspokes = rje.sortUnique(domspokes, False, False)
            acc = []
            for name in domspokes:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                else: badname.add(name)
            with open('%s/%s.dpi.acc' % (outdir, dom), 'w') as ACC: ACC.write('\n'.join(acc))
            self.printLog('#DPI', '%s domain => %d interactors' % (dom, len(acc)))
        if badname:
            badlist = sorted(badname)
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badlist), '; '.join(badlist)))
        ### ~ [3] Cleanup ~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)
            self.printLog('#DPI', 'No %s PPI left after DPI removed' % hub, screen=False)
        self.printLog('#PPX', '%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])), rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.dpi()', quitchoice=True)
def setup(self): ### Main class setup method. '''Main class setup method.''' try:### ~ [1] Setup SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.obj['SeqList'] = rje_seq.SeqList(self.log,['keepblast=T']+self.cmd_list+['autofilter=F','align=F','haqbat=None']) self.obj['SeqList']._checkForDup(True) if not self.seqNum(): self.errorLog('No sequences loaded!',printerror=False); return False if self.opt['AddQueries'] and self.name() not in self.obj['SeqList'].list['Blast2Fas']: self.obj['SeqList'].list['Blast2Fas'].append(self.name()) ### ~ [2] Setup Results Directory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if self.info['HaqDir'].lower() in ['','none']: self.info['HaqDir'] = '%s_HAQESAC/' % rje.baseFile(self.name(), strip_path=True) rje.mkDir(self,self.info['HaqDir']) return True # Setup successful except: self.errorLog('Problem during %s setup.' % self); return False # Setup failed
def saveReadMe(self, filename='pydocs.txt', append=False):  ### Prints docs for modules to file
    '''
    Prints docs for modules to file.
    >> filename:str = output file name
    >> append:boolean = whether to append to filename rather than overwrite it. When overwriting,
        self.readMeHeader() is written first.

    For each source directory in the 'PyDoc' object's 'SourceDir' list, the docstring of every entry of the
    'Module' table is written if the entry key contains the directory name and the module's .py file exists.
    Errors are passed to self.errorLog(); nothing is returned.
    '''
    try:
        ### ~ [1] Setup ~ ###
        pydoc = self.obj['PyDoc']
        if append:
            self.printLog('#DOC', 'Appending docstrings to %s' % filename)
            PYDOC = open(filename, 'a')
        else:
            rje.mkDir(self, filename)
            self.printLog('#DOC', 'Writing docstrings to %s' % filename)
            PYDOC = open(filename, 'w')
        try:    # Guarantees the output handle is closed even if a write or table lookup fails
            if not append: PYDOC.write(self.readMeHeader())
            db = self.db('Module')
            dx = 0
            ### ~ [2] Output Docstrings ~ ###
            for sourcedir in pydoc.list['SourceDir']:
                PYDOC.write('-%s:\n\n' % sourcedir)
                for pyfile in db.dataKeys():
                    entry = db.data(pyfile)
                    module = entry['Module']
                    if sourcedir not in pyfile: continue
                    if not os.path.exists('%s%s%s.py' % (pydoc.getStr('PyPath'), rje.makePath(sourcedir), module)): continue
                    ## ~ [2a] ~ Module docstring ~ ##
                    mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module, pyfile)
                    # Pad the banner with '~' just inside each end until it is at least 122 characters long
                    while len(mtxt) < 122: mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]
                    try:
                        PYDOC.write('%s\n\n%s\n' % (mtxt, entry['DocString']))
                        dx += 1
                    except:
                        self.errorLog('Cannot write DocString for %s' % module, printerror=False)
                        PYDOC.write('%s\n\nDocString Error!\n' % (mtxt))
                        dx += 1
                    # NOTE(review): spacer placed per module; original nesting was ambiguous - confirm
                    PYDOC.write('\n\n\n')
        finally: PYDOC.close()
        self.printLog('#DOC', 'Output to %s complete: %s modules.' % (filename, rje.iStr(dx)))
    except: self.errorLog('Error in %s.saveReadMe()' % self.prog())
def setupSourceData(self):  ### Main class setup method. #V0.0
    '''
    Setup and optionally download source data.

    Adds an empty 'Source' table to the 'DB' object, makes the 'SourcePath' directory and then checks the
    'SpecFile', 'TaxMap' and 'NameMap' sources in turn with self.sourceDataFile().
    << True if all three sources are OK; False (after logging the error) otherwise.
    '''
    try:
        ### ~ [0] Setup Source Data ~ ###
        db = self.obj['DB']
        db.addEmptyTable('Source', ['Name', 'File', 'Status', 'Entries', 'URL'], keys=['Name'], log=False)   # Store Source info
        rje.mkDir(self, self.getStr('SourcePath'), True)
        self.printLog('#~~#', '## ~~~~~~~~~~~~~~~~~~~~~~ SETUP TAXONOMIC DATA ~~~~~~~~~~~~~~~~~~~~~ ##', timeout=False)
        ### ~ [1] Species codes file, then [2] TaxID mapping and name mapping files ~ ###
        for source in ('SpecFile', 'TaxMap', 'NameMap'):
            # A failed source aborts setup through the shared error handler below
            if not self.sourceDataFile(source): raise IOError
        return True
    except:
        self.errorLog('%s.setupSourceData failure' % self)
        return False
def makePPI(self):  ### Generates files for Human-HIV PPI analysis
    '''
    Generates fasta files for Human-HIV PPI analysis.

    Loads sequences from the 'HIVSeq' file and walks the 'AccHIV' index of the 'HHPIDMap' table. For each HIV
    accession, a directory named after the upper-case HIV gene is made and every indexed entry that has an existing
    '<PPIDir><Symbol>.ppi.fas' file gets a '<GENE>/<gene>.<Symbol>.ppi.fas' copy that starts with the HIV sequence.
    Entries without a human PPI file are logged as errors and skipped.
    << False if no sequences are loaded or an exception is logged; None otherwise.
    '''
    try:
        ### ~ [1] Setup ~ ###
        seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%s' % self.getStr('HIVSeq'), 'autoload=T'])
        if not seqlist.seqs(): return False
        seqmap = seqlist.seqNameDic('Max')
        mdb = self.db('HHPIDMap')
        ### ~ [2] Process ~ ###
        for hivacc in mdb.index('AccHIV'):
            # Only the text before the first '.' of the table accession is used as the sequence dictionary key
            accnum = hivacc.split('.')[0]
            hivseq = seqmap[accnum]
            # Short HIV name = text before the first '_' of the sequence short name
            hivgene = hivseq.shortName().split('_')[0].upper()
            rje.mkDir(self, '%s/' % hivgene, log=True)
            etot = len(mdb.index('AccHIV')[hivacc])
            hivname = hivgene   # Replaced by the table 'HIV' value of the last entry processed
            for entry in mdb.indexEntries('AccHIV', hivacc):
                hivname = entry['HIV']
                self.progLog('\r#PPI', 'Generating human-HIV PPI fasta files for %s %s PPI' % (rje.iStr(etot), hivgene))
                pfile = self.getStr('PPIDir') + entry['Symbol'] + '.ppi.fas'
                if not rje.exists(pfile):
                    self.errorLog('Cannot find human PPI file for %s interactor "%s"' % (entry['HIV'], entry['Symbol']), printerror=False)
                    continue
                # Read the human file first so that both handles are always closed, even on error
                with open(pfile, 'r') as PPI: humanfas = PPI.read()
                outfile = '%s/%s.%s.ppi.fas' % (hivgene, hivgene.lower(), entry['Symbol'])
                with open(outfile, 'w') as FAS:
                    FAS.write('>%s\n%s\n' % (hivseq.info['Name'], hivseq.getSequence()))
                    FAS.write(humanfas)
            self.printLog('\r#PPI', 'Generated human-HIV PPI fasta files for %s %s (%s) PPI.' % (rje.iStr(etot), hivname, hivgene))
    except:
        self.errorLog('%s.makePPI error' % self)
        return False
def fpi(self):  ### Family-protein interactions
    '''
    Extracts family-protein interactions (FPI) and removes them from the PPI data.

    For each PPI hub whose self.dict['Fam'] family has 2+ members, the interactors of all family members are
    pooled and only those seen at least twice in the pool are kept. These spokes are removed from the
    self.dict['PPI'] lists of the family members (and the member from the spoke's own list) and their accession
    numbers are written to 'SLiMPID_FPI/<gene>.fpi.acc'. Hubs left with no interactors are then dropped from
    self.dict['PPI'].
    << None. Errors are passed to self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~ ###
        # NOTE(review): the guard tests the 'Domain' dictionary although only 'Fam' and 'PPI' are used below - confirm
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_FPI'
        rje.mkDir(self, outdir)
        badname = set()
        ### ~ [2] Process ~ ###
        for qry in rje.sortKeys(self.dict['PPI']):
            try:
                fam = self.dict['Fam'][qry]
                if len(fam) < 2: continue
            except:
                self.errorLog('Problem with "%s" protein family' % qry)
                continue
            pooled = []
            for hub in fam:
                if hub not in self.dict['PPI']: continue
                pooled += self.dict['PPI'][hub]     # Add with redundancy
            # Must have 2+ family interactions: single pass count instead of repeated list.count()/remove()
            counts = {}
            for spoke in pooled: counts[spoke] = counts.get(spoke, 0) + 1
            famspokes = [spoke for spoke in pooled if counts[spoke] > 1]
            shared = set(famspokes)
            for hub in fam:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:    # Iterate over a copy: the list is edited in the loop
                    if spoke not in shared: continue
                    self.dict['PPI'][hub].remove(spoke)
                    if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
            famspokes = rje.sortUnique(famspokes, False, False)
            acc = []
            gene = self.dict['Gene'][qry]
            for name in famspokes:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                else: badname.add(name)
            with open('%s/%s.fpi.acc' % (outdir, gene), 'w') as ACC: ACC.write('\n'.join(acc))
            self.printLog('#FPI', '%s family => %d interactors' % (gene, len(acc)))
        if badname:
            badlist = sorted(badname)
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badlist), '; '.join(badlist)))
        ### ~ [3] Cleanup ~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)
            self.printLog('#FPI', 'No %s PPI left after FPI removed' % hub)
        self.printLog('#PPX', '%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])), rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.fpi()', quitchoice=True)
def domainFasta(self):  ### Outputs parsed domain and domain PPI datasets in Fasta format
    '''
    Outputs parsed domain tables and domain PPI datasets in Fasta format.

    Writes two tab-delimited tables to OutDir: 'HPRD.domains.tdt' (Domain, HPRD, Gene) and 'HPRD.domsource.tdt'
    (Domain, Source). Then, for each domain, the sorted unique interactors of all domain-containing proteins found
    in self.dict['PPI'] are saved to '<OutDir>HPRD_Domain_Datasets/<domain>_hprd.fas'. With the 'AllIso' option the
    'Seq' value of each interactor is added as a list, otherwise as a single item.
    Errors are logged and then re-raised.
    '''
    try:
        ### ~ Tab delimited domain-HPRD pairs ~ ###
        headers = ['Domain', 'HPRD', 'Gene']
        dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
        rje.delimitedFileOutput(self, dfile, headers, '\t')
        sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
        shead = ['Domain', 'Source']
        rje.delimitedFileOutput(self, sfile, shead, '\t')
        domains = rje.sortKeys(self.dict['Domains'])
        dx = 0.0
        for domain in domains:
            self.log.printLog('\r#DOM', 'HPRD Domain output (%s): %.1f%%' % (dfile, dx / len(domains)), newline=False, log=False)
            dx += 100.0
            for hid in self.dict['Domains'][domain]:
                datadict = {'Domain': domain, 'HPRD': hid, 'Gene': self.dict['HPRD'][hid]['gene']}
                rje.delimitedFileOutput(self, dfile, headers, '\t', datadict)
            for source in self.dict['DomainSource'][domain]:
                datadict = {'Domain': domain, 'Source': source}
                rje.delimitedFileOutput(self, sfile, shead, '\t', datadict)
        self.log.printLog('\r#DOM', 'HPRD Domain output (%s): %s domains.' % (dfile, rje.integerString(len(domains))))
        ### ~ Domain PPI Dataset Outputs ~ ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
        rje.mkDir(self, datpath)
        for domain in domains:
            ## Generate a sorted, non-redundant list of all interactors with domain-containing proteins ##
            partners = set()
            for p1 in self.dict['Domains'][domain]:
                if p1 in self.dict['PPI']: partners.update(self.dict['PPI'][p1])
            ## Generate Sequence list and output ##
            mylist = []
            for p in sorted(partners):
                if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']
                else: mylist.append(self.dict['HPRD'][p]['Seq'])
            fasfile = '%s%s_hprd.fas' % (datpath, domain)
            if mylist: self.obj['SeqList'].saveFasta(seqs=mylist, seqfile=fasfile)
            else: self.log.printLog('#DOM', 'No PPI partners for domain "%s"' % domain)
        self.log.printLog('\r#DOM', 'HPRD Domain fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.domainFasta()', printerror=True, quitchoice=False)
        raise
def makePPIDatasets(self):  ### Generate PPI datasets from pairwise data
    '''
    Generates one fasta file per PPI hub from the pairwise data.

    A 'YeastPPI/<hub>.fas' file is saved for each hub in self.dict['PPI'] that has at least three spokes, of
    which at least three are present in self.dict['SeqDict']. Errors are logged and then re-raised.
    '''
    try:
        ### ~ [1] Setup ~ ###
        rje.mkDir(self, 'YeastPPI/')
        seqdict = self.dict['SeqDict']
        ### ~ [2] Parse data ~ ###
        progress = 0.0
        hubtot = len(self.dict['PPI'])
        filex = 0
        for hub in rje.sortKeys(self.dict['PPI']):
            self.progLog('\r#FAS', 'Generating %s PPI fasta files: %.2f' % (rje.integerString(filex), progress / hubtot))
            progress += 100.0
            if len(self.dict['PPI'][hub]) < 3: continue
            hubseqs = [seqdict[spoke] for spoke in self.dict['PPI'][hub] if spoke in seqdict]
            if len(hubseqs) < 3: continue
            fasfile = rje.makePath('YeastPPI/%s.fas' % hub, wholepath=True)
            self.obj['SeqList'].saveFasta(hubseqs, fasfile, log=False)
            filex += 1
        self.printLog('\r#FAS', 'Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(filex), rje.integerString(hubtot)))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def complexFasta(self):  ### Outputs parsed complex datasets in Fasta format
    '''
    Writes one fasta file per parsed complex.

    Files are named '<OutDir>HPRD_Complexes/<complex>_hprd.fas' and hold the 'Seq' data of each complex member
    taken from self.dict['HPRD']. With the 'AllIso' option the 'Seq' value is added as a list of sequences,
    otherwise as a single item. Complexes yielding no sequences are not saved.
    Errors are logged and then re-raised.
    '''
    try:
        ### Setup ###
        outpath = self.info['OutDir'] + rje.makePath('HPRD_Complexes/')
        rje.mkDir(self, outpath)
        ### Output complex datasets ###
        for cname in rje.sortKeys(self.dict['Complex']):
            members = []
            for hid in self.dict['Complex'][cname]:
                if not self.opt['AllIso']:
                    members.append(self.dict['HPRD'][hid]['Seq'])
                    continue
                members += self.dict['HPRD'][hid]['Seq']
            fasfile = '%s%s_hprd.fas' % (outpath, cname)
            if not members: continue
            self.obj['SeqList'].saveFasta(seqs=members, seqfile=fasfile)
        self.log.printLog('#FAS', 'HPRD complex fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.complexFasta()', printerror=True, quitchoice=False)
        raise
def ppi(self):  ### Remaining protein-protein interactions
    '''
    Outputs the remaining protein-protein interactions, one accession list per hub.

    For every hub in self.dict['PPI'], the accession numbers of its interactors found in self.dict['Seq'] are
    written (newline-separated) to 'SLiMPID_PPI/<gene>.ppi.acc'. Empty names are ignored; names missing from
    self.dict['Seq'] are collected and reported once as "bad" protein names.
    << None. Does nothing if self.dict['PPI'] is empty. Errors are passed to self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~ ###
        if not self.dict['PPI']: return
        outdir = 'SLiMPID_PPI'
        rje.mkDir(self, outdir)
        badname = set()     # Set gives constant-time duplicate checks; sorted for output below
        ### ~ [2] Process ~ ###
        for hub in rje.sortKeys(self.dict['PPI']):
            gene = self.dict['Gene'][hub]
            acc = []
            for name in self.dict['PPI'][hub]:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                else: badname.add(name)
            with open('%s/%s.ppi.acc' % (outdir, gene), 'w') as ACC: ACC.write('\n'.join(acc))
            self.printLog('#PPI', '%s => %d individual interactors' % (gene, len(acc)))
        if badname:
            badlist = sorted(badname)
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badlist), '; '.join(badlist)))
    except: self.errorLog('Problem with SLiMPID.ppi()', quitchoice=True)
def dpi(self):  ### Domain-protein interactions
    '''
    Extracts domain-protein interactions (DPI) and removes them from the PPI data.

    For each domain in self.dict['Domain'], the interactors of all its hubs are pooled and only those seen at
    least twice in the pool are kept. These spokes are removed from the self.dict['PPI'] lists of the domain hubs
    (and the hub from the spoke's own list) and their accession numbers are written to
    'SLiMPID_DPI/<domain>.dpi.acc'. Hubs left with no interactors are then dropped from self.dict['PPI'].
    << None. Does nothing if self.dict['Domain'] is empty. Errors are passed to self.errorLog().
    '''
    try:
        ### ~ [1] Setup ~ ###
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_DPI'
        rje.mkDir(self, outdir)
        badname = set()
        ### ~ [2] Process ~ ###
        for dom in rje.sortKeys(self.dict['Domain']):
            pooled = []
            for hub in self.dict['Domain'][dom]:
                if hub in self.dict['PPI']: pooled += self.dict['PPI'][hub]    # Add with redundancy
            # Must have 2+ domain interactions: single pass count instead of repeated list.count()/remove()
            counts = {}
            for spoke in pooled: counts[spoke] = counts.get(spoke, 0) + 1
            domspokes = [spoke for spoke in pooled if counts[spoke] > 1]
            shared = set(domspokes)
            for hub in self.dict['Domain'][dom]:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:    # Iterate over a copy: the list is edited in the loop
                    if spoke not in shared: continue
                    self.dict['PPI'][hub].remove(spoke)
                    if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
            domspokes = rje.sortUnique(domspokes, False, False)
            acc = []
            for name in domspokes:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                else: badname.add(name)
            with open('%s/%s.dpi.acc' % (outdir, dom), 'w') as ACC: ACC.write('\n'.join(acc))
            self.printLog('#DPI', '%s domain => %d interactors' % (dom, len(acc)))
        if badname:
            badlist = sorted(badname)
            self.printLog('#BAD', '%d "bad" protein names: %s' % (len(badlist), '; '.join(badlist)))
        ### ~ [3] Cleanup ~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)
            self.printLog('#DPI', 'No %s PPI left after DPI removed' % hub, screen=False)
        self.printLog('#PPX', '%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])), rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.dpi()', quitchoice=True)
def setup(self): ### Main class setup method. '''Main class setup method.''' try: ### ~ [1] Setup SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.obj['SeqList'] = rje_seq.SeqList( self.log, ['keepblast=T'] + self.cmd_list + ['autofilter=F', 'align=F', 'haqbat=None']) self.obj['SeqList']._checkForDup(True) if not self.seqNum(): self.errorLog('No sequences loaded!', printerror=False) return False if self.opt['AddQueries'] and self.name( ) not in self.obj['SeqList'].list['Blast2Fas']: self.obj['SeqList'].list['Blast2Fas'].append(self.name()) ### ~ [2] Setup Results Directory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if self.info['HaqDir'].lower() in ['', 'none']: self.info['HaqDir'] = '%s_HAQESAC/' % rje.baseFile( self.name(), strip_path=True) rje.mkDir(self, self.info['HaqDir']) return True # Setup successful except: self.errorLog('Problem during %s setup.' % self) return False # Setup failed
def gopher(self):  ### Sets up data for GOPHER run
    '''
    Sets up data for GOPHER run.

    Formats the '<Basefile>.ygob.fas' and '<Basefile>.yeast.fas' BLAST databases, then converts each pillar in
    self.list['Pillars'] into a list of sequence short names. Every ID is mapped to an accession (via
    self.dict['Rename'] where present) and stored in self.dict['PillarMap']. For each pillar member that is in
    self.list['YeastSeq'] - or, if that list is empty, has SpecCode 'YEAST' - the converted pillar is written to
    'BLAST/<acc>.blast.id'. Errors are logged and then re-raised.
    '''
    try:
        ### ~ [1] Setup ~ ###
        rje.mkDir(self, 'BLAST/')
        rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(fasfile='%s.ygob.fas' % self.info['Basefile'], protein=True, force=False)
        rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(fasfile='%s.yeast.fas' % self.info['Basefile'], protein=True, force=False)
        seqdict = self.obj['SeqList'].seqNameDic('AccNum')
        ymap = self.dict['PillarMap'] = {}
        ### ~ [2] Convert Pillars to BLAST IDs ~ ###
        (px, ptot) = (0.0, len(self.list['Pillars']))
        ox = 0
        for pillar in self.list['Pillars']:
            self.progLog('\r#YGOB', 'Converting YGOB Pillars for GOPHER: %.2f%%' % (px / ptot))
            px += 100
            newpillar = []
            for yid in pillar:
                # A temporary Sequence object is used purely to parse the accession out of the ID
                seq = rje_sequence.Sequence(self.log, self.cmd_list)
                seq.opt['Yeast'] = True
                seq.info['Name'] = yid
                seq.extractDetails(gnspacc=True)
                ygob = seq.info['AccNum']
                acc = self.dict['Rename'][ygob] if ygob in self.dict['Rename'] else ygob
                ymap[yid] = acc
                if acc not in seqdict:
                    self.printLog('\r#GENE', 'Non-coding gene %s (%s)? Cannot find in fasta file' % (acc, yid))
                    continue
                try: newpillar.append(seqdict[acc].shortName())
                except:
                    print('%s %s %s' % (yid, ygob, acc))
                    self.errorLog(rje_zen.Zen().wisdom())
            if not newpillar: continue
            idtext = '\n'.join(newpillar)
            for ygob in pillar:
                acc = ymap[ygob]
                if acc not in seqdict: continue
                if acc in self.list['YeastSeq'] or (not self.list['YeastSeq'] and seqdict[acc].info['SpecCode'] == 'YEAST'):
                    with open(rje.makePath('BLAST/%s.blast.id' % acc, wholepath=True), 'w') as IDFILE: IDFILE.write(idtext)
                    ox += 1
        self.progLog('\r#YGOB', 'Converted YGOB Pillars for GOPHER: %s BLAST ID files.' % rje.iStr(ox))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def saveFasta(self):  ### Outputs parsed PPI datasets in Fasta format
    '''
    Writes the full sequence list plus one fasta file per PPI hub.

    Hub files are named '<OutDir>HPRD_Datasets/<gene>_hprd.fas' and hold the 'Seq' data of every interactor
    listed for the hub in self.dict['PPI']. With the 'AllIso' option the 'Seq' value is added as a list,
    otherwise as a single item. Hubs without a 'Seq' entry are printed and sent to self.deBug() first.
    Errors are logged through self.log.errorLog() and not re-raised.
    '''
    try:
        ### Setup ###
        outpath = self.info['OutDir'] + rje.makePath('HPRD_Datasets/')
        rje.mkDir(self, outpath)
        ## Check Seqs ##
        for hub in rje.sortKeys(self.dict['PPI']):
            hubdata = self.dict['HPRD'][hub]
            if 'Seq' in hubdata: continue
            print('%s %s' % (hub, hubdata))     #!# KeyError #!#
            self.deBug('No Seq for %s' % hub)
        ### All sequences ###
        self.obj['SeqList'].saveFasta()
        ### Output PPI Datasets ###
        for hub in rje.sortKeys(self.dict['PPI']):
            partners = []
            for spoke in self.dict['PPI'][hub]:
                if not self.opt['AllIso']:
                    partners.append(self.dict['HPRD'][spoke]['Seq'])
                    continue
                partners += self.dict['HPRD'][spoke]['Seq']
            fasfile = '%s%s_hprd.fas' % (outpath, self.dict['HPRD'][hub]['gene'])
            if not partners: continue
            self.obj['SeqList'].saveFasta(seqs=partners, seqfile=fasfile)
        self.log.printLog('#FAS', 'HPRD PPI fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.saveFasta()', printerror=True, quitchoice=False)
def save(self):  ### Saves parsed REST output to files
    '''
    Saves parsed REST output to files.

    The lower-case 'Rest' setting selects what is written:
    - a key of self.dict['Output']: only that output is saved to its self.dict['Outfile'] file;
    - 'full' or 'text': self.restFullOutput() is saved to '<RestOutDir><RestBase>.rest';
    - any other non-empty value: logged as unrecognised, then the full output is saved if the user agrees;
    - empty: every parsed output with a known outfile is saved (after backing up the existing file).
    << True when the full output is written, False when the fallback is declined or self.i() < 0,
       None after saving individual outputs.
    '''
    rbase = '%s%s' % (self.getStr('RestOutDir'), rje.baseFile(self.getStr('RestBase'), strip_path=True, keepext=True))
    rje.mkDir(self, self.getStr('RestOutDir'))
    outputs = rje.sortKeys(self.dict['Output'])
    rest = self.getStrLC('Rest')
    if rest in outputs: outputs = [rest]
    elif rest:
        # Every remaining non-empty format either is, or falls back to, the full REST output
        label = rest
        if rest not in ['full', 'text']:
            self.printLog('#OUTFMT', 'REST output format "%s" not recognised.' % rest)
            if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'): return False
            label = 'full'
        outfile = '%s.rest' % rbase
        with open(outfile, 'w') as OUT: OUT.write(self.restFullOutput())
        self.printLog('#OUT', '%s: %s' % (label, outfile))
        return True
    for rkey in outputs:
        if rkey in self.dict['Outfile']:
            outfile = self.dict['Outfile'][rkey]
            rje.backup(self, outfile)
            with open(outfile, 'w') as OUT: OUT.write(self.dict['Output'][rkey])
            self.printLog('#OUT', '%s: %s' % (rkey, outfile))
        elif rkey not in ['intro']: self.warnLog('No outfile parsed/generated for %s output' % rkey)
def saveReadMe(self, filename='pydocs.txt', append=False):  ### Prints docs for modules to file
    '''
    Prints docs for modules to file.
    >> filename:str = output file name
    >> append:boolean = whether to append to filename rather than overwrite it. When overwriting,
        self.readMeHeader() is written first.

    For each source directory in the 'PyDoc' object's 'SourceDir' list, the docstring of every entry of the
    'Module' table is written if the entry key contains the directory name and the module's .py file exists.
    Errors are passed to self.errorLog(); nothing is returned.
    '''
    try:
        ### ~ [1] Setup ~ ###
        pydoc = self.obj['PyDoc']
        if append:
            self.printLog('#DOC', 'Appending docstrings to %s' % filename)
            PYDOC = open(filename, 'a')
        else:
            rje.mkDir(self, filename)
            self.printLog('#DOC', 'Writing docstrings to %s' % filename)
            PYDOC = open(filename, 'w')
        try:    # Guarantees the output handle is closed even if a write or table lookup fails
            if not append: PYDOC.write(self.readMeHeader())
            db = self.db('Module')
            dx = 0
            ### ~ [2] Output Docstrings ~ ###
            for sourcedir in pydoc.list['SourceDir']:
                PYDOC.write('-%s:\n\n' % sourcedir)
                for pyfile in db.dataKeys():
                    entry = db.data(pyfile)
                    module = entry['Module']
                    if sourcedir not in pyfile: continue
                    if not os.path.exists('%s%s%s.py' % (pydoc.getStr('PyPath'), rje.makePath(sourcedir), module)): continue
                    ## ~ [2a] ~ Module docstring ~ ##
                    mtxt = '### ~~~ Module %s ~ [%s] ~~~ ###' % (module, pyfile)
                    # Pad the banner with '~' just inside each end until it is at least 122 characters long
                    while len(mtxt) < 122: mtxt = mtxt[:5] + '~' + mtxt[5:-5] + '~' + mtxt[-5:]
                    try:
                        PYDOC.write('%s\n\n%s\n' % (mtxt, entry['DocString']))
                        dx += 1
                    except:
                        self.errorLog('Cannot write DocString for %s' % module, printerror=False)
                        PYDOC.write('%s\n\nDocString Error!\n' % (mtxt))
                        dx += 1
                    # NOTE(review): spacer placed per module; original nesting was ambiguous - confirm
                    PYDOC.write('\n\n\n')
        finally: PYDOC.close()
        self.printLog('#DOC', 'Output to %s complete: %s modules.' % (filename, rje.iStr(dx)))
    except: self.errorLog('Error in %s.saveReadMe()' % self.prog())
def mapPhosByBLAST(self,fasfile):   ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
    '''
    BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).
    >> fasfile:str = fasta file of query sequences (loaded as 'seqin' and used as the BLAST 'InFile')
    For each BLAST search, hits passing the HomSim threshold are aligned with the query and every
    alignment column where a hit's known phosphosite position matches the query residue is written to
    the self.info['PhosRes'] delimited file with Evidence 'Self', 'ID' or 'Hom'.
    Errors are logged via self.log.errorLog rather than raised. Returns None.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F']
        qseqlist = rje_seq.SeqList(self.log,scmd)
        qdict = qseqlist.seqNameDic()   # Looked up below by search.info['Name']
        ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        basefile = rje.baseFile(fasfile)
        # Default results file name is derived from the input file when PhosRes is unset ('' or 'none')
        if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile
        headers = ['Name','Pos','AA','PELM','PELMPos','Evidence']
        delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes']))
        rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True)
        ppath = rje.makePath('PhosALN')
        rje.mkDir(self,ppath)
        ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F'])
        pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile})
        pblast.setStat({'HitAln':pblast.stat['OneLine']})
        pblast.opt['Complexity Filter'] = False
        pblast.formatDB(force=False)
        ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
        # IDSim/HomSim values below 1.0 are multiplied by 100 (compared against a percentage below),
        # then floored at 0.0
        for g in ['ID','Hom']:
            if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
            self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g])
        ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pblast.blast(use_existing=True,log=True)    # BLAST
        pblast.readBLAST(gablam=True)               # Read in
        # Searches are consumed (popped) one at a time until pblast.search is empty
        while pblast.search:
            ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            search = pblast.search.pop(0)
            qseq = qdict[search.info['Name']]
            idlist = []     # Sequences whose matching sites are reported with 'ID' evidence
            qlen = qseq.aaLen()
            hitdict = search.hitSeq(self.obj['SeqList'])
            aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F'])
            aln.seq = [qseq]
            pdict = {}      # Dictionary of {hseq:[poslist]}
            rdict = {qseq:0}      # Dictionary of {hseq:res}
            for hit in search.hit[0:]:
                hseq = hitdict[hit]
                pdict[hseq] = []
                for pos in rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos)
                # Hit with the same name as the query: its site positions are re-keyed to the query
                # itself and the hit is not added to the alignment
                if hit.info['Name'] == search.info['Name']:
                    if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False):
                        self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name'])
                    idlist.append(qseq)
                    pdict[qseq] = pdict.pop(hseq)
                    continue
                gdict = hit.globalFromLocal(qlen)
                # Query-vs-hit score as a percentage of query length; hits below HomSim are dropped
                qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                if qvh < self.stat['HomSim']:
                    pdict.pop(hseq)
                    continue
                aln.seq.append(hseq)
                if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']: idlist.append(hseq)
                rdict[hseq] = 0
            aln.muscleAln()   #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
            # Extra all-gap 'PhosAln' sequence is appended last; it is the aln.seq[-1] passed to addPhos below
            aln._addSeq('PhosAln','-' * qseq.seqLen())
            aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])
            ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # NOTE(review): looks like a leftover debugging print to stdout (Python 2 statement) - confirm
            print '>>>\n', aln.seq, pdict.keys(), rdict.keys()
            # Walk the columns, keeping a running count of non-gap residues per sequence in rdict
            for a in range(qseq.seqLen()):
                if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1
                for hseq in pdict:
                    if hseq.info['Sequence'][a] == '-': continue
                    if hseq != qseq: rdict[hseq] += 1   # qseq was already counted above
                    if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][a] == hseq.info['Sequence'][a]:  # Phosphosite
                        pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qseq.info['Sequence'][a],
                                 'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'}
                        # Evidence defaults to 'Hom'; upgraded to 'Self' (query itself) or 'ID' (in idlist)
                        if hseq == qseq: pdata['Evidence'] = 'Self'
                        elif hseq in idlist: pdata['Evidence'] = 'ID'
                        rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata)
                        self.addPhos(aln.seq[-1],a,pdata['Evidence'])
            ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            aln.saveFasta()
        # Align hits for each > X %ID
        # Map phosphosites onto alignment and output
        # return
    except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
def mapPhosByBLAST( self, fasfile ):  ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
    '''
    BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).
    >> fasfile:str = fasta file of query sequences (loaded as 'seqin' and used as the BLAST 'InFile')
    NOTE(review): this copy of the method ends inside the hit-filtering loop - the `try` below has no
    matching `except` and nothing after the loop is present - so it appears to be truncated. Confirm
    against the complete definition before relying on it.
    '''
    try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        scmd = self.cmd_list + [ 'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F' ]
        qseqlist = rje_seq.SeqList(self.log, scmd)
        qdict = qseqlist.seqNameDic()  # Looked up below by search.info['Name']
        ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        basefile = rje.baseFile(fasfile)
        # Default results file name is derived from the input file when PhosRes is unset ('' or 'none')
        if self.info['PhosRes'].lower() in ['', 'none']:
            self.info['PhosRes'] = '%s.phosres.tdt' % basefile
        headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence']
        delimit = rje.getDelimit( self.cmd_list, rje.delimitFromExt(filename=self.info['PhosRes']))
        rje.delimitedFileOutput(self, self.info['PhosRes'], headers, delimit, rje_backup=True)
        ppath = rje.makePath('PhosALN')
        rje.mkDir(self, ppath)
        ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        pblast = rje_blast.BLASTRun(self.log, self.cmd_list + ['formatdb=F'])
        pblast.setInfo({ 'Name': '%s.p.blast' % rje.baseFile(fasfile), 'DBase': self.info['PELMFas'], 'InFile': fasfile })
        pblast.setStat({'HitAln': pblast.stat['OneLine']})
        pblast.opt['Complexity Filter'] = False
        pblast.formatDB(force=False)
        ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
        # IDSim/HomSim values below 1.0 are multiplied by 100 (compared against a percentage below),
        # then floored at 0.0
        for g in ['ID', 'Hom']:
            if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
            self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g])
        ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pblast.blast(use_existing=True, log=True) # BLAST
        pblast.readBLAST(gablam=True) # Read in
        # Searches are consumed (popped) one at a time until pblast.search is empty
        while pblast.search:
            ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            search = pblast.search.pop(0)
            qseq = qdict[search.info['Name']]
            idlist = []  # Sequences that pass the IDSim (and species) test, plus the query for a self-hit
            qlen = qseq.aaLen()
            hitdict = search.hitSeq(self.obj['SeqList'])
            aln = rje_seq.SeqList( self.log, self.cmd_list + ['autoload=F', 'autofilter=F'])
            aln.seq = [qseq]
            pdict = {} # Dictionary of {hseq:[poslist]}
            rdict = {qseq: 0} # Dictionary of {hseq:res}
            for hit in search.hit[0:]:
                hseq = hitdict[hit]
                pdict[hseq] = []
                for pos in rje.sortKeys( self.dict['PhosphoSites'][hseq.info['AccNum']]):
                    pdict[hseq].append(pos)
                # Hit with the same name as the query: its site positions are re-keyed to the query
                # itself and the hit is not added to the alignment
                if hit.info['Name'] == search.info['Name']:
                    if qseq.getSequence(case=False, gaps=False) != hseq.getSequence( case=False, gaps=False):
                        self.log.errorLog( 'Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name'])
                    idlist.append(qseq)
                    pdict[qseq] = pdict.pop(hseq)
                    continue
                gdict = hit.globalFromLocal(qlen)
                # Query-vs-hit score as a percentage of query length; hits below HomSim are dropped
                qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                if qvh < self.stat['HomSim']:
                    pdict.pop(hseq)
                    continue
                aln.seq.append(hseq)
                if (qseq.sameSpec(hseq) or not self.opt['UseSpec'] ) and qvh >= self.stat['IDSim']:
                    idlist.append(hseq)
                rdict[hseq] = 0
def gopher(self):   ### Sets up data for GOPHER run
    '''
    Sets up data for GOPHER run.

    Formats the <Basefile>.ygob.fas and <Basefile>.yeast.fas BLAST databases, then converts each
    pillar in self.list['Pillars'] into BLAST ID files: every pillar ID is mapped to an accession
    (via extractDetails and self.dict['Rename']; stored in self.dict['PillarMap']) and, for each
    pillar member that is in self.list['YeastSeq'] - or has SpecCode 'YEAST' when that list is
    empty - a BLAST/<acc>.blast.id file listing the short names of the pillar's sequences is written.
    Errors are logged and re-raised. Returns None.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.mkDir(self,'BLAST/')
        for dbtype in ['ygob','yeast']:
            fasfile = '%s.%s.fas' % (self.info['Basefile'],dbtype)
            rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fasfile=fasfile,protein=True,force=False)
        seqdict = self.obj['SeqList'].seqNameDic('AccNum')
        ymap = self.dict['PillarMap'] = {}      # Dictionary of {pillar ID:accession}
        ### ~ [2] Convert Pillars to BLAST IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (px,ptot) = (0.0,len(self.list['Pillars'])); ox = 0
        for pillar in self.list['Pillars']:
            # px advances by 100 per pillar, so px/ptot is already a percentage
            self.progLog('\r#YGOB','Converting YGOB Pillars for GOPHER: %.2f%%' % (px/ptot)); px += 100
            ## ~ [2a] Map pillar IDs onto sequence short names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            newpillar = []
            for yid in pillar:
                seq = rje_sequence.Sequence(self.log,self.cmd_list)
                seq.opt['Yeast'] = True
                seq.info['Name'] = yid
                seq.extractDetails(gnspacc=True)
                ygob = seq.info['AccNum']
                acc = self.dict['Rename'].get(ygob,ygob)    # Renamed accession if there is one
                ymap[yid] = acc
                if acc not in seqdict:
                    self.printLog('\r#GENE','Non-coding gene %s (%s)? Cannot find in fasta file' % (acc,yid))
                    continue
                try: newpillar.append(seqdict[acc].shortName())
                except:
                    # Same stdout text as the former `print yid, ygob, acc` statement, but valid
                    # under both Python 2 and Python 3
                    print('%s %s %s' % (yid,ygob,acc))
                    self.errorLog(rje_zen.Zen().wisdom())
            if not newpillar: continue
            ## ~ [2b] Output BLAST ID file for each yeast sequence in pillar ~~~~~~~~~~~~~~~~~~ ##
            idtext = '\n'.join(newpillar)
            for ygob in pillar:
                acc = ymap[ygob]
                if acc not in seqdict: continue
                if acc in self.list['YeastSeq'] or (not self.list['YeastSeq'] and seqdict[acc].info['SpecCode'] == 'YEAST'):
                    idfile = rje.makePath('BLAST/%s.blast.id' % acc,wholepath=True)
                    with open(idfile,'w') as IDFILE: IDFILE.write(idtext)   # Closed promptly: one file per gene
                    ox += 1
        self.progLog('\r#YGOB','Converted YGOB Pillars for GOPHER: %s BLAST ID files.' % rje.iStr(ox))
    except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
def fpi(self):  ### Family-protein interactions
    '''
    Family-protein interactions.

    Does nothing if self.dict['Domain'] is empty. Otherwise, for each query in self.dict['PPI'] whose
    family (self.dict['Fam'][qry]) has 2+ members:
    - interactors occurring 2+ times across the family members' PPI lists become the family's FPI;
    - those interactors are removed from each family member's PPI list (and the member from the
      interactor's own list);
    - accession numbers of FPI names found in self.dict['Seq'] are written, one per line, to
      SLiMPID_FPI/<gene>.fpi.acc. Names not found are reported in a '#BAD' log line.
    Finally, hubs that are empty/false or have no interactors left are dropped from self.dict['PPI'].
    Errors are logged via self.errorLog (quitchoice=True). Returns None.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.dict['Domain']: return
        outdir = 'SLiMPID_FPI'
        # Trailing separator marks the path unambiguously as a directory, in line with the other
        # mkDir callers in this file (which pass 'BLAST/', 'RegPPI/' etc. or a file name)
        rje.mkDir(self,'%s/' % outdir)
        fpi = {}            # Dictionary of {family:[interactors]}
        badname = []        # FPI names with no entry in self.dict['Seq']
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for qry in rje.sortKeys(self.dict['PPI']):
            try:
                fam = self.dict['Fam'][qry]
                if len(fam) < 2: continue
            except: self.errorLog('Problem with "%s" protein family' % qry); continue
            ## ~ [2a] Pool family interactors and keep those seen 2+ times ~~~~~~~~~~~~~~~~~~~~ ##
            pooled = []
            for hub in fam:
                if hub in self.dict['PPI']: pooled += self.dict['PPI'][hub]    # Add with redundancy
            seen = {}
            for spoke in pooled: seen[spoke] = seen.get(spoke,0) + 1
            fpi[qry] = [spoke for spoke in pooled if seen[spoke] > 1]   # Must have 2+ family interactions
            shared = set(fpi[qry])      # Constant-time membership for the removal pass
            ## ~ [2b] Remove family interactions from individual PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for hub in fam:
                if hub not in self.dict['PPI']: continue
                for spoke in self.dict['PPI'][hub][0:]:     # Copy: the list is edited while looping
                    if spoke not in shared: continue
                    self.dict['PPI'][hub].remove(spoke)
                    if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
            ## ~ [2c] Output accession numbers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            fpi[qry] = rje.sortUnique(fpi[qry],False,False)
            acc = []
            gene = self.dict['Gene'][qry]
            for name in fpi[qry]:
                if not name: continue
                if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                elif name not in badname: badname.append(name)
            with open('%s/%s.fpi.acc' % (outdir,gene),'w') as ACC: ACC.write('\n'.join(acc))
            self.printLog('#FPI','%s family => %d interactors' % (gene,len(acc)))
        if badname:
            badname.sort()
            self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),'; '.join(badname)))
        ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hx = len(self.dict['PPI'])
        for hub in rje.sortKeys(self.dict['PPI']):
            if hub and self.dict['PPI'][hub]: continue
            self.dict['PPI'].pop(hub)
            self.printLog('#FPI','No %s PPI left after FPI removed' % hub)
        self.printLog('#PPX','%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
    except: self.errorLog('Problem with SLiMPID.fpi()',quitchoice=True)
def mapRegionsToSequences(self):    ### Maps tabulated PPI regions onto sequence datasets
    '''
    Maps tabulated PPI regions onto sequence datasets.

    Reads 'ppi_region.tdt' (keyed by Interactor and Protein, with Start/End lists) and, for each
    hub-spoke pair, writes a fasta file containing the spoke sequence in lower case with the listed
    region(s) in upper case, followed by the sequences of the hub's other interactors. Spokes not
    already in the hub's self.dict['PPI'] list are added to it and output to RegPPIAdd/ rather than
    RegPPI/. Hubs with fewer than 3 interactors are skipped.
    Errors are logged and re-raised. Returns None.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        minseq = 3
        outdir = 'RegPPI/'
        adddir = 'RegPPIAdd/'
        rje.mkDir(self,outdir)
        rje.mkDir(self,adddir)
        tabfile = 'ppi_region.tdt'
        region = rje.dataDict(self,tabfile,['Interactor','Protein'],['Start','End'],lists=True)
        ### ~ [2] Work through each pair in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        px = 0.0; ptot = len(region)
        for pair in rje.sortKeys(region):
            # px advances by 100 per pair, so px/ptot is already a percentage
            self.progLog('\r#FAS','Generating fasta files: %.2f%%' % (px/ptot)); px += 100.0
            ## ~ [2a] Map sequences to PPI dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            [hub,spoke] = pair.split('\t')
            try: qryseq = self.dict['SeqObj'][spoke]
            except KeyError: self.printLog('\n#QRY','Spoke gene "%s" missing from Sequence file' % spoke); continue
            try: spoke = self.dict['Seq2Gene'][spoke]
            except KeyError: self.printLog('\n#QRY','Spoke protein "%s" missing from PPI dictionary' % spoke); continue
            if hub not in self.dict['PPI']: self.printLog('\n#HUB','Hub gene "%s" missing from PPI dictionary' % hub); continue
            addspoke = spoke not in self.dict['PPI'][hub]
            if addspoke:
                self.dict['PPI'][hub].append(spoke)
                self.printLog('\n#PPI','Added spoke gene "%s" to hub "%s" interactome' % (spoke,hub))
            if len(self.dict['PPI'][hub]) < minseq:
                self.printLog('\n#HUB','Hub "%s" interactome too small (<%s spokes)' % (hub,minseq)); continue
            ## ~ [2b] Identify query sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # All Start and End positions are pooled and sorted, then consumed in consecutive pairs.
            # NOTE(review): overlapping regions would be re-paired by the sort - confirm this is intended
            reglist = sorted(int(pos) for pos in region[pair]['Start'] + region[pair]['End'])
            qsequence = qryseq.info['Sequence'].lower()
            self.deBug(len(qsequence))
            self.deBug(qsequence)
            prelen = len(qsequence)
            while reglist:
                self.deBug(reglist)
                # Start is converted from 1-based to a 0-based slice index; End is used as the slice end
                try: startx = reglist.pop(0) - 1; endx = reglist.pop(0)
                except IndexError: self.errorLog('%s PPI Region problem: %s' % (pair,region[pair])); continue
                self.deBug(qsequence[startx-1:endx+1].upper())
                qsequence = qsequence[:startx] + qsequence[startx:endx].upper() + qsequence[endx:]
                self.deBug(qsequence)
                # Sanity check: re-casing a region must never change the sequence length
                if len(qsequence) != prelen:
                    self.printLog('#F**K','%s' % region[pair])
                    self.printLog('#F**K',qryseq.info['Sequence'].lower())
                    self.printLog('#F**K',qsequence)
                    raise ValueError
            ## ~ [2c] Output sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if addspoke: outfile = '%s%s.%s.fas' % (adddir,hub,spoke)
            else: outfile = '%s%s.%s.fas' % (outdir,hub,spoke)
            ox = 1      # Sequences output, including the query
            # File is opened once and closed by the with-block, rather than re-opened per sequence
            with open(outfile,'w') as FAS:
                FAS.write('>%s\n%s\n' % (qryseq.info['Name'],qsequence))
                for spoke2 in self.dict['PPI'][hub]:
                    if spoke2 == spoke: continue
                    try:
                        sseq = self.dict['SeqObj'][self.dict['Gene2Seq'][spoke2]]
                        fasta = '>%s\n%s\n' % (sseq.info['Name'],sseq.info['Sequence'])
                    except KeyError: continue   # No sequence mapped for this interactor: skip it
                    FAS.write(fasta); ox += 1
            self.printLog('\n#FAS','%s sequences output to %s' % (ox,outfile))
    except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
def domainFasta(self):    ### Outputs parsed domain and domain PPI datasets in Fasta format
    '''
    Outputs parsed domain tables and domain PPI datasets in Fasta format.

    Writes two tab-delimited files to self.info['OutDir']: HPRD.domains.tdt (Domain, HPRD, Gene) and
    HPRD.domsource.tdt (Domain, Source). Then, for each domain, saves the sequences of all
    interactors of the domain-containing proteins to HPRD_Domain_Datasets/<domain>_hprd.fas; domains
    without interactors are reported in the log.
    Errors are logged and re-raised. Returns None.
    '''
    try:
        ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Domain','HPRD','Gene']
        dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
        rje.delimitedFileOutput(self,dfile,headers,'\t')
        sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
        shead = ['Domain','Source']
        rje.delimitedFileOutput(self,sfile,shead,'\t')
        dtot = len(self.dict['Domains'])
        dx = 0.0    # Advances by 100 per domain, so dx/dtot is already a percentage
        for domain in rje.sortKeys(self.dict['Domains']):
            self.log.printLog('\r#DOM','HPRD Domain output (%s): %.1f%%' % (dfile,dx/dtot),newline=False,log=False)
            dx += 100.0
            for hid in self.dict['Domains'][domain]:
                datadict = {'Domain':domain,'HPRD':hid,'Gene':self.dict['HPRD'][hid]['gene']}
                rje.delimitedFileOutput(self,dfile,headers,'\t',datadict)
            for source in self.dict['DomainSource'][domain]:
                datadict = {'Domain':domain,'Source':source}
                rje.delimitedFileOutput(self,sfile,shead,'\t',datadict)
        self.log.printLog('\r#DOM','HPRD Domain output (%s): %s domains.' % (dfile,rje.integerString(dtot)))
        ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
        rje.mkDir(self,datpath)
        for domain in rje.sortKeys(self.dict['Domains']):
            ## Generate a list of all interactors with domain-containing proteins ##
            partners = set()    # Set gives uniqueness without a linear scan per interactor
            for p1 in self.dict['Domains'][domain]:
                if p1 in self.dict['PPI']: partners.update(self.dict['PPI'][p1])
            plist = sorted(partners)
            ## Generate Sequence list and output ##
            mylist = []
            for p in plist:
                # With AllIso the 'Seq' entry is added with += (i.e. as a list of sequences);
                # otherwise it is appended as a single item
                if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']
                else: mylist.append(self.dict['HPRD'][p]['Seq'])
            fasfile = '%s%s_hprd.fas' % (datpath,domain)
            if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=fasfile)
            else: self.log.printLog('#DOM','No PPI partners for domain "%s"' % domain)
        self.log.printLog('\r#DOM','HPRD Domain fasta output complete.')
    except:
        # Message now names this method (it previously reported HPRD.saveFasta())
        self.log.errorLog('Error in HPRD.domainFasta()',printerror=True,quitchoice=False)
        raise
def mapRegionsToSequences(self):    ### Maps tabulated PPI regions onto sequence datasets
    '''
    Maps tabulated PPI regions onto sequence datasets.

    Reads 'ppi_region.tdt' (keyed by Interactor and Protein, with Start/End lists) and, for each
    hub-spoke pair, writes a fasta file containing the spoke sequence in lower case with the listed
    region(s) in upper case, followed by the sequences of the hub's other interactors. Spokes not
    already in the hub's self.dict['PPI'] list are added to it and output to RegPPIAdd/ rather than
    RegPPI/. Hubs with fewer than 3 interactors are skipped.
    Errors are logged and re-raised. Returns None.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        minseq = 3
        outdir = 'RegPPI/'
        adddir = 'RegPPIAdd/'
        rje.mkDir(self,outdir)
        rje.mkDir(self,adddir)
        tabfile = 'ppi_region.tdt'
        region = rje.dataDict(self,tabfile,['Interactor','Protein'],['Start','End'],lists=True)
        ### ~ [2] Work through each pair in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        px = 0.0; ptot = len(region)
        for pair in rje.sortKeys(region):
            # px advances by 100 per pair, so px/ptot is already a percentage
            self.progLog('\r#FAS','Generating fasta files: %.2f%%' % (px/ptot)); px += 100.0
            ## ~ [2a] Map sequences to PPI dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            [hub,spoke] = pair.split('\t')
            try: qryseq = self.dict['SeqObj'][spoke]
            except KeyError: self.printLog('\n#QRY','Spoke gene "%s" missing from Sequence file' % spoke); continue
            try: spoke = self.dict['Seq2Gene'][spoke]
            except KeyError: self.printLog('\n#QRY','Spoke protein "%s" missing from PPI dictionary' % spoke); continue
            if hub not in self.dict['PPI']: self.printLog('\n#HUB','Hub gene "%s" missing from PPI dictionary' % hub); continue
            addspoke = spoke not in self.dict['PPI'][hub]
            if addspoke:
                self.dict['PPI'][hub].append(spoke)
                self.printLog('\n#PPI','Added spoke gene "%s" to hub "%s" interactome' % (spoke,hub))
            if len(self.dict['PPI'][hub]) < minseq:
                self.printLog('\n#HUB','Hub "%s" interactome too small (<%s spokes)' % (hub,minseq)); continue
            ## ~ [2b] Identify query sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # All Start and End positions are pooled and sorted, then consumed in consecutive pairs.
            # NOTE(review): overlapping regions would be re-paired by the sort - confirm this is intended
            reglist = sorted(int(pos) for pos in region[pair]['Start'] + region[pair]['End'])
            qsequence = qryseq.info['Sequence'].lower()
            self.deBug(len(qsequence))
            self.deBug(qsequence)
            prelen = len(qsequence)
            while reglist:
                self.deBug(reglist)
                # Start is converted from 1-based to a 0-based slice index; End is used as the slice end
                try: startx = reglist.pop(0) - 1; endx = reglist.pop(0)
                except IndexError: self.errorLog('%s PPI Region problem: %s' % (pair,region[pair])); continue
                self.deBug(qsequence[startx-1:endx+1].upper())
                qsequence = qsequence[:startx] + qsequence[startx:endx].upper() + qsequence[endx:]
                self.deBug(qsequence)
                # Sanity check: re-casing a region must never change the sequence length
                if len(qsequence) != prelen:
                    self.printLog('#F**K','%s' % region[pair])
                    self.printLog('#F**K',qryseq.info['Sequence'].lower())
                    self.printLog('#F**K',qsequence)
                    raise ValueError
            ## ~ [2c] Output sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if addspoke: outfile = '%s%s.%s.fas' % (adddir,hub,spoke)
            else: outfile = '%s%s.%s.fas' % (outdir,hub,spoke)
            ox = 1      # Sequences output, including the query
            # File is opened once and closed by the with-block, rather than re-opened per sequence
            with open(outfile,'w') as FAS:
                FAS.write('>%s\n%s\n' % (qryseq.info['Name'],qsequence))
                for spoke2 in self.dict['PPI'][hub]:
                    if spoke2 == spoke: continue
                    try:
                        sseq = self.dict['SeqObj'][self.dict['Gene2Seq'][spoke2]]
                        fasta = '>%s\n%s\n' % (sseq.info['Name'],sseq.info['Sequence'])
                    except KeyError: continue   # No sequence mapped for this interactor: skip it
                    FAS.write(fasta); ox += 1
            self.printLog('\n#FAS','%s sequences output to %s' % (ox,outfile))
    except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible