def tabulatePPIRegion(self):    ### Tabulates regions of known PPI from DAT file
    '''
    Tabulates regions of known PPI from DAT file.

    Greps ID lines and interaction-annotated REGION feature lines out of a human UniFake DAT file and writes
    one row per (protein, region, interactor) to ppi_region.tdt. If the table already exists and Force=F, no
    work is done. Returns True on success.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tabfile = 'ppi_region.tdt'
        # NOTE(review): hard-coded input path - this only works on the original host filesystem.
        unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
        if os.path.exists(tabfile) and not self.opt['Force']: return self.printLog('#REGTAB','%s found. (Force=F)' % tabfile)
        headers = ['Protein','Start','End','Interactor']
        rje.delimitedFileOutput(self,tabfile,headers,rje_backup=True)   # Backs up any old table; writes header row
        ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Keep ID and REGION lines, filter for human/interaction annotation; grep -B 1 keeps the ID line
        # immediately preceding each matching REGION line so the current protein can be tracked.
        gcmd = "grep -P '(ID |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
        self.printLog('#GREP',gcmd)
        prot = None; rx = 0; plist = []; ilist = []     # Current protein; region count; unique proteins; unique interactors
        for gline in os.popen(gcmd).readlines():
            if rje.matchExp('ID (\S+)',gline): prot = rje.matchExp('ID (\S+)',gline)[0]     # Track current protein ID
            if rje.matchExp('FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',gline):
                (rstart,rend,rint) = rje.matchExp('FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',gline)
                # The "interacts with" text may name several partners; keep each leading uppercase/digit token.
                for ppi in string.split(rint):
                    if rje.matchExp('^([A-Z0-9][A-Z0-9]+)',ppi):
                        datadict = {'Protein':prot,'Start':rstart,'End':rend,'Interactor':rje.matchExp('^([A-Z0-9][A-Z0-9]+)',ppi)[0]}
                        rje.delimitedFileOutput(self,tabfile,headers,datadict=datadict); rx += 1
                        if prot not in plist: plist.append(prot)
                        if datadict['Interactor'] not in ilist: ilist.append(datadict['Interactor'])
            self.progLog('\r#REGTAB','Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString(len(plist)),rje.integerString(len(ilist)), rje.integerString(rx)))
        self.printLog('\r#REGTAB','Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(plist)),rje.integerString(len(ilist)),rje.integerString(rx),tabfile))
        return True
    except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
def makeGOFile(self):   ### Maps GO to sequences and outputs table for R analysis
    '''
    Maps GO to sequences and outputs table for R analysis.

    For each motif whose occurrence dictionary retains the 'ELM' type plus at least one other type after the
    MinOcc filter, maps each gene to its GO terms (via self.ensGO), drops GO terms with fewer than MinOcc
    genes, and writes one row per remaining GO term / occurrence to <ResFile>.goer.tdt.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outfile = '%s.goer.tdt' % self.info['ResFile']
        headers = ['GOID','Motif','Type','Gene','Cons','HomNum','GlobID','LocID','Hyd','SA']
        rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
        ### ~ [2] ~ Work through dictionary and output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (mx,mtot) = (-100.0,len(self.dict['Occ']))  # mx starts at -100 so the first increment reports 0%
        for motif in rje.sortKeys(self.dict['Occ']):
            mx += 100.0; self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|CheckSeq) ' % (outfile,(mx/mtot),motif))
            ## ~ [2a] ~ Check MinOcc in terms of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for type in rje.sortKeys(self.dict['Occ'][motif]):  # sortKeys returns a list copy, so popping is safe
                if len(self.dict['Occ'][motif][type]) < self.stat['MinOcc']: self.dict['Occ'][motif].pop(type)
            # Need the ELM type itself plus at least one other type to be worth outputting
            if 'ELM' not in self.dict['Occ'][motif] or len(self.dict['Occ'][motif]) < 2: continue
            for type in self.dict['Occ'][motif]:
                ## ~ [2b] ~ Map GO terms and check MinOcc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Check%s) ' % (outfile,(mx/mtot),motif,type)); godict = {}   # Temp dictionary of {GOID:[Seqs]}
                for gene in self.dict['Occ'][motif][type]:
                    for go in self.ensGO(gene):
                        if go not in godict: godict[go] = [gene]
                        else: godict[go].append(gene)
                self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|OccGO%s) ' % (outfile,(mx/mtot),motif,type));
                for go in rje.sortKeys(godict):     # Drop GO terms mapped to fewer than MinOcc genes
                    if len(godict[go]) < self.stat['MinOcc']: godict.pop(go)
                ## ~ [2c] ~ Output remaining GO terms occurrences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Output%s)' % (outfile,(mx/mtot),motif,type));
                for go in rje.sortKeys(godict):
                    for gene in godict[go]:
                        for occdict in self.dict['Occ'][motif][type][gene]:
                            # Merge the occurrence stats (Cons/HomNum/etc.) into the identifying fields
                            datadict = rje.combineDict({'GOID':'GO:%s' % go,'Motif':motif,'Type':type,'Gene':gene},occdict)
                            rje.delimitedFileOutput(self,outfile,headers,datadict=datadict)
            self.printLog('#OUT','Output for %s %s complete.' % (motif,rje.sortKeys(self.dict['Occ'][motif])),screen=False)
        self.printLog('\r#OUT','Generating %s output complete! ' % (outfile))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def _setupOutput(self):     ### Sets up output files self.str['MapFas','MissFas','MapRes']
    '''
    Sets up output files self.str['MapFas','MissFas','MapRes'].

    Derives the three output file names from ResFile (itself derived from SeqIn + MapDB if unset), backs up
    any existing copies, and writes the MapRes header row. If StartFrom is set, switches Append=T so earlier
    results are retained.
    '''
    ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    delimit = rje.getDelimit(self.cmd_list)
    if self.str['StartFrom'].lower() in ['','none']: self.str['StartFrom'] = ''
    else:
        self.bool['Append'] = True      # Resuming a partial run: do not clobber earlier output
        self.printLog('#CMD','StartFrom = "%s" so Append=T' % self.str['StartFrom'])
    ### ~ [1] General ResFile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    files = {'MapFas':'mapping.fas','MissFas':'missing.fas','MapRes':'mapping.%s' % rje.delimitExt(delimit)}
    if self.getBool('Combine'): files.pop('MissFas')    # Combined output: missing seqs go in the main fasta
    if self.str['ResFile'].lower() in ['','none']:
        self.str['ResFile'] = '%s.%s' % (rje.baseFile(self.str['SeqIn']),rje.baseFile(self.str['MapDB'],strip_path=True))
    for file in files.keys():
        self.setStr({file: self.getStr('ResFile') + '.' + files[file]})
        rje.backup(self,self.getStr(file))      # Backup honours the Append setting
    ### ~ [2] Headers for MapRes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    #!# Consider replacing with rje_db object? #!#
    self.list['Headers'] = ['Query','Hit','Method','MapRank','BlastRank','EVal','Score']
    for qh in ['Query','Hit']:
        self.list['Headers'] += ['%s_Species' % qh]
        if self.bool['GablamOut']:
            for st in ['Len','Sim','ID']: self.list['Headers'] += ['%s_%s' % (qh,st)]
    rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],delimit)
def run(self,setup=True):   ### Main Run Method
    '''
    Main Run Method
    >> setup:bool [True] = Sets up headers and reads in existing data if present.
    '''
    try:
        ### ~ Setup & Read existing data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if setup: self.setup()
        headers = self.list['Headers']
        delimit = rje.delimitFromExt(filename=self.info['CardOut'])     # Delimiter inferred from output extension
        # Only add EnsLoci columns if the EnsLoci source file is actually available
        if os.path.exists(self.info['EnsLoci']):
            for h in ['EnsLoci','EnsDesc']:
                if h not in headers: headers.append(h)
        rje.delimitedFileOutput(self,self.info['CardOut'],headers,delimit,rje_backup=True)
        ### ~ Read EnsLoci for incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.ensLoci()
        ### ~ Parse data from GeneCards website and/or previously read aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.processGenes(self.list['Genes'])
        self.interactiveUpdate()
        ### ~ Add EnsEMBL EnsLoci data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.addEnsLoci()
        ### ~ Output GeneCards data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.outputCards()
    except:
        self.log.errorLog('Apocalyptic error with GeneCards.run()')
        raise
def outputCards(self):  ### Outputs cards to delimited file
    '''
    Outputs cards to delimited file (appends to self.info['CardOut']).

    With Purify+Restrict, aliases whose card resolved to a different symbol are replaced by that symbol in the
    gene list first. Also tallies how many true (non-alias) genes lack EnsEMBL gene / EnsLoci mappings.
    '''
    ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    genelist = self.list['Genes']
    if self.opt['Purify'] and self.opt['Restrict']:
        for gene in genelist[0:]:   # Iterate a copy: genelist is modified inside the loop
            if self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']:  # Replace with symbol
                genelist.remove(gene)
                if self.dict['GeneCard'][gene]['Symbol'] not in genelist: genelist.append(self.dict['GeneCard'][gene]['Symbol'])
    delimit = rje.delimitFromExt(filename=self.info['CardOut'])
    CARDOUT = open(self.info['CardOut'],'a')
    ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    (noens,noloci,ox) = (0,0,0)     # Genes without EnsEMBL; without EnsLoci; output count
    for gene in rje.sortKeys(self.dict['GeneCard']):
        if self.opt['Restrict'] and gene not in genelist: continue
        elif self.opt['Purify'] and self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']: continue    # Skip aliases
        self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox)); ox += 1
        self.dict['GeneCard'][gene]['Alias'] = gene
        self.dict['GeneCard'][gene]['Species'] = self.info['Species']
        rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,self.dict['GeneCard'][gene])
        if self.dict['GeneCard'][gene]['Symbol'] == gene:   # Not an alias
            if 'EnsEMBL' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsEMBL']: noens += 1
            if 'EnsLoci' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsLoci']: noloci += 1
    CARDOUT.close()
    self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (len(self.list['Genes']),self.info['CardOut']))
    self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
def run(self,imenu=False,outputmap=True,returndict=False):  ### Main controlling run Method
    '''
    Main controlling run Method. BLASTs SeqIn against MapDB and maps each query to a hit sequence.
    >> imenu:boolean = Whether to initiate interactive menu if appropriate [False].
    >> outputmap:boolean = Whether to output mapping into a file [True]
    >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False]
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.setup(imenu): raise ValueError
        seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file'])
        if not seqlist.seqNum(): self.warnLog('No sequences loaded for mapping.'); return {}
        ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1'])
        blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'),
                      'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))})
        blast.setStat({'HitAln':blast.getStat('OneLine')})
        blast.list['ResTab'] = ['Search','Hit','GABLAM']
        if seqlist.nt(): blast.str['Type'] = 'blastx'   # Nucleotide queries vs protein database
        ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if outputmap: self._setupOutput()   ## Output Files ##
        if returndict: mapdict = {}
        else: self._setupMapped()           ## Previously Mapped Sequences ##
        seqx = seqlist.seqNum()             ## Number of sequences ##
        ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False)
        ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        blast.run(format=True)
        self.obj['DB'] = blast.obj['DB']
        ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.debug(self.getStr('MapDB'))
        self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']])
        self.obj['MapDB'].loadSeq(self.getStr('MapDB'))
        self.debug('%s' % self.obj['MapDB'].list['Seq'])
        sx = 0
        while seqlist.nextSeq() != None:
            search = seqlist.getSeq(format='short')
            sx += 1
            ## Check StartFrom ##
            if self.str['StartFrom']:   # Resuming: skip sequences until StartFrom is reached
                if self.str['StartFrom'] != search:
                    self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx))
                    continue
                # Bug fix: log BEFORE clearing StartFrom (previously cleared first, so the log printed '')
                self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (self.str['StartFrom'],sx))
                self.str['StartFrom'] = ''
            ## Check if in Mapped ##
            if search in self.list['Mapped']:   # Already mapped by a previous (appended) run
                resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'}
                self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas']))
                rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
                continue
            ### Map Sequence ###
            self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx)))
            mapname = self.mapSeq(seqlist,blast,search)
            if returndict: mapdict[search] = mapname
        ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#MAP','Mapping of %s (%s seqs) complete.' % (self.str['SeqIn'],rje.integerString(seqx)))
        if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()): os.unlink(blast.str['Name'])    #!# Add option to keep BLAST! #!#
        if returndict: return mapdict
    except: self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True); raise
def run(self,batch=False):  ### Main run method
    '''
    Main run method. Handles batch-file and "all-by-all" special modes by recursive calls, then performs the
    Price calculations (fitness/phenotype/grouping) for a single dataset and outputs a results row.
    >> batch:bool [False] = True when called recursively for a single batch file / pairwise combination.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not batch: self.setupResults()
        ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not batch and not self.obj['SeqList'].seqs():    ### Look for batch files and run for each
            batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0)
            self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles)))
            if not batchfiles: self.errorLog('No input files found!',printerror=False)
            else:
                bx = 0
                for infile in batchfiles:
                    bx += 1
                    self.printLog('#BATCH','Batch running %s' % infile)
                    bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile]
                    self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd)
                    self.run(batch=True)        # Recursive call for this input file
                    self.opt['Append'] = True   # Subsequent runs append to the same results file
                    self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False)
            if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1)     # Optional pause for win32
            return
        ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.info['Special'].lower() == 'allbyall':      # Pairwise run of every sequence vs every other
            self.printLog('#RUN','Performing special "all-by-all" pairwise run')
            self.info['Special'] = ''   # Clear to stop the recursive calls re-entering this branch
            for i in range(len(self.seqs())-1):
                self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i]
                for j in range(i+1,len(self.seqs())):
                    self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1)
                    self.run(batch=True)
                    self.opt['Append'] = True
            self.info['Special'] = 'allbyall'; return
        ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.setup()
        ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.fitness()
        self.phenotype()
        self.grouping()
        for vector in ['Fitness','Phenotype','SeqGroup']:   # Sanity check: vectors must match query length
            if len(self.list[vector]) != self.qry().seqLen():
                # Bug fix: format string had four %s placeholders but only three arguments were supplied,
                # which raised TypeError inside the error path. Query name added as the missing argument.
                self.errorLog('%s vector length (%s) does not match %s sequence length (%s)' % (vector,len(self.list[vector]),self.qry().shortName(),self.qry().seqLen()),printerror=False)
                raise ValueError
        results = self.price()
        ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True)
        results['Query'] = self.qry().shortName()
        results['Fitness'] = self.info['Fmethod']
        results['Phenotype'] = self.info['Pmethod']
        results['SeqGroup'] = self.info['SeqGroup']
        rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results)
        self.printLog('#OUT','Results output to %s' % self.info['ResFile'])
    except: self.errorLog(rje_zen.Zen().wisdom()); raise    # Delete this if method error not terrible
def setup(self):    ### Main class setup method. Makes sumfile if necessary.
    '''
    Main class setup method. Makes sumfile if necessary.

    Reads each MASCOT results file into a Budapest object, writes per-file protein accession lists, maps
    gi-number accessions to loaded sequences, and generates the combined summary table (SumFile). Returns
    False on failure; returns early (via printLog) if SumFile exists, Force=F and the user accepts it.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
        if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
        if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
        self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
        if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
            # Interactive: offer to reuse the existing summary rather than reprocess MASCOT
            if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
        mapgi = False
        ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for mfile in self.list['ResFiles']:
            bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
            bud.info['Name'] = mfile
            bud.readMascot()
            self.dict['Searches'][mfile] = bud.dict['Hits']
            protacclist = rje.sortKeys(bud.dict['Hits'])
            for protacc in protacclist:     # Flag whether any accession numbers are NCBI gi numbers
                if rje.matchExp('gi\|(\d+)',protacc): mapgi = True
            accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
            self.debug(accfile)
            open(accfile,'w').write(string.join(protacclist,'\n'))
            self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
        ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #if mapgi:
        #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
        #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
        ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
        self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
        ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sumhead = string.split('search,prot_hit_num,prot_acc,prot_desc,pep_seq',',')
        rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
        for mfile in rje.sortKeys(self.dict['Searches']):
            bud = self.dict['Searches'][mfile]      # NOTE: here bud is the Hits dictionary, not a Budapest object
            for protacc in rje.sortKeys(bud)[0:]:
                protname = bud[protacc]['prot_acc']
                protdesc = bud[protacc]['prot_desc']
                if rje.matchExp('gi\|(\d+)',protacc):   # Replace gi accessions with loaded sequence names
                    gi = rje.matchExp('gi\|(\d+)',protacc)[0]
                    try:
                        protname = self.dict['Acc2Seq'][gi].shortName()
                        protdesc = self.dict['Acc2Seq'][gi].info['Description']
                    except: protname = 'gi_UNK__%s' % gi    # gi not found in loaded sequences
                #x#print protname, protdesc, bud[protacc]
                for pep in bud[protacc]['Peptides']:
                    data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                            'pep_seq':pep,'prot_hit_num':bud[protacc]['prot_hit_num']}
                    rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
    except: self.errorLog('Problem during %s setup.' % self); return False    # Setup failed
def run(self):  ### Main run method
    '''
    Main run method. Interactively records "pump" count events per worm, then outputs a delimited file of
    counts with relative (WormTime) and wall-clock (AbsTime) times.
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        counter = ['>>']    # Alternating entries: worm ID or '' (count event), each followed by a time stamp
        menulist = [('F','Change output file name','outfile','OutFile'),('X','Exit','return',''),('R','Run','return','')]
        mchoice = rje_menu.menu(self,'WormPump Menu',menulist,choicetext='Please select:',changecase=True,default='R')
        if mchoice == 'X': return
        self.printLog('#OUT','Output will be to %s' % self.info['OutFile'])
        self.printLog('#START','Initialising counter...')
        ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        wormid = None
        while counter[-1] != 'X':
            if wormid: counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count' % wormid,default='').upper())
            else: counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit',default='').upper())
            if counter[-1]: wormid = counter[-1]
            if wormid == 'X': break     # Exit requested: no time stamp appended for the 'X' entry
            self.printLog('#WORM','Worm "%s"' % wormid)
            counter.append(time.time())     # Time stamp paired with this entry (ID or count event)
            self.deBug(counter)
        ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        head = ['Worm','Count','WormTime','AbsTime']
        rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,rje_backup=True)
        wormstart = 0.0     # Start time of the current worm
        wormid = None
        wtot = 0            # Total worms processed
        while counter:
            x = counter.pop(0)
            if x in ['>>','X']: continue    # Skip initial marker and exit entry
            if x:   # New worm ID
                wormid = x
                # Bug fix: pop the ID's paired time stamp. Previously only counter[0] was read, leaving the
                # truthy float in the list to be misinterpreted as the next worm ID on the following pass.
                wormstart = counter.pop(0)
                wx = 0
                wtot += 1
            else:   # Count event for the current worm
                if not wormid: continue     # Count entered before any worm ID: ignore
                wx += 1
                t = counter.pop(0)
                tt = time.localtime(t)
                wdata = {'Worm':wormid,'Count':wx,'WormTime':t-wormstart,
                         #'AbsTime':'%s/%s/%s %s:%s:%s' % (tt[2],tt[1],tt[0],rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
                         'AbsTime':'%s:%s:%s' % (rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
                rje.delimitedFileOutput(self,self.info['OutFile'],headers=head,datadict=wdata)
        self.printLog('#OUT','Counts for %d worms output to %s' % (wtot,self.info['OutFile']))
        rje.choice('<ENTER> to exit')
    except: self.errorLog(rje_zen.Zen().wisdom()); raise    # Delete this if method error not terrible
def saveMutations(self):    ### Outputs parsed mutations into a delimited file
    '''
    Outputs parsed mutations into a delimited file (omim_mutations.tdt): one row per gene/SubID mutation with
    its OMIM record ID(s), position, wild-type and mutant residues and associated disease.
    '''
    try:### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease']
        outfile = 'omim_mutations.tdt'
        rje.delimitedFileOutput(self,outfile,headers,'\t',rje_backup=True)
        ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for gene in rje.sortKeys(self.dict['Mutations']):
            for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                (disease,mutation) = self.dict['Mutations'][gene][subid]
                # Mutation strings have the form <three-letter wild AA><position><three-letter mutant AA>
                (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                datadict = {'OMIM_ID':string.join(self.dict['Records'][gene],'; '),'SubID':subid,'Gene':gene,
                            'Pos':pos,'WildAA':wild,'MutAA':mut,'Disease':disease}
                rje.delimitedFileOutput(self,outfile,headers,'\t',datadict)
        self.log.printLog('#OUT','OMIM Mutation output to %s complete' % outfile)
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def saveTimePoints(self,filename='',format='tdt',entries=None):     ### Saves TimePoints to a file
    '''
    Saves TimePoints to a file from main TimePoints table.
    >> filename:str [''] = Output filename. Will use basefile if none given.
    >> format:str ['tdt'] = Output file format (csv/tsv/txt/db)
    >> entries:list [None] = Entries from main table to output. (All if none given).
    << Returns False on error (via except); otherwise None.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db('TimePoints')
        if format.lower() in ['','none']: format = string.split(filename.lower(),'.')[-1]   # Infer from extension
        if not filename: filename = '%s.%s' % (self.basefile(),format)
        # Bug fix: default changed from a mutable list literal ([]) to None; behaviour is unchanged since
        # an empty/None value is replaced by all table entries anyway.
        if not entries: entries = db.entries()
        ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if format in ['csv','tdt']:
            self.blanksToEmpty()
            rje.delimitedFileOutput(self,filename,db.fields(),rje_backup=True)
            for entry in entries: rje.delimitedFileOutput(self,filename,db.fields(),datadict=entry)
        ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            self.emptyToBlank()
            rje.backup(self,filename)
            OUT = open(filename,'a')
            for entry in entries:
                if format == 'db':      # SQL-style value tuples: "(v1, v2, ...);"
                    outlist = []
                    for field in db.fields(): outlist.append(entry[field])
                    out_txt = '%s' % outlist
                    OUT.write('(%s);\n' % out_txt[1:-1])    # Strip the list's [ ] brackets
                else:
                    # Example: American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                    out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                    if entry['month'] in ['','blank']: out_text += '%s %s.' % (entry['Year'],entry['yearUnit'])
                    else: out_text += '%s %s, %s %s.' % (entry['Year'],entry['yearUnit'],entry['month'],entry['day'])
                    # NOTE(review): 'Source URL' fills both the <URL> and [name] slots; the example above
                    # suggests the bracketed slot should be a source *name* field - confirm intended field.
                    out_text = '%s %s Source: <%s>[%s].' % (out_text,entry['TimePoint Description'],entry['Source URL'],entry['Source URL'])
                    klist = []
                    for i in range(1,6):    # Up to five keyword fields per entry
                        if entry['keyword%d' % i] not in ['','blank']: klist.append(entry['keyword%d' % i])
                    out_text = '%s (Keywords: %s)' % (out_text,string.join(klist,', '))
                    OUT.write('%s\n' % out_text)
            OUT.close()     # Bug fix: file handle was previously never closed (resource leak / unflushed data)
        self.printLog('#OUT','%d entries output to %s' % (len(entries),filename))
    except: self.errorLog('%s.saveTimePoints(%s) error' % (self,filename)); return False
def hmmTable(self,outfile='',append=False,delimit=None):    ### Outputs results table
    '''
    Outputs results table.
    >> outfile:str = Name of output file
    >> append:boolean = whether to append file
    >> delimit:str = Delimiter to use [\t]
    << Returns False if HMMTab is "None"; raises on fatal error.
    '''
    try:
        ### Setup ###
        if not outfile: outfile = self.info['HMMTab']
        if outfile.lower() == 'none':
            self.log.printLog('#TAB','HMMTab = "None": No table output')
            return False
        if not delimit: delimit = rje.getDelimit(self.cmd_list,'\t')
        if not outfile: outfile = '%s.hmmer.%s' % (rje.baseFile(self.info['SearchDB'],True),rje.delimitExt(delimit))
        self.readResults()
        self.log.printLog('#TAB','Tabulating results for %s searches into %s' % (len(self.search),outfile),log=False)
        ### Setup Resfile ###
        # Two alternative header sets; MySQL mode uses HMM/Hit-prefixed column names
        if self.opt['MySQL']: headers = ['HMM','Hit','Hit_Start','Hit_End','Eval','Score']
        else: headers = ['Type','Name','Start','End','Eval','Score']
        if not append or not os.path.exists(outfile): rje.delimitedFileOutput(self,outfile,headers,delimit,rje_backup=True)
        ### Output Search details ###
        for search in self.search:
            for hit in search.hit:
                for aln in hit.aln:
                    # out carries keys for BOTH header sets; delimitedFileOutput writes only those in headers
                    out = {'HMM':search.info['Name'],'Type':search.info['Name'],
                           'Name':hit.info['Name'],'Hit':hit.info['Name'],
                           'Start':'%d' % aln.stat['SbjStart'],'End':'%d' % aln.stat['SbjEnd'],
                           'Hit_Start':'%d' % aln.stat['SbjStart'],'Hit_End':'%d' % aln.stat['SbjEnd'],
                           'Eval':'%.2e' % aln.stat['Expect'],'Score':'%.1f' % aln.stat['BitScore']}
                    rje.delimitedFileOutput(self,outfile,headers,delimit,out)
        self.log.printLog('#OUT','Results for %s searches output to %s.' % (len(self.search),outfile))
    except:
        self.log.errorLog('Fatal Error during hmmTable(%s).' % outfile)
        raise
def domainFasta(self):  ### Outputs parsed domain and domain PPI datasets in Fasta format
    '''
    Outputs parsed PPI datasets in Fasta format.

    First tabulates domain->HPRD protein pairs (HPRD.domains.tdt) and domain->source pairs
    (HPRD.domsource.tdt); then, for each domain, writes a fasta file of all PPI partners of the
    domain-containing proteins into HPRD_Domain_Datasets/.
    '''
    try:
        ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Domain','HPRD','Gene']
        dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
        rje.delimitedFileOutput(self,dfile,headers,'\t')
        sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
        shead = ['Domain','Source']
        rje.delimitedFileOutput(self,sfile,shead,'\t')
        dx = 0.0
        for domain in rje.sortKeys(self.dict['Domains']):
            self.log.printLog('\r#DOM','HPRD Domain output (%s): %.1f%%' % (dfile,dx/len(self.dict['Domains'])),newline=False,log=False)
            dx += 100.0
            for hid in self.dict['Domains'][domain]:    # One row per protein carrying this domain
                datadict = {'Domain':domain,'HPRD':hid,'Gene':self.dict['HPRD'][hid]['gene']}
                rje.delimitedFileOutput(self,dfile,headers,'\t',datadict)
            for source in self.dict['DomainSource'][domain]:    # One row per annotation source for this domain
                datadict = {'Domain':domain,'Source':source}
                rje.delimitedFileOutput(self,sfile,shead,'\t',datadict)
        self.log.printLog('\r#DOM','HPRD Domain output (%s): %s domains.' % (dfile,rje.integerString(len(self.dict['Domains']))))
        ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
        rje.mkDir(self,datpath)
        for domain in rje.sortKeys(self.dict['Domains']):
            ## Generate a list of all interactors with domain-containing proteins ##
            plist = []
            for p1 in self.dict['Domains'][domain]:
                if p1 not in self.dict['PPI']: continue
                for p2 in self.dict['PPI'][p1]:
                    if p2 not in plist: plist.append(p2)
            plist.sort()
            ## Generate Sequence list and output ##
            mylist = []
            for p in plist:
                # AllIso: 'Seq' holds a list of isoform sequences; otherwise a single sequence object
                if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']
                else: mylist.append(self.dict['HPRD'][p]['Seq'])
            sfile = '%s%s_hprd.fas' % (datpath,domain)
            if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
            else: self.log.printLog('#DOM','No PPI partners for domain "%s"' % domain)
        self.log.printLog('\r#DOM','HPRD Domain fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.saveFasta()',printerror=True,quitchoice=False)
        raise
def scap(self):     ### Full SCAP method
    '''
    Full SCAP method. Scores each query sequence (and each background sequence, if a separate background
    set is loaded) with self.scapSeq for every Xmer length from MinXmer to MaxXmer and writes one row per
    sequence to <Basefile>.<ext>.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        markov = self.obj['Markov']
        minx = markov.stat['MinXmer']
        maxx = markov.stat['MaxXmer']
        headers = ['seq','type','sorted']
        for x in range(minx,maxx+1): headers.append('X%d' % x)      # One score column per Xmer length
        delimit = rje.getDelimit(self.cmd_list,'\t')
        scapfile = '%s.%s' % (self.info['Basefile'],rje.delimitExt(delimit))
        rje.delimitedFileOutput(self,scapfile,headers,delimit,rje_backup=True)
        ### ~ [2] SCAP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [2a] Query ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (sx,stot) = (0.0,self.obj['SeqList'].seqNum())
        for seq in self.obj['SeqList'].seq:
            self.progLog('\r#SCAP','SCAP processing Query to %s: %.2f%%' % (scapfile,(sx/stot))); sx += 100.0
            datadict = {'seq':seq.shortName(),'type':'qry','sorted':markov.opt['Sorted']}
            for x in range(minx,maxx+1):
                datadict['X%d' % x] = self.scapSeq(seq.info['Sequence'],x)
                # Fixed-point for scores > 0.001; scientific notation for smaller values
                if datadict['X%d' % x] > 0.001: datadict['X%d' % x] = '%.4f' % datadict['X%d' % x]
                else: datadict['X%d' % x] = '%.3e' % datadict['X%d' % x]
            rje.delimitedFileOutput(self,scapfile,headers,delimit,datadict)
        self.printLog('\r#SCAP','SCAP processed Query to %s for %s sequences.' % (scapfile,rje.integerString(stot)))
        ## ~ [2b] Background ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.obj['ScapBack'] != self.obj['SeqList']:     # Skip if background is the same object as the query set
            (sx,stot) = (0.0,self.obj['ScapBack'].seqNum())
            for seq in self.obj['ScapBack'].seq:
                self.progLog('\r#SCAP','SCAP processing Background to %s: %.2f%%' % (scapfile,(sx/stot))); sx += 100.0
                datadict = {'seq':seq.shortName(),'type':'bg','sorted':markov.opt['Sorted']}
                for x in range(minx,maxx+1):
                    datadict['X%d' % x] = self.scapSeq(seq.info['Sequence'],x)
                    if datadict['X%d' % x] > 0.001: datadict['X%d' % x] = '%.4f' % datadict['X%d' % x]
                    else: datadict['X%d' % x] = '%.3e' % datadict['X%d' % x]
                rje.delimitedFileOutput(self,scapfile,headers,delimit,datadict)
            self.printLog('\r#SCAP','SCAP processed Background to %s for %s sequences.' % (scapfile,rje.integerString(stot)))
        if markov.opt['Sorted']: self.printLog('#SCAP','Sorted SCAP run complete')
        else: self.printLog('#SCAP','UnSorted SCAP run complete')
    except: self.errorLog(rje_zen.Zen().wisdom())
def mapSeq(self,seqlist,blast,search,outputmap=True):    ### Performs actual mapping of sequence
    '''
    Performs actual mapping of sequence.
    >> seq:SeqList object containing Sequence Object to be mapped
    >> blast:BLAST_Run object to perform BLAST and GABLAM
    >> search:Current BLAST search object for mapping
    >> outputmap:boolean = Whether to output mapping into a file [True]
    << returns shortName() of mapped sequence (or None if none)
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seq = seqlist.getSeq(format='tuple')    # (name, sequence) tuple for current query
        mapseq = self.obj['MapDB']
        hits = blast.db('Hit').indexEntries('Query',search)
        self.printLog('#HITS','%s vs %s = %d hits' % (search,blast.str['DBase'],len(hits)))
        hitseq = {}; hitdata = {}   # Hit name -> (name,seq) tuple / BLAST Hit table entry
        for entry in hits:
            hitseq[entry['Hit']] = mapseq.getDictSeq(entry['Hit'],format='tuple')
            hitdata[entry['Hit']] = entry
        # resdict accumulates the row written to MapRes at the end; 'Method' stays
        # 'Failed' unless a mapping method succeeds (or 'Rejected' if GABLAM rejects hits).
        resdict = {'Query':search,'Hit':None,'Method':'Failed','Query_Species':rje_sequence.specCodeFromName(seq[0])}
        ### ~ [1] Order Hits and Check Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (hits,hitdict) = self.orderHits(seq,hits,hitseq)
        self.debug(hits)
        self.debug(hitdict)
        ### ~ [2] Attempt mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Try each configured mapping method in order; stop at the first success.
        for method in self.list['Mapping']:
            resdict['Hit'] = self.mapHit(seq,hits,hitdict,method.lower())
            if resdict['Hit']:
                resdict['Method'] = method[:1].upper() + method[1:].lower()
                break
            elif method == 'gablam' and (len(hits) > 0): resdict['Method'] = 'Rejected'
        self.debug(resdict)
        ### ~ [3] Output! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if resdict['Hit']:
            hit = resdict['Hit']['Hit']     # resdict['Hit'] is the BLAST table entry for Hit
            shortname = hitdict[hit]['Data']['ShortName']   # This is just hit!
            self.printLog('#MAP','%s mapped to %s (by %s)' % (string.split(seq[0])[0],shortname,resdict['Method']))
            ## Update Stats ##
            self.debug('')
            resdict['BlastRank'] = hitdata[hit]['Rank']
            for key in hitdict[hit]: resdict[key] = hitdict[hit][key]
            ## Fasta and Redundancy ##
            if shortname in self.list['Mapped']:
                self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
            else:
                self.list['Mapped'].append(shortname)
                if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (hitseq[hit][0],hitseq[hit][1]))
            resdict['Hit_Species'] = hitdict[hit]['Data']['SpecCode']
            resdict['Hit'] = shortname
        else:
            ### ~ [2] GREP-based search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Fallback: exact full-sequence grep of the database fasta file. '-B 1' pulls
            # the preceding '>' header line for each matching sequence line.
            if 'grep' in self.list['Mapping']:
                greplist = []; hitseq = ''  # NOTE: rebinds hitseq from dict to str here
                self.printLog('#GREP','grep %s %s -B 1' % (seq[1],blast.str['DBase']),log=False)
                for line in os.popen('grep %s %s -B 1' % (seq[1],blast.str['DBase'])).readlines():
                    if line[:1] == '>': greplist.append(string.split(line[1:])[0])
                    elif not hitseq: hitseq = rje.chomp(line)
                if greplist:
                    shortname = greplist.pop(0)     # First grep hit becomes the mapping
                    resdict['Hit'] = shortname
                    resdict['Method'] = 'Grep'
                    resdict['Qry_ID'] = '100.0'
                    resdict['Qry_Len'] = len(seq[1])
                    resdict['Hit_Len'] = len(hitseq)
                    resdict['Hit_ID'] = 100.0 * len(hitseq) / len(seq[1])
                    # Species code assumed to follow '_' in the hit name (gene_SPEC format).
                    try: resdict['Hit_Species'] = string.split(shortname,'_')[1]
                    except: pass
                    if shortname in self.list['Mapped']:
                        self.printLog('#MAP','%s already mapped before - not duplicating in %s' % (shortname,self.str['MapFas']))
                    else:
                        self.list['Mapped'].append(shortname)
                        if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (shortname,hitseq))
                # Any further grep hits are reported but not used.
                for extra in greplist: self.printLog('#GREP','Warning! Query "%s" also hit "%s" with grep!' % (string.split(seq[0])[0],extra))
            if not resdict['Hit'] and self.bool['Combine']:
                ## Fasta and Redundancy ##
                # Combine mode: unmapped queries are carried through under their own name.
                shortname = string.split(seq[0])[0]
                if shortname in self.list['Mapped']:
                    self.printLog('#FAS','%s already in output - not duplicating in %s' % (shortname,self.str['MapFas']))
                else:
                    self.list['Mapped'].append(shortname)
                    if outputmap: open(self.str['MapFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
            elif outputmap:
                open(self.str['MissFas'],'a').write('>%s\n%s\n' % (seq[0],seq[1]))
                self.printLog('#MISS','%s mapping %s' % (resdict['Query'],resdict['Method']))
        if outputmap: rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
        return resdict['Hit']
    except:
        self.errorLog('Fudgesticks! SeqMapper.mapSeq(%s) has died!' % seq[0],quitchoice=True)
        return False
def saveTimePoints(self, filename='', format='tdt', entries=[]):    ### Saves TimePoints to a file
    '''
    Saves TimePoints to a file from main TimePoints table.
    >> filename:str [''] = Output filename. Will use basefile if none given.
    >> format:str ['tdt'] = Output file format (csv/tsv/txt/db)
    >> entries:list [] = Entries from main table to output. (All if none given).
       (Default list is never mutated - only read/rebound - so the mutable default is safe.)
    << returns None on success, False on error.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db('TimePoints')
        if format.lower() in ['', 'none']: format = string.split(filename.lower(), '.')[-1]
        if not filename: filename = '%s.%s' % (self.basefile(), format)
        if not entries: entries = db.entries()
        ### ~ [2] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [2a] Simple delimited file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if format in ['csv', 'tdt']:
            self.blanksToEmpty()
            rje.delimitedFileOutput(self, filename, db.fields(), rje_backup=True)
            for entry in entries:
                rje.delimitedFileOutput(self, filename, db.fields(), datadict=entry)
        ## ~ [2b] Text file output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            self.emptyToBlank()
            rje.backup(self, filename)
            OUT = open(filename, 'a')
            try:    # FIX: file handle was previously never closed
                for entry in entries:
                    if format == 'db':
                        # One "(v1, v2, ...);" record per entry, in field order.
                        outlist = []
                        for field in db.fields(): outlist.append(entry[field])
                        out_txt = '%s' % outlist
                        OUT.write('(%s);\n' % out_txt[1:-1])
                    else:
                        # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                        out_text = '%s. (TimePoint) ' % entry['TimePoint Name']
                        if entry['month'] in ['', 'blank']:
                            out_text += '%s %s.' % (entry['Year'], entry['yearUnit'])
                        else:
                            out_text += '%s %s, %s %s.' % (entry['Year'], entry['yearUnit'], entry['month'], entry['day'])
                        # NOTE(review): 'Source URL' is used for BOTH the <url> and the
                        # [name] slot; the example above suggests a source *name* field was
                        # intended for the brackets - TODO confirm against the table schema.
                        out_text = '%s %s Source: <%s>[%s].' % (out_text, entry['TimePoint Description'], entry['Source URL'], entry['Source URL'])
                        klist = []
                        for i in range(1, 6):
                            if entry['keyword%d' % i] not in ['', 'blank']: klist.append(entry['keyword%d' % i])
                        out_text = '%s (Keywords: %s)' % (out_text, string.join(klist, ', '))
                        OUT.write('%s\n' % out_text)
            finally:
                OUT.close()
        self.printLog('#OUT', '%d entries output to %s' % (len(entries), filename))
    except:
        self.errorLog('%s.saveTimePoints(%s) error' % (self, filename))
        return False
def clusterGoodSeq(self,searchset,data):    ### Clusters good sequences returned by search and updates data dictionary
    '''
    Clusters good sequences returned by search and updates data dictionary.
    >> searchset:str = label for this search set (used in file names and log messages)
    >> data:dict = per-protein dictionary; entries with 'class' == 'REJECT' are skipped.
       Each kept entry supplies 'accnum' (key into self.dict['Acc2Seq']) and 'conpep'
       (list of consensus peptides). A 'cluster' number is written back into each entry.
    Side effects: writes <Basefile>.<searchset>.tmpdb fasta and
    <Basefile>.<searchset>.peptides.tdt; populates self.dict['PepSeq'] and
    self.dict['PepTypes'].
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Extract Non-rejected sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list+['autoload=F'])
        #self.deBug(rje.sortKeys(self.dict['Acc2Seq']))
        for prot in rje.sortKeys(data):
            if data[prot]['class'] != 'REJECT': seqlist.seq.append(self.dict['Acc2Seq'][data[prot]['accnum']])
        if not seqlist.seqNum(): return self.printLog('#NULL','No %s sequences remain for clustering' % searchset)
        seqfile = '%s.%s.tmpdb' % (self.info['Basefile'],searchset)
        seqlist.saveFasta(seqfile=seqfile)
        seqdict = seqlist.seqNameDic()
        ### ~ [2] Perform BLAST and generate hit matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # First try the new-style BLAST API; on any failure, fall back to the old-style
        # object and rebuild the clusters manually from the parsed search/hit objects.
        try:
            blast = rje_blast.blastObj(self.log,['blastf=T','blaste=1e-4']+self.cmd_list+['dna=F'],type='New')
            clusters = blast.blastClusters(seqfile,seqdict=seqdict,keepblast=False)
        except:
            self.errorLog('Problem with new BLAST clustering')
            blast = rje_blast.blastObj(self.log,['blastf=T','blaste=1e-4']+self.cmd_list+['dna=F'],type='Old')
            blast.setInfo({'InFile':seqfile,'DBase':seqfile,'Name':'%s.tmp.blast' % self.info['Basefile'],'Type':'blastp'})
            blast.setStat({'OneLine':seqlist.seqNum(),'HitAln':0})
            blast.formatDB(fasfile=seqfile,force=True,protein=True)
            blast.blast(cleandb=False,use_existing=False,log=True)
            blast.readBLAST(gablam=False,unlink=True,log=True)
            rje_blast.cleanupDB(self,seqfile,deletesource=True)
            ## ~ [2a] Cluster by BLAST hits ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            cluster = {}    # Dictionary of {seq:hit seqs} for clustering
            for search in blast.search:
                seq = seqdict[search.info['Name']]
                cluster[seq] = []
                for hit in search.hit:
                    cluster[seq].append(seqdict[hit.info['Name']])
            #self.deBug(cluster)
            ## ~ [2b] Combine clusters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Single-linkage: follow hits transitively, popping each seq from the
            # cluster map as it is absorbed so no seq appears in two clusters.
            clusters = []   # List of [seqs] in clusters
            for seq in seqlist.seqs():
                if seq not in cluster: continue
                newcluster = [seq]
                hits = cluster.pop(seq)
                while hits:
                    hit = hits.pop(0)
                    if hit not in newcluster: newcluster.append(hit)
                    if hit in cluster: hits += cluster.pop(hit)
                clusters.append(newcluster)
        self.printLog('#CLUSTER','%d clusters of %s proteins hits' % (len(clusters),searchset))
        #self.deBug(clusters)
        ### ~ [3] Assign peptides to consensi as "Common", "Cluster" or "Unique" ~~~~~~~ ###
        ## ~ [3a] Match peptides to sequence lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        pepcons = {}    # {peptide:[seqs containing it]}
        for seq in seqlist.seqs():
            prot = seq.shortName() #.info['AccNum']
            for pep in data[prot]['conpep']:
                if pep not in pepcons: pepcons[pep] = []
                pepcons[pep].append(seq)
        self.dict['PepSeq'] = pepcons
        ## ~ [3b] Classify peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Unique = one consensus; Cluster = several consensi, all in one cluster;
        # Common = spans multiple clusters.
        self.dict['PepTypes'] = {'Common':[],'Cluster':[],'Unique':[]}
        for pep in pepcons:
            if len(pepcons[pep]) == 1: self.dict['PepTypes']['Unique'].append(pep); continue
            pepclus = []
            for seq in pepcons[pep]:
                for cluster in clusters:
                    if seq in cluster and cluster not in pepclus: pepclus.append(cluster)
            if len(pepclus) == 1: self.dict['PepTypes']['Cluster'].append(pep)
            else: self.dict['PepTypes']['Common'].append(pep)
        ## ~ [3c] Summarise Peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#PEP','%d different %s Peptide sequences' % (len(pepcons),searchset))
        for ptype in ['Common','Cluster','Unique']: self.dict['PepTypes'][ptype].sort()
        self.printLog('#UNIQ','%d Unique to one consensus' % (len(self.dict['PepTypes']['Unique'])))
        self.printLog('#CLUS','%d Resticted to one cluster' % (len(self.dict['PepTypes']['Cluster'])))
        self.printLog('#COMM','%d Common to multiple clusters' % (len(self.dict['PepTypes']['Common'])))
        ### ~ [4] Update dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        cx = 0
        for cluster in clusters:
            cx += 1
            for seq in cluster:
                prot = seq.shortName() #info['AccNum']
                data[prot]['cluster'] = cx
        ### ~ [5] Peptide Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        peptdt = '%s.%s.peptides.tdt' % (self.info['Basefile'],searchset)
        pephead = ['Peptide','Classification','Hits']
        rje.delimitedFileOutput(self,peptdt,pephead,rje_backup=True)
        # NOTE: 'data' is rebound below, shadowing the input parameter; safe only
        # because the parameter is no longer needed after section [4].
        for ptype in ['Common','Cluster','Unique']:
            for pep in self.dict['PepTypes'][ptype]:
                data = {'Peptide':pep,'Classification':ptype,'Hits':seqlist.accList(self.dict['PepSeq'][pep])}
                data['Hits'].sort()
                data['Hits'] = string.join(data['Hits'],'|')
                rje.delimitedFileOutput(self,peptdt,pephead,datadict=data)
        self.printLog('#PEP','Peptide details output to %s' % peptdt)
    except: self.errorLog(rje_zen.Zen().wisdom())
def run(self):  ### Main Run Method
    '''
    Main Run Method. Parses/loads OMIM mutation data, maps genes to EnsLoci proteins via
    Pingu/GeneCards, cross-references mutation positions against SLiMFinder occurrence
    files, and reports SLiM coverage plus a binomial probability of the observed overlap.
    FIX: coverage mask construction used res[end-1:], duplicating the residue at index
    end-1 and growing the list by one per occurrence (inflating coverage); now res[end:].
    '''
    try:
        ### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()
        ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
        ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        import pingu
        pcmd = self.cmd_list + ['fulloutput=F']
        ping = self.obj['Pingu'] = pingu.PINGU(self.log,pcmd)
        ping.run()
        ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not ping.obj['GeneCards']: return self.log.errorLog('Cannot map EnsLoci without GeneCards.',printerror=False)
        genecards = ping.obj['GeneCards'].dict['GeneCard']  # GeneCards dictionary
        ensloci = ping.getEnsLoci()     # EnsLoci SeqList object (ping.obj['EnsLoci'])
        seqdict = ensloci.seqNameDic()
        if not seqdict: return self.log.errorLog('Failed to read in EnsLoci sequences.',printerror=False)
        ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['Fudge'] = {}
        ensback = {}        # Dictionary of {EnsLoci name:OMIM gene}
        mutations = {}      # Reorganised dictionary of {gene:{pos:Mutation}}
        for gene in rje.sortKeys(self.dict['Mutations']):
            try: seq = seqdict[genecards[gene]['EnsLoci']]
            except:
                self.log.printLog('#MAP','No EnsLoci protein mapped for %s' % gene)
                continue
            mutations[gene] = {}
            ensback[genecards[gene]['EnsLoci']] = gene
            mutpos = {}     # Dictionary of {pos:AA} to map onto sequence
            for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                (disease,mutation) = self.dict['Mutations'][gene][subid]
                # Mutations are in Xaa123Yaa three-letter format.
                (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                mutations[gene][int(pos)] = self.dict['Mutations'][gene][subid]
            self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
        self.deBug(self.dict['Fudge'])
        ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        allslims = {}   # Full dictionary of SLiMFinder results matching OMIM genes
        slimomim = []   # List of (gene,pos) overlapping with SLiMs
        outfile = 'rje_omim.slimfinder.tdt'
        dataheaders = string.split('Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',',')
        headers = ['Gene','OMIM','SubID','Mutation','Disease'] + dataheaders
        rje.delimitedFileOutput(self,outfile,headers,delimit='\t',rje_backup=True)
        for file in glob.glob(self.info['SlimDir'] + '*.occ.csv'):  # Potential SLiM
            slimdata = rje.dataDict(self,file,['Pattern','Hit','Pos','Match'],dataheaders,delimit=',')
            for occ in slimdata:
                if slimdata[occ]['Hit'] in ensback:     # OMIM gene - possible overlap
                    gene = ensback[slimdata[occ]['Hit']]
                    (start,end) = (int(slimdata[occ]['Pos']),int(slimdata[occ]['EndPos']))
                    if gene not in allslims: allslims[gene] = {}
                    allslims[gene][occ] = slimdata[occ]
                    for mpos in mutations[gene]:
                        # Fudge shifts OMIM numbering onto EnsLoci coordinates.
                        if start <= (mpos + self.dict['Fudge'][seqdict[genecards[gene]['EnsLoci']]]) <= end:
                            self.log.printLog('#OMIMSLIM','%s %s %s (%d-%d) = %s' % (slimdata[occ]['Dataset'],slimdata[occ]['Hit'],slimdata[occ]['Pattern'],start,end,mutations[gene][mpos]))
                            slimdata[occ]['Gene'] = gene
                            slimdata[occ]['OMIM'] = string.join(self.dict['Records'][gene])
                            slimdata[occ]['Mutation'] = mutations[gene][mpos][1]
                            slimdata[occ]['Disease'] = mutations[gene][mpos][0]
                            rje.delimitedFileOutput(self,outfile,headers,'\t',slimdata[occ])
                            if (gene,mpos) not in slimomim: slimomim.append((gene,mpos))
        ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~ ###
        (inslim,resx,mutx) = (0,0,0)    # No. of residues in SLiMs, total residue count + no. mutations that may overlap
        for gene in mutations:  # These are just the genes that mapped to sequences
            mutx += len(mutations[gene])
            resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
            if gene in allslims:    # Partially covered by SLiMs
                res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()
                for occ in allslims[gene]:
                    (start,end) = (int(allslims[gene][occ]['Pos'])-1,int(allslims[gene][occ]['EndPos']))
                    # Mark residues start..end-1 (0-based) as SLiM-covered.
                    res = res[:start] + [1] * (end-start) + res[end:]   # FIX: was res[end-1:] (off-by-one, grew list)
                self.deBug('%s %d (%d)' % (gene,sum(res),seqdict[genecards[gene]['EnsLoci']].aaLen()))
                inslim += sum(res)
        self.log.printLog('#COV','SLiMs have %.1f%% coverage of OMIM gene sequences' % (100.0*inslim/resx))
        self.log.printLog('#MUT','%d mutations that could potentially occur in SLiMs' % mutx)
        self.log.printLog('#PROB','Probability of observed %d mutation overlap = %.4f' % (len(slimomim),rje.binomial(len(slimomim),mutx,float(inslim)/resx,callobj=self)))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def run(self):  ### Main Run Method
    '''
    Main Run Method. Parses/loads OMIM mutation data, maps genes to EnsLoci proteins via
    Pingu/GeneCards, cross-references mutation positions against SLiMFinder occurrence
    files, and reports SLiM coverage plus a binomial probability of the observed overlap.
    FIX: coverage mask construction used res[end-1:], duplicating the residue at index
    end-1 and growing the list by one per occurrence (inflating coverage); now res[end:].
    '''
    try:
        ### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()
        ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
        ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        import pingu
        pcmd = self.cmd_list + ['fulloutput=F']
        ping = self.obj['Pingu'] = pingu.PINGU(self.log, pcmd)
        ping.run()
        ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not ping.obj['GeneCards']:
            return self.log.errorLog('Cannot map EnsLoci without GeneCards.', printerror=False)
        genecards = ping.obj['GeneCards'].dict['GeneCard']  # GeneCards dictionary
        ensloci = ping.getEnsLoci()  # EnsLoci SeqList object (ping.obj['EnsLoci'])
        seqdict = ensloci.seqNameDic()
        if not seqdict:
            return self.log.errorLog('Failed to read in EnsLoci sequences.', printerror=False)
        ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['Fudge'] = {}
        ensback = {}    # Dictionary of {EnsLoci name:OMIM gene}
        mutations = {}  # Reorganised dictionary of {gene:{pos:Mutation}}
        for gene in rje.sortKeys(self.dict['Mutations']):
            try: seq = seqdict[genecards[gene]['EnsLoci']]
            except:
                self.log.printLog('#MAP', 'No EnsLoci protein mapped for %s' % gene)
                continue
            mutations[gene] = {}
            ensback[genecards[gene]['EnsLoci']] = gene
            mutpos = {}  # Dictionary of {pos:AA} to map onto sequence
            for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                (disease, mutation) = self.dict['Mutations'][gene][subid]
                # Mutations are in Xaa123Yaa three-letter format.
                (wild, pos, mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)', mutation)
                mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                mutations[gene][int(pos)] = self.dict['Mutations'][gene][subid]
            self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
        self.deBug(self.dict['Fudge'])
        ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        allslims = {}   # Full dictionary of SLiMFinder results matching OMIM genes
        slimomim = []   # List of (gene,pos) overlapping with SLiMs
        outfile = 'rje_omim.slimfinder.tdt'
        dataheaders = string.split('Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign', ',')
        headers = ['Gene', 'OMIM', 'SubID', 'Mutation', 'Disease'] + dataheaders
        rje.delimitedFileOutput(self, outfile, headers, delimit='\t', rje_backup=True)
        for file in glob.glob(self.info['SlimDir'] + '*.occ.csv'):  # Potential SLiM
            slimdata = rje.dataDict(self, file, ['Pattern', 'Hit', 'Pos', 'Match'], dataheaders, delimit=',')
            for occ in slimdata:
                if slimdata[occ]['Hit'] in ensback:  # OMIM gene - possible overlap
                    gene = ensback[slimdata[occ]['Hit']]
                    (start, end) = (int(slimdata[occ]['Pos']), int(slimdata[occ]['EndPos']))
                    if gene not in allslims: allslims[gene] = {}
                    allslims[gene][occ] = slimdata[occ]
                    for mpos in mutations[gene]:
                        # Fudge shifts OMIM numbering onto EnsLoci coordinates.
                        if start <= (mpos + self.dict['Fudge'][seqdict[genecards[gene]['EnsLoci']]]) <= end:
                            self.log.printLog('#OMIMSLIM', '%s %s %s (%d-%d) = %s' % (slimdata[occ]['Dataset'], slimdata[occ]['Hit'], slimdata[occ]['Pattern'], start, end, mutations[gene][mpos]))
                            slimdata[occ]['Gene'] = gene
                            slimdata[occ]['OMIM'] = string.join(self.dict['Records'][gene])
                            slimdata[occ]['Mutation'] = mutations[gene][mpos][1]
                            slimdata[occ]['Disease'] = mutations[gene][mpos][0]
                            rje.delimitedFileOutput(self, outfile, headers, '\t', slimdata[occ])
                            if (gene, mpos) not in slimomim: slimomim.append((gene, mpos))
        ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~ ###
        (inslim, resx, mutx) = (0, 0, 0)  # No. of residues in SLiMs, total residue count + no. mutations that may overlap
        for gene in mutations:  # These are just the genes that mapped to sequences
            mutx += len(mutations[gene])
            resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
            if gene in allslims:  # Partially covered by SLiMs
                res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()
                for occ in allslims[gene]:
                    (start, end) = (int(allslims[gene][occ]['Pos']) - 1, int(allslims[gene][occ]['EndPos']))
                    # Mark residues start..end-1 (0-based) as SLiM-covered.
                    res = res[:start] + [1] * (end - start) + res[end:]  # FIX: was res[end-1:] (off-by-one, grew list)
                self.deBug('%s %d (%d)' % (gene, sum(res), seqdict[genecards[gene]['EnsLoci']].aaLen()))
                inslim += sum(res)
        self.log.printLog('#COV', 'SLiMs have %.1f%% coverage of OMIM gene sequences' % (100.0 * inslim / resx))
        self.log.printLog('#MUT', '%d mutations that could potentially occur in SLiMs' % mutx)
        self.log.printLog('#PROB', 'Probability of observed %d mutation overlap = %.4f' % (len(slimomim), rje.binomial(len(slimomim), mutx, float(inslim) / resx, callobj=self)))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def domainFasta(self):  ### Outputs parsed domain and domain PPI datasets in Fasta format
    '''
    Outputs parsed PPI datasets in Fasta format. First writes two delimited tables
    (domain-HPRD pairs and domain sources), then one fasta file per domain containing
    the sequences of all interactors of the proteins carrying that domain.
    '''
    try:
        ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Domain', 'HPRD', 'Gene']
        dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
        rje.delimitedFileOutput(self, dfile, headers, '\t')
        sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
        shead = ['Domain', 'Source']
        rje.delimitedFileOutput(self, sfile, shead, '\t')
        domtotal = len(self.dict['Domains'])
        done = 0.0
        for dom in rje.sortKeys(self.dict['Domains']):
            self.log.printLog('\r#DOM', 'HPRD Domain output (%s): %.1f%%' % (dfile, done / domtotal), newline=False, log=False)
            done += 100.0
            # One row per HPRD entry carrying this domain...
            for hid in self.dict['Domains'][dom]:
                row = {'Domain': dom, 'HPRD': hid, 'Gene': self.dict['HPRD'][hid]['gene']}
                rje.delimitedFileOutput(self, dfile, headers, '\t', row)
            # ...and one row per annotation source for the domain.
            for src in self.dict['DomainSource'][dom]:
                rje.delimitedFileOutput(self, sfile, shead, '\t', {'Domain': dom, 'Source': src})
        self.log.printLog('\r#DOM', 'HPRD Domain output (%s): %s domains.' % (dfile, rje.integerString(domtotal)))
        ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
        rje.mkDir(self, datpath)
        for dom in rje.sortKeys(self.dict['Domains']):
            ## Collect every interactor of any domain-containing protein ##
            partners = set()
            for carrier in self.dict['Domains'][dom]:
                if carrier in self.dict['PPI']:
                    partners.update(self.dict['PPI'][carrier])
            ## Build the sequence list (all isoforms if AllIso) and save ##
            seqs = []
            for p in sorted(partners):
                if self.opt['AllIso']: seqs += self.dict['HPRD'][p]['Seq']
                else: seqs.append(self.dict['HPRD'][p]['Seq'])
            sfile = '%s%s_hprd.fas' % (datpath, dom)
            if seqs:
                self.obj['SeqList'].saveFasta(seqs=seqs, seqfile=sfile)
            else:
                self.log.printLog('#DOM', 'No PPI partners for domain "%s"' % dom)
        self.log.printLog('\r#DOM', 'HPRD Domain fasta output complete.')
    except:
        self.log.errorLog('Error in HPRD.saveFasta()', printerror=True, quitchoice=False)
        raise
def rfAtt(self):    ### Generic method
    '''
    Generic method. Add description here (and arguments.)
    Tallies amino acid and dipeptide counts in all six reading-frame translations of the
    loaded DNA sequences, derives expected counts from nucleotide (for single AAs) and
    amino acid (for dipeptides) frequencies, and writes Obs/Exp ratios per frame to
    <basefile>.rf.tdt.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhead = ['Att', 'RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3',
                  'ObsRF1', 'ObsRF2', 'ObsRF3', 'ObsRF-1', 'ObsRF-2', 'ObsRF-3',
                  'ExpRF1', 'ExpRF2', 'ExpRF3', 'ExpRF-1', 'ExpRF-2', 'ExpRF-3']
        rfdata = {}     # {frame:{aa or dipeptide:count}} raw tallies
        rfobs = {}      # observed counts copied from rfdata
        rfexp = {}      # expected counts from frequency model
        ntfreq = {}     # {nucleotide:count}
        for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
            rfdata[rf] = {}
            rfobs[rf] = {}
            rfexp[rf] = {}
            # Single amino acids (protein alphabet minus last symbol, plus stop '*')...
            for x in rje_seq.alph_protx[:-1] + ['*']:
                rfdata[rf][x] = 0
                rfobs[rf][x] = 0
                rfexp[rf][x] = 0
            # ...and all dipeptides.
            for a1 in rje_seq.alph_protx[:-1] + ['*']:
                for a2 in rje_seq.alph_protx[:-1] + ['*']:
                    rfdata[rf]['%s%s' % (a1, a2)] = 0
                    rfobs[rf]['%s%s' % (a1, a2)] = 0
                    rfexp[rf]['%s%s' % (a1, a2)] = 0
        for x in rje_seq.alph_dna[:-1]: ntfreq[x] = 0
        seqlist = self.obj['SeqList']
        ### ~ [2] Count sequence attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (sx, stot) = (0.0, seqlist.seqNum())
        for seq in seqlist.seq:
            self.progLog('\r#ATT', 'Counting sequence attributes: %.2f%%' % (sx / stot))
            sx += 100.0
            for x in seq.info['Sequence']:
                if x in ntfreq: ntfreq[x] += 1
            rf6 = rje_sequence.sixFrameTranslation(seq.info['Sequence'])
            # Assumes rf6 keys are the ints 1..3/-1..-3 so 'RF%d' matches rfdata keys
            # - TODO confirm against rje_sequence.sixFrameTranslation().
            for r in rf6:
                rseq = rf6[r]
                rf = 'RF%d' % r
                for i in range(len(rseq)):
                    a = rseq[i]
                    dia = rseq[i:i + 2]     # final position yields a 1-char "dipeptide"; not in rfdata so skipped
                    if a in rfdata[rf]: rfdata[rf][a] += 1
                    if dia in rfdata[rf]: rfdata[rf][dia] += 1
        self.printLog('\r#ATT', 'Counting sequence attributes complete.')
        ### ~ [3] Calculate Observed & Expected ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ntobs = rje.dictFreq(ntfreq, total=True, newdict=True)
        # Complement frequencies for the reverse frames (A<->T, G<->C swap).
        ntcomp = {'Total': ntobs['Total']}
        for xy in ['AT', 'GC']]:
            ntcomp[xy[0]] = ntobs[xy[1]]
            ntcomp[xy[1]] = ntobs[xy[0]]
        for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
            aafreq = {}
            for a in rje_seq.alph_protx[:-1] + ['*']: aafreq[a] = rfdata[rf][a]
            aafreq = rje.dictFreq(aafreq, total=True, newdict=True)
            for a in rje_seq.alph_protx[:-1] + ['*']:
                rfobs[rf][a] = rfdata[rf][a]
                rfexp[rf][a] = 0
            # Expected AA counts: sum over codons of (no. codons) x P(n1)P(n2)P(n3),
            # using complement frequencies for reverse frames (rf[-2] == '-').
            for n1 in 'GATC':
                for n2 in 'GATC':
                    for n3 in 'GATC':
                        codon = '%s%s%s' % (n1, n2, n3)
                        aa = rje_sequence.dna2prot(codon)
                        if rf[-2] == '-':
                            rfexp[rf][aa] += (int(ntobs['Total'] / 3.0) * ntcomp[n1] * ntcomp[n2] * ntcomp[n3])
                        else:
                            rfexp[rf][aa] += (int(ntobs['Total'] / 3.0) * ntobs[n1] * ntobs[n2] * ntobs[n3])
                        #self.deBug('%s: %s x %s x %s x %s' % (aa,(ntobs['Total'] - 2), rfobs[rf][n1], rfobs[rf][n2], rfobs[rf][n3]))
                        #self.deBug('%s: %s' % (aa,rfexp[rf][aa]))
            # Expected dipeptide counts from per-frame amino acid frequencies.
            for a1 in rje_seq.alph_protx[:-1] + ['*']:
                for a2 in rje_seq.alph_protx[:-1] + ['*']:
                    rfexp[rf]['%s%s' % (a1, a2)] = (aafreq['Total'] - 1) * aafreq[a1] * aafreq[a2]
                    rfobs[rf]['%s%s' % (a1, a2)] = rfdata[rf]['%s%s' % (a1, a2)]
        ### ~ [4] Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfile = rje.baseFile(seqlist.info['Name']) + '.rf.tdt'
        rje.delimitedFileOutput(self, rfile, rfhead, rje_backup=True)
        # One row per single amino acid...
        for a in rje_seq.alph_protx[:-1] + ['*']:
            data = {'Att': a}
            for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
                data['Obs%s' % rf] = rfobs[rf][a]
                data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a])
            rje.delimitedFileOutput(self, rfile, rfhead, datadict=data)
        # ...then one row per dipeptide.
        for a1 in rje_seq.alph_protx[:-1] + ['*']:
            for a2 in rje_seq.alph_protx[:-1] + ['*']:
                a = '%s%s' % (a1, a2)
                data = {'Att': a}
                for rf in ['RF1', 'RF2', 'RF3', 'RF-1', 'RF-2', 'RF-3']:
                    data['Obs%s' % rf] = rfobs[rf][a]
                    data['Exp%s' % rf] = '%.2f' % rfexp[rf][a]
                    data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a])
                rje.delimitedFileOutput(self, rfile, rfhead, datadict=data)
        self.printLog('#TDT', 'TDT output complete.')
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise  # Delete this if method error not terrible
def run(self):  ### Main run method
    '''
    Main run method. Interactive worm pump counter: phase [2] records a flat event list
    of worm IDs (str) and keypress timestamps (float); phase [3] replays the list and
    writes one row per count to OutFile.
    FIX: when a worm ID was read back, its start timestamp was peeked (counter[0]) but
    never consumed, so the next loop iteration popped that float and treated it as a new
    "worm ID" (floats are truthy), leaving wormstart bound to the following empty string
    and crashing on t - wormstart. The timestamp is now popped.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        counter = ['>>']    # List containing count times (and worm IDs / 'X' sentinel)
        menulist = [('F', 'Change output file name', 'outfile', 'OutFile'),
                    ('X', 'Exit', 'return', ''),
                    ('R', 'Run', 'return', '')]
        mchoice = rje_menu.menu(self, 'WormPump Menu', menulist, choicetext='Please select:', changecase=True, default='R')
        if mchoice == 'X': return
        self.printLog('#OUT', 'Output will be to %s' % self.info['OutFile'])
        self.printLog('#START', 'Initialising counter...')
        ### ~ [2] ~ Perform counts ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        wormid = None
        while counter[-1] != 'X':
            # Non-empty input = new worm ID (or X to exit); bare ENTER = one pump count.
            if wormid:
                counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit | <ENTER> for "%s" pump count' % wormid, default='').upper())
            else:
                counter.append(rje.choice('ID <ENTER> for new worm | X <ENTER> to exit', default='').upper())
            if counter[-1]: wormid = counter[-1]
            if wormid == 'X': break
            self.printLog('#WORM', 'Worm "%s"' % wormid)
            counter.append(time.time())     # Timestamp for this entry (ID start or count)
        self.deBug(counter)
        ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        head = ['Worm', 'Count', 'WormTime', 'AbsTime']
        rje.delimitedFileOutput(self, self.info['OutFile'], headers=head, rje_backup=True)
        wormstart = 0.0
        wormid = None
        wtot = 0
        while counter:
            x = counter.pop(0)
            if x in ['>>', 'X']: continue
            if x:   # New worm ID: reset per-worm counter and consume its start time.
                wormid = x
                wormstart = counter.pop(0)  # FIX: was counter[0] (start time left in list)
                wx = 0
                wtot += 1
            else:   # Empty entry = one pump count; next list item is its timestamp.
                if not wormid: continue
                wx += 1
                t = counter.pop(0)
                tt = time.localtime(t)
                wdata = {
                    'Worm': wormid,
                    'Count': wx,
                    'WormTime': t - wormstart,
                    #'AbsTime':'%s/%s/%s %s:%s:%s' % (tt[2],tt[1],tt[0],rje.preZero(tt[3],24),rje.preZero(tt[4],60),rje.preZero(tt[5],60))}
                    'AbsTime': '%s:%s:%s' % (rje.preZero(tt[3], 24), rje.preZero(tt[4], 60), rje.preZero(tt[5], 60))
                }
                rje.delimitedFileOutput(self, self.info['OutFile'], headers=head, datadict=wdata)
        self.printLog('#OUT', 'Counts for %d worms output to %s' % (wtot, self.info['OutFile']))
        rje.choice('<ENTER> to exit')
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise  # Delete this if method error not terrible
def run(self):  ### Main run method
    '''
    Main run method. Maps SLiM occurrences onto GO terms (via gene->GO mapping plus GO
    parents) and outputs, per GO term and motif, occurrences of fwd/rev/scram variants
    meeting MinOcc, to <OccData base>.slimfungo.tdt.
    FIXES vs original: (1) 'motif' was referenced in the goocc loop but never assigned -
    now set from data['Motif'] after suffix stripping; (2) the MinOcc test had the
    comparison inside len() - len(list < int) raises TypeError; (3) the '#NEW' completion
    printLog applied '%' to a string with no placeholder (TypeError); (4) 'ix' was never
    incremented so the '#OUT' progress percentage was always 0.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mygo = rje_go.GO(self.log,self.cmd_list)
        mygo.readGO()
        gomap = rje.dataDict(self,self.info['GOMap'],mainkeys=['Ensembl Gene ID'],datakeys=['GO ID'],lists=True)
        self.deBug(rje.sortKeys(gomap)[:100])
        #!# Replace 'Ensembl Gene ID' with commandline parameter at some point #!#
        self.printLog('#GOMAP','Loaded GO mappings for %s sequence IDs' % (rje.integerString(len(gomap))))
        slimocc = rje.dataDict(self,self.info['OccData'],mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=['Motif','Seq','Start_Pos','End_Pos','Cons','HomNum'])
        self.printLog('#OCC','Loaded Data for %s motif occurrences.' % (rje.integerString(len(slimocc))))
        ## ~ [1a] ~ Sequence mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqlist = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)
        seqmap = {}     # {short name:Ensembl gene ID} parsed from 'gene:...]' in names
        (sx,stot) = (0.0,seqlist.seqNum())
        for seq in seqlist.seq:
            self.progLog('#SEQMAP','Mappings sequence IDs: %.1f%%' % (sx/stot)); sx += 100.0
            if rje.matchExp('gene:(\S+)\]',seq.info['Name']): seqmap[seq.shortName()] = rje.matchExp('gene:(\S+)\]',seq.info['Name'])[0]
        self.printLog('\r#SEQMAP','Mappings %s sequence IDs complete: %s mapped' % (rje.integerString(stot),rje.integerString(len(seqmap))))
        self.deBug(rje.sortKeys(seqmap)[:100])
        ### ~ [2] ~ Output new data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        goocc = {}      # {GO id:{motif:{'fwd'/'rev'/'scram':[occ dicts]}}}
        outfile = string.join(string.split(self.info['OccData'],'.')[:-1] + ['slimfungo','tdt'],'.')
        headers = ['GO','Motif','Type','Seq','Start_Pos','End_Pos','Cons','HomNum']
        for okey in slimocc.keys():     # .keys() copy allows popping while looping
            self.progLog('#NEW','Making new GO occurrences: %s ' % (rje.integerString(len(slimocc))))
            data = slimocc.pop(okey)
            gene = seq = data['Seq']
            type = 'fwd'
            # Motifs named *_rev / *_scram are control variants; strip the suffix.
            if string.split(data['Motif'],'_')[-1] in ['rev','scram']:
                type = string.split(data['Motif'],'_')[-1]
                data['Motif'] = string.join(string.split(data['Motif'],'_')[:-1],'_')
            motif = data['Motif']   # FIX: 'motif' was previously never assigned
            if gene not in gomap and gene in seqmap: gene = seqmap[gene]
            golist = []
            if gene in gomap:
                for id in gomap[gene]: golist += mygo.parents(id)
            else: golist = ['NoGo']
            self.deBug('%s:%s::%s' % (seq,gene,golist))
            for id in rje.sortUnique(golist,False,False):
                if id not in goocc: goocc[id] = {}
                if motif not in goocc[id]: goocc[id][motif] = {'fwd':[],'rev':[],'scram':[]}
                goocc[id][motif][type].append(rje.combineDict({'GO':id,'Type':type},data))
        self.printLog('\r#NEW','Making new GO occurrences complete. ')  # FIX: dropped bogus '%' with no placeholder
        rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
        (mx,ox,ix,itot) = (0,0,0.0,len(goocc))
        for id in rje.sortKeys(goocc):
            ix += 100.0     # FIX: progress counter was never incremented
            for motif in rje.sortKeys(goocc[id]):
                # Drop variant types below MinOcc (sortKeys gives a list copy, so
                # popping during the loop is safe).
                for type in rje.sortKeys(goocc[id][motif]):
                    if len(goocc[id][motif][type]) < self.stat['MinOcc']: goocc[id][motif].pop(type)  # FIX: comparison was inside len()
                # Need the fwd variant plus at least one control to output.
                if len(goocc[id][motif]) < 2 or 'fwd' not in goocc[id][motif]: continue
                mx += 1
                for type in goocc[id][motif]:
                    for occ in goocc[id][motif][type]:
                        rje.delimitedFileOutput(self,outfile,headers,datadict=occ); ox += 1
                self.progLog('#OUT','Output to %s: %.2f%% :: %s motifs; %s occ.' % (outfile,ix/itot,rje.integerString(mx),rje.integerString(ox)))
        self.printLog('\r#OUT','Output of occurrences to %s is now complete: %s motifs; %s occ.' % (outfile,rje.integerString(mx),rje.integerString(ox)))
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
        raise  # Delete this if method error not terrible
class PhosphoSeq(rje.RJE_Object): ''' PhosphoSeq Class. Author: Rich Edwards (2007). Info:str - PELM = Filename for phosphoELM download [None] - PELMFas = Filename for fasta file output of pELM sequences [pelm.fas] - PhosBlast = Fasta file of sequences to perform phosBLAST method against pELM [None] - PhosRes = Delimited text file containing input sequence, position and evidence [*.phosres.tdt] Opt:boolean - FilterSeq = Apply rje_seq sequence filters to phosphoELM data [False] - UseSpec = Use species codes for determing same species for ID matches [True] - PhosDat = Whether to produce a modified UniProt-format file with potential phosphoSites as features [False] Stat:numeric - IDSim = Percentage identity (GABLAM; phosblast qry) for marking as identity [95.0] - HomSim = Percentage identity (GABLAM; phosblast qry) for marking as homologue [40.0] List:list Dict:dictionary - PhosphoSites = Dictionary of {Seq:{Pos:details}} Obj:RJE_Objects - SeqList = rje_seq.SeqList() object for storing sequences - UniProt = rje_uniprot.UniProt() object for storing UniProt data ''' ######################################################################################################################### ### <1> ### Class Initiation etc.: sets attributes # ######################################################################################################################### def _setAttributes(self): ### Sets Attributes of Object '''Sets Attributes of Object.''' ### Basics ### self.infolist = ['PELM', 'PELMFas', 'PhosBlast', 'PhosRes'] self.optlist = ['FilterSeq', 'UseSpec', 'PhosDat'] self.statlist = ['IDSim', 'HomSim'] self.listlist = [] self.dictlist = ['PhosphoSites'] self.objlist = ['SeqList', 'UniProt'] ### Defaults ### self._setDefaults(info='None', opt=False, stat=0.0, obj=None, setlist=True, setdict=True) self.setInfo({'PELMFas': 'pelm.fas'}) self.setStat({'IDSim': 95.0, 'HomSim': 40.0}) self.setOpt({'UseSpec': True}) 
######################################################################################################################### def _cmdList(self): ### Sets Attributes from commandline ''' Sets attributes according to commandline parameters: - see .__doc__ or run with 'help' option ''' for cmd in self.cmd_list: try: self._generalCmd(cmd) ### General Options ### ### Class Options ### self._cmdReadList(cmd, 'file', ['PELM', 'PELMFas', 'PhosBlast', 'PhosRes']) self._cmdReadList(cmd, 'opt', ['FilterSeq', 'UseSpec', 'PhosDat']) self._cmdReadList(cmd, 'stat', ['IDSim', 'HomSim']) except: self.log.errorLog('Problem with cmd:%s' % cmd) ######################################################################################################################### ### <3> ### Main Run Methods # ######################################################################################################################### def run(self): ### Main method for standalone functionality '''Main method for standalone functionality.''' self.readPELM() if self.info['PhosBlast'].lower() not in ['', 'none']: self.mapPhosByBLAST(self.info['PhosBlast']) ######################################################################################################################### def readPELM( self ): ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc. '''Reads phosphoELM into classes. 
Extracts UniProt data if available for Species etc.''' try: ### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### data = rje.dataDict(self, self.info['PELM'], mainkeys=['acc', 'position']) seqdict = {} # Dictionary of Acc:Sequence ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### pdict = self.dict['PhosphoSites'] for dkey in data: ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## (acc, pos) = string.split(dkey) pos = string.atoi(pos) if acc not in pdict: pdict[acc] = {} if pos not in pdict[acc]: pdict[acc][pos] = {} ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if acc not in seqdict: seqdict[acc] = data[dkey]['sequence'] elif seqdict[acc] != data[dkey]['sequence']: self.log.printLog( '#ERR', 'Warning. Sequence mismatch for %s' % acc) if 'aa' not in pdict[acc][pos]: pdict[acc][pos]['aa'] = data[dkey]['code'] elif pdict[acc][pos]['aa'] != data[dkey]['code']: self.log.printLog( '#ERR', 'Warning. PhosphoSite mismatch for %s at pos %d: %s not %s' % (acc, pos, data[dkey]['code'], pdict[acc][pos]['aa'])) if data[dkey]['code'] != seqdict[acc][(pos - 1):pos]: self.log.printLog( '#ERR', 'Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s' % (acc, pos, data[dkey]['code'], seqdict[acc][pos - 1:pos])) ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## acclist = rje.sortKeys(seqdict) pelmuni = rje_uniprot.UniProt(self.log, self.cmd_list) # UniProt entry unidict = pelmuni.accDict( acclist) # Dictionary of {acc:UniProtEntry} pelmseq = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=None']) # SeqList object ## ~ [3b] Add one sequence for each AccNum and update seqdict ~~~~~~~~~~~~~~~~~~~~~~~~ ## #!# Look out for splice variants! 
(There are some!) - Copy UniProt and change sequence & AccNum #!# for acc in acclist: #!# Make accdict of {acc:Seq} using unidict and seqlist #!# sequence = seqdict[acc] try: uni = unidict[string.split(acc, '-')[0]] desc = uni.obj['Sequence'].info['Description'] name = '%s__%s %s' % (uni.obj['Sequence'].info['ID'], acc, desc) if sequence != uni.obj['Sequence'].info['Sequence']: self.log.printLog( '#WARNING', 'Sequence mismatch for UniProt entry %s' % acc) except: self.log.errorLog('Problem with %s' % acc) name = '%s_UNK__%s' % ( acc, acc) #!# Add sequences where UniProt missing #!# seqdict[acc] = pelmseq._addSeq(name, sequence) ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if self.opt['FilterSeq']: pelmseq.autoFilter() for acc in acclist: if seqdict[acc] not in pelmseq.seq: seqdict.pop(acc) acclist = rje.sortKeys(seqdict) ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if not os.path.exists( self.info['PELMFas'] ) or self.stat['Interactive'] < 0 or rje.yesNo( '%s exists: overwrite?' 
% self.info['PELMFas']): pelmseq.saveFasta(seqfile=self.info['PELMFas']) self.obj['SeqList'] = pelmseq self.obj['UniProt'] = pelmuni except: self.log.errorLog('Problem during PhosphoSeq.readPELM') ######################################################################################################################### def mapPhosByBLAST( self, fasfile ): ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology) '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## scmd = self.cmd_list + [ 'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F' ] qseqlist = rje_seq.SeqList(self.log, scmd) qdict = qseqlist.seqNameDic() ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## basefile = rje.baseFile(fasfile) if self.info['PhosRes'].lower() in ['', 'none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence'] delimit = rje.getDelimit( self.cmd_list, rje.delimitFromExt(filename=self.info['PhosRes'])) rje.delimitedFileOutput(self, self.info['PhosRes'], headers, delimit, rje_backup=True) ppath = rje.makePath('PhosALN') rje.mkDir(self, ppath) ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## pblast = rje_blast.BLASTRun(self.log, self.cmd_list + ['formatdb=F']) pblast.setInfo({ 'Name': '%s.p.blast' % rje.baseFile(fasfile), 'DBase': self.info['PELMFas'], 'InFile': fasfile }) pblast.setStat({'HitAln': pblast.stat['OneLine']}) pblast.opt['Complexity Filter'] = False pblast.formatDB(force=False) ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key'] for g in ['ID', 'Hom']: if self.stat['%sSim' % g] < 1.0: 
self.stat['%sSim' % g] *= 100.0 self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g]) ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### pblast.blast(use_existing=True, log=True) # BLAST pblast.readBLAST(gablam=True) # Read in while pblast.search: ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## search = pblast.search.pop(0) qseq = qdict[search.info['Name']] idlist = [] qlen = qseq.aaLen() hitdict = search.hitSeq(self.obj['SeqList']) aln = rje_seq.SeqList( self.log, self.cmd_list + ['autoload=F', 'autofilter=F']) aln.seq = [qseq] pdict = {} # Dictionary of {hseq:[poslist]} rdict = {qseq: 0} # Dictionary of {hseq:res} for hit in search.hit[0:]: hseq = hitdict[hit] pdict[hseq] = [] for pos in rje.sortKeys( self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos) if hit.info['Name'] == search.info['Name']: if qseq.getSequence(case=False, gaps=False) != hseq.getSequence( case=False, gaps=False): self.log.errorLog( 'Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name']) idlist.append(qseq) pdict[qseq] = pdict.pop(hseq) continue gdict = hit.globalFromLocal(qlen) qvh = float(100 * gdict['Query'][gkey]) / float(qlen) if qvh < self.stat['HomSim']: pdict.pop(hseq) continue aln.seq.append(hseq) if (qseq.sameSpec(hseq) or not self.opt['UseSpec'] ) and qvh >= self.stat['IDSim']: idlist.append(hseq) rdict[hseq] = 0 aln.muscleAln( ) #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])) aln._addSeq('PhosAln', '-' * qseq.seqLen()) aln.info['Name'] = '%s%s.phosaln.fas' % (ppath, qseq.info['AccNum']) ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## print '>>>\n', aln.seq, pdict.keys(), rdict.keys() for a in range(qseq.seqLen()): if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1 for hseq in pdict: if hseq.info['Sequence'][a] == '-': continue if hseq != qseq: rdict[hseq] += 1 if rdict[hseq] 
in pdict[hseq] and qseq.info['Sequence'][ a] == hseq.info['Sequence'][a]: # Phosphosite pdata = { 'Name': search.info['Name'], 'Pos': rdict[qseq], 'AA': qseq.info['Sequence'][a], 'PELM': hseq.shortName(), 'PELMPos': rdict[hseq], 'Evidence': 'Hom' } if hseq == qseq: pdata['Evidence'] = 'Self' elif hseq in idlist: pdata['Evidence'] = 'ID' rje.delimitedFileOutput(self, self.info['PhosRes'], headers, delimit, pdata) self.addPhos(aln.seq[-1], a, pdata['Evidence']) ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## aln.saveFasta()
def mapPhosByBLAST( self, fasfile ): ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology) '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## scmd = self.cmd_list + [ 'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F' ] qseqlist = rje_seq.SeqList(self.log, scmd) qdict = qseqlist.seqNameDic() ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## basefile = rje.baseFile(fasfile) if self.info['PhosRes'].lower() in ['', 'none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence'] delimit = rje.getDelimit( self.cmd_list, rje.delimitFromExt(filename=self.info['PhosRes'])) rje.delimitedFileOutput(self, self.info['PhosRes'], headers, delimit, rje_backup=True) ppath = rje.makePath('PhosALN') rje.mkDir(self, ppath) ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## pblast = rje_blast.BLASTRun(self.log, self.cmd_list + ['formatdb=F']) pblast.setInfo({ 'Name': '%s.p.blast' % rje.baseFile(fasfile), 'DBase': self.info['PELMFas'], 'InFile': fasfile }) pblast.setStat({'HitAln': pblast.stat['OneLine']}) pblast.opt['Complexity Filter'] = False pblast.formatDB(force=False) ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key'] for g in ['ID', 'Hom']: if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0 self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g]) ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### pblast.blast(use_existing=True, log=True) # BLAST pblast.readBLAST(gablam=True) # Read in while pblast.search: ## ~ 
[2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## search = pblast.search.pop(0) qseq = qdict[search.info['Name']] idlist = [] qlen = qseq.aaLen() hitdict = search.hitSeq(self.obj['SeqList']) aln = rje_seq.SeqList( self.log, self.cmd_list + ['autoload=F', 'autofilter=F']) aln.seq = [qseq] pdict = {} # Dictionary of {hseq:[poslist]} rdict = {qseq: 0} # Dictionary of {hseq:res} for hit in search.hit[0:]: hseq = hitdict[hit] pdict[hseq] = [] for pos in rje.sortKeys( self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos) if hit.info['Name'] == search.info['Name']: if qseq.getSequence(case=False, gaps=False) != hseq.getSequence( case=False, gaps=False): self.log.errorLog( 'Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name']) idlist.append(qseq) pdict[qseq] = pdict.pop(hseq) continue gdict = hit.globalFromLocal(qlen) qvh = float(100 * gdict['Query'][gkey]) / float(qlen) if qvh < self.stat['HomSim']: pdict.pop(hseq) continue aln.seq.append(hseq) if (qseq.sameSpec(hseq) or not self.opt['UseSpec'] ) and qvh >= self.stat['IDSim']: idlist.append(hseq) rdict[hseq] = 0
    def uniFake(self, seqs=[], store=False):    ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
        '''
        Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.
        Builds fake UniProt DAT entries for each sequence, adding features from the predictors named in
        self.list['UniFake'] (pfam/disorder/tmhmm/signalp), and saves them to self.info['DatOut'].
        >> seqs:list of Sequence objects to process instead of self.obj['SeqList'] []
        >> store:bool = whether to accumulate all entries in the UniProt object and save in one go at the end [False]
        NOTE(review): seqs=[] is a mutable default argument; harmless here (seqs is only rebound, never
        mutated) but should be seqs=None for safety.
        '''
        try:
            ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            unifake = string.split(string.join(self.list['UniFake']).lower())   # Lower-case analysis keywords
            seqlist = self.obj['SeqList']
            if seqs: seqlist.seq = seqs
            else: seqs = seqlist.seq
            (sx, seqnum) = (0, seqlist.seqNum())
            ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            uniprot = rje_uniprot.UniProt(self.log, self.cmd_list)      # UniProt object for saving data
            if self.info['DatOut'].lower() in ['', 'none']: self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
            datfile = self.info['DatOut']
            if os.path.exists(datfile): rje.backup(self, datfile)
            if store: seqlist.obj['UniProt'] = uniprot
            ## ~ [1c] Setup RJE_HMM object (PFam searches) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if 'pfam' in unifake:
                hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T'])
                hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
                if os.path.exists(hmmfile): rje.backup(self, hmmfile)
                hmm.list['HMM'] = [self.info['PFam']]
                hmm.opt['HMMPFam'] = True
            else: hmm = None
            ## ~ [1d] Setup RJE_TM object (SignalP parsing) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if 'signalp' in unifake: tm = rje_tm.TM(self.log, self.cmd_list)
            else: tm = None
            ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for seq in seqs:
                sx += 1
                name = seq.shortName()
                self.printLog('#SEQ', 'Processing %s (%s aa) %s...' % (seq.shortName(), rje.integerString(seq.aaLen()), seq.info['Description'][:50]))
                try:
                    ## ~ [2a] ~ Basic data: temp fasta file + initial annotation dict ~~~~~~~~~~~~~ ##
                    # NOTE(review): if an exception fires before utmp is assigned on the first pass,
                    # the glob cleanup below raises NameError - confirm acceptable.
                    utmp = 'tmp%s.%s' % (rje.randomString(5), seq.info['AccNum'])
                    open('%s.fas' % utmp, 'w').write('>%s\n%s\n' % (seq.shortName(), seq.info['Sequence']))
                    udata = {'CC': ['-!- Features generated using unifake.py'], 'AC': []}
                    if seq.info['SpecCode'] in ['Unknown', 'UNK']: seq.info['SpecCode'] = self.info['SPCode']
                    #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]   #!# Check how well this works. Add spectable? #!#
                    ## ~ [2b] ~ Aliases parsed from EnsDat-style names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if self.opt['EnsDat'] and rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]', seq.info['Name']):
                        details = rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]', seq.info['Name'])
                        self.addAlias(seq.info['AccNum'], details[0])
                        self.addAlias(seq.info['AccNum'], details[1])
                        self.addAlias(seq.info['AccNum'], details[2])
                        udata['GN'] = [details[2]]
                    for id in [seq.shortName(), seq.info['AccNum']]:
                        if id in self.dict['Aliases']: udata['AC'].append('%s;' % string.join(self.dict['Aliases'][id], '; '))
                    ## ~ [2c] ~ Features loaded from pre-existing feature tables ~~~~~~~~~~~~~~~~~~ ##
                    ft = []     # List of features for sequence
                    for id in [seq.shortName(), seq.info['AccNum'], seq.info['ID']]:
                        if id in self.dict['Features']: ft += self.dict['Features'][id]
                    ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if 'disorder' in self.list['UniFake']:
                        try:
                            seq.disorder()
                            dis = seq.obj['Disorder']
                            for disorder in seq.obj['Disorder'].list['RegionDisorder']:
                                ft.append({'Type': 'DISORDER', 'Desc': 'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'], 'Start': disorder[0], 'End': disorder[1]})
                                if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'], dis.stat['IUCut'])
                            for fold in seq.obj['Disorder'].list['RegionFold']:
                                ft.append({'Type': 'ORDER', 'Desc': 'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'], 'Start': fold[0], 'End': fold[1]})
                                if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'], dis.stat['IUCut'])
                        except: self.log.errorLog('UniFake disorder problem for %s.' % name)
                    ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if hmm:
                        try:
                            hmm.setInfo({'SearchDB': '%s.fas' % utmp, 'HMMOut': '%s.hmm.out' % utmp})    # This will be made for each sequence
                            hmm.search = []
                            hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'], outfile=hmm.info['HMMOut'])]  # Used in hmmTable
                            hmm.hmmTable(outfile=hmmfile, append=True)
                            if 'disorder' in self.list['UniFake']: disorder = seq.obj['Disorder'].list['ResidueDisorder']    # individual (IUPRed) residue results
                            else: disorder = []
                            if hmm.search: udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'], time.ctime(os.path.getmtime(self.info['PFam']))))
                            else:
                                udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                                out = {'Type': '!ERROR!', 'Name': name}
                                rje.delimitedFileOutput(self, hmmfile, ['Type', 'Name', 'Start', 'End', 'Eval', 'Score'], datadict=out)
                            for search in hmm.search:
                                for hit in search.hit:
                                    for aln in hit.aln:
                                        pfamft = {'Start': aln.stat['SbjStart'], 'End': aln.stat['SbjEnd'], 'Type': 'PFAM',
                                                  'Desc': '%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'], aln.stat['Expect'], aln.stat['BitScore'])}
                                        if disorder:
                                            # Mean disorder over hit region; ordered PFAM hits become DOMAIN features
                                            region = disorder[aln.stat['SbjStart'] - 1:aln.stat['SbjEnd']]
                                            hmmdisorder = float(sum(region)) / len(region)
                                            pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'], hmmdisorder)
                                            if hmmdisorder < self.stat['DisDom']: pfamft['Type'] = 'DOMAIN'
                                        ft.append(pfamft)
                        except: self.log.errorLog('UniFake PFam HMM problem for %s.' % name)
                    ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if 'tmhmm' in unifake:
                        try:
                            tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'], utmp)).readlines()
                            domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                            for tmdom in domlist:
                                ft.append(tmdom)
                                ft[-1]['Desc'] = 'TMHMM topology prediction'
                                ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                                ft[-1]['End'] = string.atoi(ft[-1]['End'])
                            if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist) - 1) / 2, domlist[0]['Type']))
                            else: udata['CC'].append('TMHMM: 0 TM domains')
                        except: self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                    ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if 'signalp' in unifake:
                        try:
                            os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'], utmp, utmp))
                            tm.signalp = {}
                            tm.parseSignalP('%s.signalp' % utmp)
                            sigp = tm.signalp.pop(seq.shortName())
                            cpos = 0    # Predicted cleavage position; 0 = no signal peptide predicted
                            if sigp['nn_ymax?'] == 'Y':
                                cpos = string.atoi(sigp['nn_ymaxpos'])
                                desc = 'SignalP NN prediction'
                            if sigp['hmm_cmax?'] == 'Y':
                                hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                                if cpos == 0:
                                    cpos = hmm_c
                                    desc = 'SignalP HMM prediction'
                                else:
                                    # Both methods positive: keep the shorter (earlier) cleavage site
                                    if hmm_c < cpos:
                                        cpos = hmm_c
                                        desc = 'SignalP HMM prediction (NN also Y)'
                                    else: desc += ' (HMM also Y)'
                            if cpos > 0: ft.append({'Type': 'SIGNALP', 'Desc': desc, 'Start': 1, 'End': cpos})
                        except: self.log.errorLog('UniFake SignalP problem for %s.' % name)
                    ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    self.addRealUniProt(seq, udata, ft)
                    self.deBug(ft)
                    if not store: uniprot.list['Entry'] = []    # Per-sequence save: entry list reset each time
                    if uniprot.addFromSeq(seq, data=udata, ft=ft):      ### Converts into UniProtEntry object
                        if not store: uniprot.saveUniProt(datfile, append=True)
                        #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
                    ## ~ [2f] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                except: self.log.errorLog('Problem during UniFake(%s)' % name)
                for tmp in glob.glob('%s*' % utmp): os.unlink(tmp)      # Remove per-sequence temp files
                self.printLog('#UNIFAKE', '|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx), rje.integerString(seqnum - sx)), log=False)
            if store: uniprot.saveUniProt(datfile, append=False)
            if self.opt['CleanUp']:
                for tmp in glob.glob('TMHMM*'):
                    if os.path.isdir(tmp): os.rmdir(tmp)
        except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
def picsi(self): ### Cleans up cross-species search results '''Cleans up cross-species search results.''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### datafile = self.info['SumFile'] delimit = rje.delimitFromExt(filename=self.info['SumFile']) data = {} # search:{hit:{???}} pep2prot = {} # search:{peptide:[hits]} id2prot = {} # search:{id:hit} prot2desc = {} fullpeplist = {} pepcon = {} # Convert pep:longer pep speclist = [] # List of species codes ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True) for ikey in rje.sortKeys(indata): (search,id) = string.split(ikey,delimit) prot = indata[ikey]['prot_acc'][0] desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','') if desc[3:7] == 'Name': desc = desc[9:] prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc)) indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'],'|') pepconv = string.replace(indata[ikey]['pep_seq'],'I','L') pepconv = string.replace(pepconv,'Q','K') peplist = rje.sortUnique(string.split(pepconv,'|')) indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'],'|')),'|') if search not in data: data[search] = {} pep2prot[search] = {} id2prot[search] = {} fullpeplist[search] = [] pepcon[search] = {} fullpeplist[search] += peplist id2prot[search][id] = prot spec = string.split(prot,'_')[1] if spec not in speclist: speclist.append(spec) data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':id,'desc':desc,'spec':spec, 'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:], 'pep_rem':0} try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum'] except: data[search][prot]['accnum'] = string.split(prot,'__')[-1] for pep in peplist: if pep not in pep2prot[search]: pep2prot[search][pep] = [] 
pep2prot[search][pep].append(prot) ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## for search in fullpeplist: fullpeplist[search] = rje.sortUnique(fullpeplist[search]) for pep in fullpeplist[search][0:]: for pep2 in fullpeplist[search]: if pep != pep2 and pep in pep2: pepcon[search][pep] = pep2 fullpeplist[search].remove(pep) break for pep in pepcon[search]: while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[pep]] self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search)) #self.deBug(pepcon[search]) #self.deBug(rje.sortKeys(pep2prot[search])) pp = 0; pm = 0 for prot in data[search]: for pep in data[search][prot]['conpep'][0:]: if pep in pepcon[search]: newpep = pepcon[search][pep] if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1 data[search][prot]['conpep'].remove(pep); pm += 0 if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot) if pep in pep2prot[search]: pep2prot[search].pop(pep) data[search][prot]['pep_con'] = len(data[search][prot]['conpep']) self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm)) ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for search in pep2prot: ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## remx = 0 for prot in data[search]: if data[search][prot]['spec'] != self.info['QrySpec']: continue for pep in data[search][prot]['conpep']: for prot2 in pep2prot[search][pep][0:]: if data[search][prot2]['spec'] == self.info['QrySpec']: continue pep2prot[search][pep].remove(prot2) data[search][prot2]['conpep'].remove(pep) data[search][prot2]['pep_rem'] += 1; remx += 1 self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec'])) ## ~ [2b] One-hit wonders 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## for prot in data[search]: if len(data[search][prot]['conpep']) < 2: for pep in data[search][prot]['conpep']: #if pep in pep2prot[search] and prot in pep2prot[search][pep]: pep2prot[search][pep].remove(prot) ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ux = 0 for pep in pep2prot[search]: #self.deBug(pep) if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1 self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search)) ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0} rx = 0 for prot in data[search]: #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue data[search][prot]['pep_red'] = 0 # Redundant peptides found in proteins with unique peptides data[search][prot]['pep_nr'] = 0 # Redundant peptides found only in proteins without unique peptides for pep in data[search][prot]['conpep']: if pep2prot[search][pep] == [prot]: continue upep = False for prot2 in pep2prot[search][pep]: if data[search][prot2]['pep_uniq']: upep = True; break if upep: data[search][prot]['pep_red'] += 1 # Redundant peptide found in unique protein else: data[search][prot]['pep_nr'] += 1 # Redundant peptide NOT found in unique protein if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1 elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE' elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR' else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1 summary[data[search][prot]['class']] += 1 self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search)) for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x)) ### ~ [3] Species 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### speclist.sort() species = {} for spec in speclist: try: grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read() species[spec] = string.split(grep,':')[-4] self.printLog('#SPEC','%s = %s' % (spec,species[spec])) except: species[spec] = '?' ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile']) headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep'] if self.dict['Acc2Seq']: headers.insert(3,'cluster') rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True) for search in rje.sortKeys(data): if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search]) for prot in rje.sortKeys(data[search]): if rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc']): data[search][prot]['species'] = rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])[1] else: data[search][prot]['species'] = species[data[search][prot]['spec']] rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot]) except: self.errorLog('Errg')
def rfAtt(self): ### Generic method ''' Generic method. Add description here (and arguments.) ''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### rfhead = ['Att','RF1','RF2','RF3','RF-1','RF-2','RF-3','ObsRF1','ObsRF2','ObsRF3','ObsRF-1','ObsRF-2','ObsRF-3','ExpRF1','ExpRF2','ExpRF3','ExpRF-1','ExpRF-2','ExpRF-3'] rfdata = {}; rfobs = {}; rfexp = {}; ntfreq = {} for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']: rfdata[rf] = {}; rfobs[rf] = {}; rfexp[rf] = {} for x in rje_seq.alph_protx[:-1] + ['*']: rfdata[rf][x] = 0; rfobs[rf][x] = 0; rfexp[rf][x] = 0 for a1 in rje_seq.alph_protx[:-1] + ['*']: for a2 in rje_seq.alph_protx[:-1] + ['*']: rfdata[rf]['%s%s' % (a1,a2)] = 0; rfobs[rf]['%s%s' % (a1,a2)] = 0; rfexp[rf]['%s%s' % (a1,a2)] = 0 for x in rje_seq.alph_dna[:-1]: ntfreq[x] = 0 seqlist = self.obj['SeqList'] ### ~ [2] Count sequence attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (sx,stot) = (0.0,seqlist.seqNum()) for seq in seqlist.seq: self.progLog('\r#ATT','Counting sequence attributes: %.2f%%' % (sx/stot)); sx += 100.0 for x in seq.info['Sequence']: if x in ntfreq: ntfreq[x] += 1 rf6 = rje_sequence.sixFrameTranslation(seq.info['Sequence']) for r in rf6: rseq = rf6[r] rf = 'RF%d' % r for i in range(len(rseq)): a = rseq[i]; dia = rseq[i:i+2] if a in rfdata[rf]: rfdata[rf][a] += 1 if dia in rfdata[rf]: rfdata[rf][dia] += 1 self.printLog('\r#ATT','Counting sequence attributes complete.') ### ~ [3] Calculate Observed & Expected ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ntobs = rje.dictFreq(ntfreq,total=True,newdict=True) ntcomp = {'Total':ntobs['Total']} for xy in ['AT','GC']: ntcomp[xy[0]] = ntobs[xy[1]]; ntcomp[xy[1]] = ntobs[xy[0]] for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']: aafreq = {} for a in rje_seq.alph_protx[:-1] + ['*']: aafreq[a] = rfdata[rf][a] aafreq = rje.dictFreq(aafreq,total=True,newdict=True) for a in 
rje_seq.alph_protx[:-1] + ['*']: rfobs[rf][a] = rfdata[rf][a]; rfexp[rf][a] = 0 for n1 in 'GATC': for n2 in 'GATC': for n3 in 'GATC': codon = '%s%s%s' % (n1, n2, n3) aa = rje_sequence.dna2prot(codon) if rf[-2] == '-': rfexp[rf][aa] += (int(ntobs['Total']/3.0) * ntcomp[n1] * ntcomp[n2] * ntcomp[n3]) else: rfexp[rf][aa] += (int(ntobs['Total']/3.0) * ntobs[n1] * ntobs[n2] * ntobs[n3]) #self.deBug('%s: %s x %s x %s x %s' % (aa,(ntobs['Total'] - 2), rfobs[rf][n1], rfobs[rf][n2], rfobs[rf][n3])) #self.deBug('%s: %s' % (aa,rfexp[rf][aa])) for a1 in rje_seq.alph_protx[:-1] + ['*']: for a2 in rje_seq.alph_protx[:-1] + ['*']: rfexp[rf]['%s%s' % (a1,a2)] = (aafreq['Total'] - 1) * aafreq[a1] * aafreq[a2] rfobs[rf]['%s%s' % (a1,a2)] = rfdata[rf]['%s%s' % (a1,a2)] ### ~ [4] Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### rfile = rje.baseFile(seqlist.info['Name']) + '.rf.tdt' rje.delimitedFileOutput(self,rfile,rfhead,rje_backup=True) for a in rje_seq.alph_protx[:-1] + ['*']: data = {'Att':a} for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']: data['Obs%s' % rf] = rfobs[rf][a] data['Exp%s' % rf] = '%.2f' % rfexp[rf][a] data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a]) rje.delimitedFileOutput(self,rfile,rfhead,datadict=data) for a1 in rje_seq.alph_protx[:-1] + ['*']: for a2 in rje_seq.alph_protx[:-1] + ['*']: a = '%s%s' % (a1,a2) data = {'Att':a} for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']: data['Obs%s' % rf] = rfobs[rf][a] data['Exp%s' % rf] = '%.2f' % rfexp[rf][a] data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a]) rje.delimitedFileOutput(self,rfile,rfhead,datadict=data) self.printLog('#TDT','TDT output complete.') except: self.errorLog(rje_zen.Zen().wisdom()) raise # Delete this if method error not terrible
def tabulatePPIRegion(self):    ### Tabulates regions of known PPI from DAT file
    '''Tabulates regions of known PPI from DAT file.'''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tabfile = 'ppi_region.tdt'
        unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
        # Existing output is kept unless force=T: log and bail out early.
        if os.path.exists(tabfile) and not self.opt['Force']:
            return self.printLog('#REGTAB', '%s found. (Force=F)' % tabfile)
        headers = ['Protein', 'Start', 'End', 'Interactor']
        rje.delimitedFileOutput(self, tabfile, headers, rje_backup=True)
        ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Pull ID + interaction REGION feature lines out of the DAT file via a grep pipeline.
        gcmd = "grep -P '(ID |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
        self.printLog('#GREP', gcmd)
        prot = None     # Protein ID of the most recent ID line seen
        rx = 0          # Number of region rows written
        plist = []      # Unique proteins with tabulated regions
        ilist = []      # Unique interactors seen
        for gline in os.popen(gcmd).readlines():
            idhit = rje.matchExp('ID (\S+)', gline)
            if idhit: prot = idhit[0]
            region = rje.matchExp('FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', gline)
            if region:
                (rstart, rend, rint) = region
                # The "interacts with" text may list several partners: one row per accession-like token.
                for ppi in string.split(rint):
                    acchit = rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi)
                    if not acchit: continue
                    datadict = {'Protein': prot, 'Start': rstart, 'End': rend, 'Interactor': acchit[0]}
                    rje.delimitedFileOutput(self, tabfile, headers, datadict=datadict)
                    rx += 1
                    if prot not in plist: plist.append(prot)
                    if datadict['Interactor'] not in ilist: ilist.append(datadict['Interactor'])
            self.progLog('\r#REGTAB', 'Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString(len(plist)), rje.integerString(len(ilist)), rje.integerString(rx)))
        self.printLog('\r#REGTAB', 'Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(plist)), rje.integerString(len(ilist)), rje.integerString(rx), tabfile))
        return True
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise # Delete this if method error not terrible
def run(self):  ### Main run method
    '''
    Main run method: (1) reformats *.fasta files into renamed *.fas files and formats them
    for BLAST; (2) reads Mascot-style CSV result files to collect 6RF hit IDs; (3) extracts
    hit sequences and runs GABLAM vs MC58_1; (4) identifies hits with zero genomic homology,
    re-searches them against EMBL bacteria and tabulates the results.
    '''
    try:
        ### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for fasta in glob.glob('*.fasta'):
            fas = fasta[:-2]    # '<name>.fasta' -> '<name>.fas' (drops trailing 'ta')
            if os.path.exists(fas): continue    # Already reformatted
            sx = 0
            for line in open(fasta,'r').readlines():
                if line[:1] == '>':
                    # Header line: split off description if present.
                    try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                    except: name = rje.matchExp('^>(\S+)',line)[0]
                    # Pipe-delimited names are remapped: 3 fields -> 6-frame ORF naming,
                    # 5 fields -> reference protein naming. Anything else is fatal.
                    if len(string.split(name,'|')) == 3:
                        name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                        open(fas,'a').write('>%s\n' % name)
                    elif len(string.split(name,'|')) == 5:
                        name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                        open(fas,'a').write('>%s %s\n' % (name,desc))
                    else: print string.split(name,'|'); raise ValueError
                    self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
                else: open(fas,'a').write(line)     # Sequence line: copy through unchanged
            self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
            rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
        ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhits = {}     # Dictionary of {hit:['File:hit_num']}
        acc = 'MC58_6RF_Hits.acc'; open(acc,'w')    # open(...,'w') truncates/creates the accession list
        gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
        cx = 0      # Number of CSV files read
        for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
            cx += 1
            file = os.path.basename(csv)[:-4]   # Strip '.CSV' extension
            hits = False    # Only rows after the 'prot_hit_num,prot_acc' header are data
            for line in open(csv,'r').readlines():
                if line.find('prot_hit_num,prot_acc') == 0: hits = True
                elif hits:
                    data = rje.readDelimit(line,',')
                    if len(data) < 2: continue
                    [num,name] = data[:2]
                    # Keep only pipe-delimited accessions with 3+ fields; others are skipped.
                    try: name = string.split(name,'|')[2]
                    except: continue
                    if name not in rfhits:
                        open(acc,'a').write('6rf_NEIME__%s\n' % name)
                        rfhits[name] = []
                    id = '%s:%s' % (file,num)
                    if id not in rfhits[name]: rfhits[name].append(id)
            self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
        self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
        ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Skipped when the GABLAM hitsum table already exists (re-run resumption).
        if not os.path.exists(gfile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
        ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
        zeros = []      # Queries with zero BLAST hits against the genome
        for hit in gdata:
            if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
        zeros = rje.sortUnique(zeros,False)
        open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
        self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
        ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
        # Re-search the zero-hit sequences against EMBL bacteria (skipped if table exists).
        if not os.path.exists(ufile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
        gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
        fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)
        headers = gdata.pop('Headers')
        headers.insert(1,'Sample')
        headers.append('BestHit')
        rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
        for rf in rje.sortKeys(gdata):
            rfcut = string.split(rf,'__')[1]    # Accession part after the '6rf_NEIME__' prefix
            gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
            gdata[rf]['Qry'] = rfcut
            # First GABLAM hit is taken as the best hit; '-' when there were none.
            try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
            except: gdata[rf]['BestHit'] = '-'
            rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN',rje_zen.Zen().wisdom())
def setupResults(self): ### Main results setup method.
    '''Sets up the results output: stores the output column headers and creates/backs up the results file.'''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Dataset','Query','Fitness','Phenotype','SeqGroup','CovP','CovB','CovW','Price','Ratio']
        self.list['Headers'] = headers
        rje.delimitedFileOutput(self,self.info['ResFile'],headers,rje_backup=True)
    except:
        self.errorLog('Problem during %s setupResults().' % self)
        raise
def uniFake(self,seqs=[],store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
    '''
    Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.
    Builds fake UniProt DAT entries for each sequence, optionally annotating with disorder
    (IUPred), PFam HMM domains, TMHMM topology and SignalP predictions.
    >> seqs:list of Sequence objects to process in place of self.obj['SeqList'] contents.
       (NOTE(review): mutable default argument; safe here as it is only reassigned, never mutated.)
    >> store:bool = whether to accumulate entries in the UniProt object and save once at the end
       (True) rather than appending each entry to the DAT file as it is made (False).
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Lower-case list of annotation steps to perform (e.g. 'pfam', 'tmhmm', 'signalp').
        unifake = string.split(string.join(self.list['UniFake']).lower())
        seqlist = self.obj['SeqList']
        if seqs: seqlist.seq = seqs
        else: seqs = seqlist.seq
        (sx,seqnum) = (0,seqlist.seqNum())
        ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniprot = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt object for saving data
        if self.info['DatOut'].lower() in ['','none']: self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
        datfile = self.info['DatOut']
        if os.path.exists(datfile): rje.backup(self,datfile)
        if store: seqlist.obj['UniProt'] = uniprot
        ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'pfam' in unifake:
            hmm = rje_hmm.HMMRun(self.log,self.cmd_list+['force=T'])
            hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
            if os.path.exists(hmmfile): rje.backup(self,hmmfile)
            hmm.list['HMM'] = [self.info['PFam']]
            hmm.opt['HMMPFam'] = True
        else: hmm = None
        ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'signalp' in unifake: tm = rje_tm.TM(self.log,self.cmd_list)
        else: tm = None
        ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in seqs:
            sx += 1
            name = seq.shortName()
            self.printLog('#SEQ','Processing %s (%s aa) %s...' % (seq.shortName(),rje.integerString(seq.aaLen()),seq.info['Description'][:50]))
            try:
                ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Per-sequence temp file basename; cleaned up in the finally-like loop below.
                utmp = 'tmp%s.%s' % (rje.randomString(5),seq.info['AccNum'])
                open('%s.fas' % utmp,'w').write('>%s\n%s\n' % (seq.shortName(),seq.info['Sequence']))
                udata = {'CC':['-!- Features generated using unifake.py'],'AC':[]}
                if seq.info['SpecCode'] in ['Unknown','UNK']: seq.info['SpecCode'] = self.info['SPCode']
                #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]   #!# Check how well this works. Add spectable? #!#
                ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # EnsDat headers embed '[acc:X pep:Y gene:Z]'; register each as an alias.
                if self.opt['EnsDat'] and rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name']):
                    details = rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name'])
                    self.addAlias(seq.info['AccNum'],details[0])
                    self.addAlias(seq.info['AccNum'],details[1])
                    self.addAlias(seq.info['AccNum'],details[2])
                    udata['GN'] = [details[2]]  # Gene name from the 'gene:' field
                for id in [seq.shortName(),seq.info['AccNum']]:
                    if id in self.dict['Aliases']: udata['AC'].append('%s;' % string.join(self.dict['Aliases'][id],'; '))
                ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ft = []     # List of features for sequence
                for id in [seq.shortName(),seq.info['AccNum'],seq.info['ID']]:
                    if id in self.dict['Features']: ft += self.dict['Features'][id]
                ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'disorder' in self.list['UniFake']:
                    try:
                        seq.disorder()
                        dis = seq.obj['Disorder']
                        for disorder in seq.obj['Disorder'].list['RegionDisorder']:
                            ft.append({'Type':'DISORDER','Desc':'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'],'Start':disorder[0],'End':disorder[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                        for fold in seq.obj['Disorder'].list['RegionFold']:
                            ft.append({'Type':'ORDER','Desc':'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'],'Start':fold[0],'End':fold[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                    except: self.log.errorLog('UniFake disorder problem for %s.' % name)
                ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if hmm:
                    try:
                        hmm.setInfo({'SearchDB':'%s.fas' % utmp,'HMMOut':'%s.hmm.out' % utmp})  # This will be made for each sequence
                        hmm.search = []
                        hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'],outfile=hmm.info['HMMOut'])]   # Used in hmmTable
                        hmm.hmmTable(outfile=hmmfile,append=True)
                        if 'disorder' in self.list['UniFake']: disorder = seq.obj['Disorder'].list['ResidueDisorder']    # individual (IUPRed) residue results
                        else: disorder = []
                        if hmm.search: udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'],time.ctime(os.path.getmtime(self.info['PFam']))))
                        else:
                            # Record the failure both in the DAT comments and the PFam table.
                            udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                            out = {'Type':'!ERROR!','Name':name}
                            rje.delimitedFileOutput(self,hmmfile,['Type','Name','Start','End','Eval','Score'],datadict=out)
                        for search in hmm.search:
                            for hit in search.hit:
                                for aln in hit.aln:
                                    pfamft = {'Start':aln.stat['SbjStart'],'End':aln.stat['SbjEnd'],'Type':'PFAM',
                                              'Desc':'%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'],aln.stat['Expect'],aln.stat['BitScore'])}
                                    if disorder:
                                        # Mean per-residue disorder across the hit region; ordered
                                        # PFam hits (below DisDom cutoff) are promoted to DOMAIN.
                                        region = disorder[aln.stat['SbjStart']-1:aln.stat['SbjEnd']]
                                        hmmdisorder = float(sum(region)) / len(region)
                                        pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'],hmmdisorder)
                                        if hmmdisorder < self.stat['DisDom']: pfamft['Type'] = 'DOMAIN'
                                    ft.append(pfamft)
                    except: self.log.errorLog('UniFake PFam HMM problem for %s.' % name)
                ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'tmhmm' in unifake:
                    try:
                        tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'],utmp)).readlines()
                        domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                        for tmdom in domlist:
                            ft.append(tmdom)
                            ft[-1]['Desc'] = 'TMHMM topology prediction'
                            ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                            ft[-1]['End'] = string.atoi(ft[-1]['End'])
                        # (len-1)/2 = TM domain count (alternating topology segments) — Py2 integer division.
                        if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist)-1)/2,domlist[0]['Type']))
                        else: udata['CC'].append('TMHMM: 0 TM domains')
                    except: self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'signalp' in unifake:
                    try:
                        os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'],utmp,utmp))
                        tm.signalp = {}
                        tm.parseSignalP('%s.signalp' % utmp)
                        sigp = tm.signalp.pop(seq.shortName())
                        cpos = 0    # Predicted cleavage position; 0 = no signal peptide
                        if sigp['nn_ymax?'] == 'Y':
                            cpos = string.atoi(sigp['nn_ymaxpos'])
                            desc = 'SignalP NN prediction'
                        if sigp['hmm_cmax?'] == 'Y':
                            hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                            if cpos == 0:
                                cpos = hmm_c
                                desc = 'SignalP HMM prediction'
                            else:
                                # Both methods positive: keep the smaller (earlier) cleavage site.
                                if hmm_c < cpos:
                                    cpos = hmm_c
                                    desc = 'SignalP HMM prediction (NN also Y)'
                                else: desc += ' (HMM also Y)'
                        if cpos > 0: ft.append({'Type':'SIGNALP','Desc':desc,'Start':1,'End':cpos})
                    except: self.log.errorLog('UniFake SignalP problem for %s.' % name)
                ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.addRealUniProt(seq,udata,ft)
                self.deBug(ft)
                # Without store, each entry replaces the last and is appended to the DAT file now.
                if not store: uniprot.list['Entry'] = []
                if uniprot.addFromSeq(seq,data=udata,ft=ft):    ### Converts into UniProtEntry object
                    if not store: uniprot.saveUniProt(datfile,append=True)
                    #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
            ## ~ [2f] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            except: self.log.errorLog('Problem during UniFake(%s)' % name)
            # Remove this sequence's temp files whether or not annotation succeeded.
            for tmp in glob.glob('%s*' % utmp): os.unlink(tmp)
            self.printLog('#UNIFAKE','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx),rje.integerString(seqnum-sx)),log=False)
        if store: uniprot.saveUniProt(datfile,append=False)
        if self.opt['CleanUp']:
            # TMHMM leaves per-run scratch directories behind; remove empty ones.
            for tmp in glob.glob('TMHMM*'):
                if os.path.isdir(tmp): os.rmdir(tmp)
    except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
def codons(self):   ### Main codons analysis method
    '''
    Main codons analysis method. Reads FlyBase D. melanogaster CDS sequences, tabulates
    observed codon and nucleotide-triplet frequencies (respecting exon boundaries), derives
    expected triplet frequencies from nucleotide and codon composition, and writes the
    comparison table to quad_triplet.tdt.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
        scmd = ['accnr=F','seqnr=F','gnspacc=F']
        cds = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
        gcode = rje_sequence.genetic_code   # Keyed by RNA codons, hence the T->U replace below
        ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nts = ['A','C','G','T']
        ntfreq = cds.aaFreq(alphabet=nts)
        codons = []         # List of codons
        obs_cfreq = {}      # Observed codon frequencies
        nts_cfreq = {}      # Codon frequencies from NT frequencies
        obs_tfreq = {}      # Observed triplet frequencies
        nts_tfreq = {}      # Predicted triplet frequencies from NT frequencies
        ocd_tfreq = {}      # Predicted triplet frequencies from observed codon frequencies
        ncd_tfreq = {}      # Predicted triplet frequencies from nt-predicted codon frequencies
        ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for n1 in nts:
            for n2 in nts:
                for n3 in nts:
                    cod = '%s%s%s' % (n1,n2,n3)
                    codons.append(cod)
                    aa = gcode[string.replace(cod,'T','U')]
                    # Codon frequencies are stored per amino acid (synonymous codon families).
                    if aa not in obs_cfreq: obs_cfreq[aa] = {}
                    if aa not in nts_cfreq: nts_cfreq[aa] = {}
                    obs_cfreq[aa][cod] = 0.0
                    nts_cfreq[aa][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                    obs_tfreq[cod] = 0.0
                    nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                    ocd_tfreq[cod] = 0.0
                    ncd_tfreq[cod] = 0.0
        nts_tfreq = rje.dictFreq(nts_tfreq,total=False)     # Normalise triplet freq.
        for aa in nts_cfreq: nts_cfreq[aa] = rje.dictFreq(nts_cfreq[aa],total=False)    # Normalise codon freq.
        self.log.printLog('#FREQ','Frequency dictionaries set up.')
        ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (sx,stot) = (0.0,cds.seqNum())
        # Iterate a copy (cds.seq[0:]) because bad sequences are removed from cds.seq in-flight.
        for seq in cds.seq[0:]:
            self.log.printLog('\r#OBS','Calculating observed codon frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
            sx += 100.0
            # FlyBase CDS name line carries loc=, name=, length= and parent= annotations.
            try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            # Location may be complement(...), join(...) or a plain start..end range.
            try: exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',pos)[0]
            except:
                try: exons = rje.matchExp('^join\((\d+\..*\.\d+)\)',pos)[0]
                except: exons = rje.matchExp('^(\d+\.\.\d+)',pos)[0]
            self.deBug(exons)
            exons = string.split(exons,',')
            elen = []   # Exon lengths in transcript (5'->3') order
            try:
                for exon in exons:
                    (start,end) = string.split(exon,'..')
                    elen.append(string.atoi(end) - string.atoi(start) + 1)
            except:
                self.log.errorLog(id)
                cds.seq.remove(seq)
                continue
            if pos[:4] == 'comp': elen.reverse()    # Reverse strand: exon order flips
            seq.list['ExonLen'] = elen      # Reused by the triplet pass in [2]
            self.deBug(elen)
            if sum(elen) != seq.aaLen(): self.log.errorLog('%s exon length error' % id,printerror=False)
            # Integer vs float division differ only if length is not a multiple of 3.
            if seq.aaLen()/3 != seq.aaLen()/3.0:
                self.log.errorLog('%s not a multiple of 3nt long!' % id,printerror=False)
                cds.seq.remove(seq)
                continue
            #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
            sequence = seq.info['Sequence'][0:]
            if string.count(sequence,'N') > 0:
                self.log.errorLog('%s has 1+ Ns!' % id,printerror=False)
                cds.seq.remove(seq)
                continue
            # Count codons by consuming the CDS three bases at a time.
            while sequence:
                cod = sequence[:3]
                sequence = sequence[3:]
                aa = gcode[string.replace(cod,'T','U')]
                obs_cfreq[aa][cod] += 1
        for aa in obs_cfreq: obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa],total=False)    # Normalise codon freq.
        self.log.printLog('\r#OBS','Calculating observed codon frequencies complete.')
        ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (sx,stot) = (0.0,cds.seqNum())
        for seq in cds.seq:
            self.log.printLog('\r#TRIP','Calculating triplet frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
            sx += 100.0
            elen = seq.list['ExonLen']
            sequence = seq.info['Sequence'][0:]
            aa = ''
            cod = ''
            ax = 0      # Measure sequence length processed for exon boundary checks
            while sequence:
                prevcod = cod
                cod = sequence[:3]
                prevaa = aa
                sequence = sequence[3:]
                aa = gcode[string.replace(cod,'T','U')]
                ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Accumulate expected triplet counts over all synonymous codon choices,
                # only where the triplet does not straddle an exon boundary.
                for cod2 in obs_cfreq[aa]:
                    if elen[0] > ax + 3:    # Exon boundary beyond this codon
                        ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                        ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                    if prevaa:      # Look at overlap with previous codon
                        for cod1 in obs_cfreq[prevaa]:
                            for i in range(1,3):
                                if elen[0] > ax + i:    # Exon boundary beyond overlap
                                    acod = cod1[i:] + cod2[:i]
                                    ocd_tfreq[acod] += (obs_cfreq[prevaa][cod1] * obs_cfreq[aa][cod2])
                                    ncd_tfreq[acod] += (nts_cfreq[prevaa][cod1] * nts_cfreq[aa][cod2])
                ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if elen[0] > ax + 3:    # Exon boundary beyond this codon
                    obs_tfreq[cod] += 1
                if prevcod:     # Look at overlap with previous codon
                    for i in range(1,3):
                        if elen[0] > ax + i:    # Exon boundary beyond overlap
                            acod = prevcod[i:] + cod[:i]
                            obs_tfreq[acod] += 1
                # Check exons #
                ax += 3
                if ax >= elen[0]: ax -= elen.pop(0)     # Crossed into next exon: reset offset
        obs_tfreq = rje.dictFreq(obs_tfreq,total=False)
        ocd_tfreq = rje.dictFreq(ocd_tfreq,total=False)
        ncd_tfreq = rje.dictFreq(ncd_tfreq,total=False)
        self.log.printLog('\r#TRIP','Calculating triplet frequencies complete.')
        ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Triplet','AA','Degen','Obs_Codon','NT_Codon','Obs_Trip','NT_Trip','ObCod_Trip','NTCod_Trip']
        tfile = 'quad_triplet.tdt'
        rje.delimitedFileOutput(self,tfile,headers,rje_backup=True)
        for cod in codons:
            aa = gcode[string.replace(cod,'T','U')]
            datadict = {'Triplet':cod,'AA':aa,'Degen':len(obs_cfreq[aa]),'Obs_Codon':obs_cfreq[aa][cod],
                        'NT_Codon':nts_cfreq[aa][cod],'Obs_Trip':obs_tfreq[cod],'NT_Trip':nts_tfreq[cod],
                        'ObCod_Trip':ocd_tfreq[cod],'NTCod_Trip':ncd_tfreq[cod]}
            rje.delimitedFileOutput(self,tfile,headers,datadict=datadict)
        self.log.printLog('#OUT','Triplet & codon data output to %s' % tfile)
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def codons(self):   ### Main codons analysis method
    '''
    Main codons analysis method (FlyBase CDS codon/triplet frequency comparison; output to
    quad_triplet.tdt).
    NOTE(review): this is an exact duplicate of the codons() definition earlier in this file;
    at class-creation time this later definition wins. Consider removing one copy.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
        scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
        cds = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
        gcode = rje_sequence.genetic_code   # Keyed by RNA codons, hence T->U replace below
        ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nts = ['A', 'C', 'G', 'T']
        ntfreq = cds.aaFreq(alphabet=nts)
        codons = []         # List of codons
        obs_cfreq = {}      # Observed codon frequencies
        nts_cfreq = {}      # Codon frequencies from NT frequencies
        obs_tfreq = {}      # Observed triplet frequencies
        nts_tfreq = {}      # Predicted triplet frequencies from NT frequencies
        ocd_tfreq = {}      # Predicted triplet frequencies from observed codon frequencies
        ncd_tfreq = {}      # Predicted triplet frequencies from nt-predicted codon frequencies
        ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for n1 in nts:
            for n2 in nts:
                for n3 in nts:
                    cod = '%s%s%s' % (n1, n2, n3)
                    codons.append(cod)
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    # Codon frequencies are grouped per amino acid (synonymous families).
                    if aa not in obs_cfreq: obs_cfreq[aa] = {}
                    if aa not in nts_cfreq: nts_cfreq[aa] = {}
                    obs_cfreq[aa][cod] = 0.0
                    nts_cfreq[aa][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                    obs_tfreq[cod] = 0.0
                    nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                    ocd_tfreq[cod] = 0.0
                    ncd_tfreq[cod] = 0.0
        nts_tfreq = rje.dictFreq(nts_tfreq, total=False)    # Normalise triplet freq.
        for aa in nts_cfreq:
            nts_cfreq[aa] = rje.dictFreq(nts_cfreq[aa], total=False)    # Normalise codon freq.
        self.log.printLog('#FREQ', 'Frequency dictionaries set up.')
        ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (sx, stot) = (0.0, cds.seqNum())
        # Iterate a copy because failing sequences are removed from cds.seq during the loop.
        for seq in cds.seq[0:]:
            self.log.printLog('\r#OBS', 'Calculating observed codon frequencies: %.1f%%' % (sx / stot), newline=False, log=False)
            sx += 100.0
            try:
                (id, scaffold, pos, name, glen, parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;', seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            # Location is complement(...), join(...) or a plain start..end range.
            try:
                exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)', pos)[0]
            except:
                try:
                    exons = rje.matchExp('^join\((\d+\..*\.\d+)\)', pos)[0]
                except:
                    exons = rje.matchExp('^(\d+\.\.\d+)', pos)[0]
            self.deBug(exons)
            exons = string.split(exons, ',')
            elen = []   # Exon lengths, transcript order
            try:
                for exon in exons:
                    (start, end) = string.split(exon, '..')
                    elen.append(string.atoi(end) - string.atoi(start) + 1)
            except:
                self.log.errorLog(id)
                cds.seq.remove(seq)
                continue
            if pos[:4] == 'comp': elen.reverse()    # Reverse strand flips exon order
            seq.list['ExonLen'] = elen      # Reused by the triplet pass in [2]
            self.deBug(elen)
            if sum(elen) != seq.aaLen():
                self.log.errorLog('%s exon length error' % id, printerror=False)
            # Integer vs float division differ only for non-multiples of 3.
            if seq.aaLen() / 3 != seq.aaLen() / 3.0:
                self.log.errorLog('%s not a multiple of 3nt long!' % id, printerror=False)
                cds.seq.remove(seq)
                continue
            #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
            sequence = seq.info['Sequence'][0:]
            if string.count(sequence, 'N') > 0:
                self.log.errorLog('%s has 1+ Ns!' % id, printerror=False)
                cds.seq.remove(seq)
                continue
            # Consume the CDS three bases at a time to count codons.
            while sequence:
                cod = sequence[:3]
                sequence = sequence[3:]
                aa = gcode[string.replace(cod, 'T', 'U')]
                obs_cfreq[aa][cod] += 1
        for aa in obs_cfreq:
            obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa], total=False)    # Normalise codon freq.
        self.log.printLog('\r#OBS', 'Calculating observed codon frequencies complete.')
        ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (sx, stot) = (0.0, cds.seqNum())
        for seq in cds.seq:
            self.log.printLog('\r#TRIP', 'Calculating triplet frequencies: %.1f%%' % (sx / stot), newline=False, log=False)
            sx += 100.0
            elen = seq.list['ExonLen']
            sequence = seq.info['Sequence'][0:]
            aa = ''
            cod = ''
            ax = 0      # Measure sequence length processed for exon boundary checks
            while sequence:
                prevcod = cod
                cod = sequence[:3]
                prevaa = aa
                sequence = sequence[3:]
                aa = gcode[string.replace(cod, 'T', 'U')]
                ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Expected triplets over all synonymous codon choices, skipping triplets that
                # would straddle an exon boundary.
                for cod2 in obs_cfreq[aa]:
                    if elen[0] > ax + 3:    # Exon boundary beyond this codon
                        ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                        ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                    if prevaa:  # Look at overlap with previous codon
                        for cod1 in obs_cfreq[prevaa]:
                            for i in range(1, 3):
                                if elen[0] > ax + i:    # Exon boundary beyond overlap
                                    acod = cod1[i:] + cod2[:i]
                                    ocd_tfreq[acod] += (obs_cfreq[prevaa][cod1] * obs_cfreq[aa][cod2])
                                    ncd_tfreq[acod] += (nts_cfreq[prevaa][cod1] * nts_cfreq[aa][cod2])
                ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if elen[0] > ax + 3:    # Exon boundary beyond this codon
                    obs_tfreq[cod] += 1
                if prevcod:     # Look at overlap with previous codon
                    for i in range(1, 3):
                        if elen[0] > ax + i:    # Exon boundary beyond overlap
                            acod = prevcod[i:] + cod[:i]
                            obs_tfreq[acod] += 1
                # Check exons #
                ax += 3
                if ax >= elen[0]: ax -= elen.pop(0)     # Crossed into the next exon
        obs_tfreq = rje.dictFreq(obs_tfreq, total=False)
        ocd_tfreq = rje.dictFreq(ocd_tfreq, total=False)
        ncd_tfreq = rje.dictFreq(ncd_tfreq, total=False)
        self.log.printLog('\r#TRIP', 'Calculating triplet frequencies complete.')
        ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Triplet', 'AA', 'Degen', 'Obs_Codon', 'NT_Codon', 'Obs_Trip', 'NT_Trip', 'ObCod_Trip', 'NTCod_Trip']
        tfile = 'quad_triplet.tdt'
        rje.delimitedFileOutput(self, tfile, headers, rje_backup=True)
        for cod in codons:
            aa = gcode[string.replace(cod, 'T', 'U')]
            datadict = {
                'Triplet': cod,
                'AA': aa,
                'Degen': len(obs_cfreq[aa]),
                'Obs_Codon': obs_cfreq[aa][cod],
                'NT_Codon': nts_cfreq[aa][cod],
                'Obs_Trip': obs_tfreq[cod],
                'NT_Trip': nts_tfreq[cod],
                'ObCod_Trip': ocd_tfreq[cod],
                'NTCod_Trip': ncd_tfreq[cod]
            }
            rje.delimitedFileOutput(self, tfile, headers, datadict=datadict)
        self.log.printLog('#OUT', 'Triplet & codon data output to %s' % tfile)
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
def mapPhosByBLAST(self,fasfile):   ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
    '''
    BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).
    For each query: BLAST vs the PELM fasta database, keep hits above the homology similarity
    cutoff, MUSCLE-align them with the query, then walk the alignment columns mapping known
    phosphosites onto matching query residues. Results go to the PhosRes delimited file and a
    per-query PhosALN alignment fasta.
    >> fasfile:str = fasta file of query sequences.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F']
        qseqlist = rje_seq.SeqList(self.log,scmd)
        qdict = qseqlist.seqNameDic()   # Maps sequence names to query Sequence objects
        ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        basefile = rje.baseFile(fasfile)
        if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile
        headers = ['Name','Pos','AA','PELM','PELMPos','Evidence']
        delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes']))
        rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True)
        ppath = rje.makePath('PhosALN')     # Directory for per-query alignment output
        rje.mkDir(self,ppath)
        ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F'])
        pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile})
        pblast.setStat({'HitAln':pblast.stat['OneLine']})
        pblast.opt['Complexity Filter'] = False
        pblast.formatDB(force=False)
        ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
        # Normalise similarity cutoffs to percentages (fractions <1.0 are scaled up) and clamp at >=0.
        for g in ['ID','Hom']:
            if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
            self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g])
        ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        pblast.blast(use_existing=True,log=True)    # BLAST
        pblast.readBLAST(gablam=True)   # Read in
        # Searches are consumed (popped) as they are processed.
        while pblast.search:
            ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            search = pblast.search.pop(0)
            qseq = qdict[search.info['Name']]
            idlist = []     # Hits close enough for 'ID' (identity-level) evidence
            qlen = qseq.aaLen()
            hitdict = search.hitSeq(self.obj['SeqList'])
            aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F'])
            aln.seq = [qseq]
            pdict = {}      # Dictionary of {hseq:[poslist]}
            rdict = {qseq:0}    # Dictionary of {hseq:res}
            for hit in search.hit[0:]:
                hseq = hitdict[hit]
                pdict[hseq] = []
                # Known phosphosite positions for this hit's accession number.
                for pos in rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos)
                # Self-hit: sanity-check the sequences match and treat as identity evidence.
                if hit.info['Name'] == search.info['Name']:
                    if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False):
                        self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name'])
                    idlist.append(qseq)
                    pdict[qseq] = pdict.pop(hseq)
                    continue
                # qvh = query coverage-weighted percentage identity from GABLAM stats.
                gdict = hit.globalFromLocal(qlen)
                qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                if qvh < self.stat['HomSim']:   # Below homology threshold: drop hit
                    pdict.pop(hseq)
                    continue
                aln.seq.append(hseq)
                if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']: idlist.append(hseq)
                rdict[hseq] = 0
            aln.muscleAln()   #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
            # Extra all-gap 'PhosAln' row to be annotated with mapped sites via self.addPhos.
            aln._addSeq('PhosAln','-' * qseq.seqLen())
            aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])
            ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            print '>>>\n', aln.seq, pdict.keys(), rdict.keys()
            # Walk alignment columns, tracking ungapped residue numbers in rdict.
            for a in range(qseq.seqLen()):
                if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1
                for hseq in pdict:
                    if hseq.info['Sequence'][a] == '-': continue
                    if hseq != qseq: rdict[hseq] += 1
                    # A phosphosite maps when the hit residue is a known site AND the aligned
                    # query residue is identical.
                    if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][a] == hseq.info['Sequence'][a]:  # Phosphosite
                        pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qseq.info['Sequence'][a],
                                 'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'}
                        if hseq == qseq: pdata['Evidence'] = 'Self'
                        elif hseq in idlist: pdata['Evidence'] = 'ID'
                        rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata)
                        self.addPhos(aln.seq[-1],a,pdata['Evidence'])
            ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            aln.saveFasta()
        # Align hits for each > X %ID
        # Map phosphosites onto alignment and output #
        return
    except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
def run(self):  ### Main run method
    '''
    Main run method.

    Pipeline (all side effects are files in the working directory):
      [1] Convert each *.fasta file into a matching *.fas file with renamed
          sequence identifiers, then format it as a protein BLAST database.
      [2] Scan Mascot-style CSV result files for 6RF hits and collect them
          into an accession file plus a {hit: [file:hit_num]} dictionary.
      [3] Extract the hit sequences and run GABLAM against MC58_1.fas
          (skipped if the hitsum table already exists).
      [4] Identify hits with zero BLAST hits, GABLAM them against a larger
          bacterial database, and tabulate the results to MC58_6RF_Zeros.tdt.

    Returns: None. All errors are caught by the blanket except at the end and
    logged; the method never raises to the caller.
    '''
    try:
        ### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for fasta in glob.glob('*.fasta'):
            fas = fasta[:-2]    # 'x.fasta' -> 'x.fas' (drops trailing 'ta')
            # Skip files already converted; output below is append-mode, so
            # re-running on an existing partial file would duplicate content.
            if os.path.exists(fas): continue
            sx = 0  # Sequence counter for progress logging
            for line in open(fasta, 'r').readlines():
                if line[:1] == '>':
                    # Parse header as "name description"; fall back to name only.
                    try: (name, desc) = rje.matchExp('^>(\S+) (\S.+)$', line)
                    except: name = rje.matchExp('^>(\S+)', line)[0]
                    # Rename by pipe-delimited field count:
                    # 3 fields -> six-reading-frame entry (6rf_), description dropped;
                    # 5 fields -> reference entry (ref_), description kept.
                    # NEIME presumably tags Neisseria meningitidis — TODO confirm.
                    if len(string.split(name, '|')) == 3:
                        name = '6rf_NEIME__%s' % string.split(name, '|')[2]
                        open(fas, 'a').write('>%s\n' % name)
                    elif len(string.split(name, '|')) == 5:
                        name = 'ref_NEIME__%s' % string.split(name, '|')[3]
                        # NOTE(review): if the name-only fallback fired above,
                        # desc may be unbound (or stale from a previous header) here.
                        open(fas, 'a').write('>%s %s\n' % (name, desc))
                    else:
                        # Unexpected header format: dump the fields and abort
                        # this run (caught by the outer except and logged).
                        print string.split(name, '|')
                        raise ValueError
                    self.progLog(
                        '\r#FAS',
                        'Processing %s: %s seqs' % (fas, rje.integerString(sx)))
                    sx += 1
                else:
                    # Sequence data lines are copied through unchanged.
                    open(fas, 'a').write(line)
            self.printLog(
                '\r#FAS',
                'Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
            # Format the renamed file as a protein BLAST database.
            rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(fas, protein=True, force=True)
        ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhits = {}  # Dictionary of {hit:['File:hit_num']}
        acc = 'MC58_6RF_Hits.acc'
        open(acc, 'w')  # Truncate/create the accession file; appended to below.
        gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
        cx = 0  # CSV file counter
        for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
            cx += 1
            file = os.path.basename(csv)[:-4]   # Strip '.CSV' extension
            hits = False    # Becomes True once the hit-table header row is seen
            for line in open(csv, 'r').readlines():
                # Rows before the 'prot_hit_num,prot_acc' header are ignored.
                if line.find('prot_hit_num,prot_acc') == 0: hits = True
                elif hits:
                    data = rje.readDelimit(line, ',')
                    if len(data) < 2: continue
                    [num, name] = data[:2]
                    # Keep only pipe-delimited accessions with a 3rd field
                    # (matches the 6rf_ naming in section [1]); skip the rest.
                    try: name = string.split(name, '|')[2]
                    except: continue
                    if name not in rfhits:
                        # First sighting: record accession for sequence extraction.
                        open(acc, 'a').write('6rf_NEIME__%s\n' % name)
                        rfhits[name] = []
                    id = '%s:%s' % (file, num)  # Sample identifier 'File:hit_num'
                    if id not in rfhits[name]: rfhits[name].append(id)
                    self.progLog(
                        '\r#CSV',
                        'Reading %d CSV files: %s 6RF Hits' % (cx, rje.integerString(len(rfhits))))
        self.printLog(
            '\r#CSV',
            'Read %d CSV files: %s 6RF Hits output to %s' % (cx, rje.integerString(len(rfhits)), acc))
        ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Only run if the GABLAM hitsum output is not already present.
        if not os.path.exists(gfile):
            seqlist = rje_seq.SeqList(
                self.log, self.cmd_list + [
                    'seqin=%s' % acc, 'fasdb=MC58_6RF.fas',
                    'seqout=MC58_6RF_Hits.fas', 'autoload=T', 'accnr=F', 'seqnr=F'
                ])
            seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
            seqlist.saveFasta()
            gablam.GABLAM(
                self.log, self.cmd_list + [
                    'seqin=MC58_6RF_Hits.fas', 'searchdb=MC58_1.fas', 'qryacc=F'
                ]).gablam()
        ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdata = rje.dataDict(self, gfile, ['Qry'], ['HitNum'])
        zeros = []  # Queries with no BLAST hits against MC58_1
        for hit in gdata:
            if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
        zeros = rje.sortUnique(zeros, False)
        open('6rf_zeros.acc', 'w').write(string.join(zeros, '\n'))
        self.printLog(
            '#ZERO', '%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
        ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
        # Re-search the zero-hit set against the full bacterial database,
        # again skipping if output already exists.
        if not os.path.exists(ufile):
            seqlist = rje_seq.SeqList(
                self.log, self.cmd_list + [
                    'seqin=6rf_zeros.acc', 'fasdb=MC58_6RF.fas',
                    'seqout=MC58_6RF_Zeros.fas', 'autoload=T', 'accnr=F', 'seqnr=F'
                ])
            seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
            seqlist.saveFasta()
            gablam.GABLAM(
                self.log, self.cmd_list + [
                    'seqin=MC58_6RF_Zeros.fas',
                    'searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas',
                    'qryacc=F'
                ]).gablam()
        # Combine hitsum (summary, with headers) and gablam (per-hit lists) tables.
        gdata = rje.dataDict(self, ufile, ['Qry'], getheaders=True)
        fdata = rje.dataDict(self, string.replace(ufile, 'hitsum', 'gablam'),
                             ['Qry'], ['Hit'], lists=True)
        headers = gdata.pop('Headers')
        headers.insert(1, 'Sample')
        headers.append('BestHit')
        rje.delimitedFileOutput(self, 'MC58_6RF_Zeros.tdt', headers, rje_backup=True)
        for rf in rje.sortKeys(gdata):
            rfcut = string.split(rf, '__')[1]   # Accession after the '6rf_NEIME__' prefix
            gdata[rf]['Sample'] = string.join(rfhits[rfcut], '; ')
            gdata[rf]['Qry'] = rfcut
            # First listed hit is taken as best; '-' when there are no hits.
            # NOTE(review): assumes gablam output orders hits best-first — confirm.
            try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
            except: gdata[rf]['BestHit'] = '-'
            rje.delimitedFileOutput(self,
                                    'MC58_6RF_Zeros.tdt', headers, datadict=gdata[rf])
    except:
        # Blanket handler: log the error (with a Zen quip) and return quietly.
        self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN', rje_zen.Zen().wisdom())