Пример #1
0
 def readSLiMSearchOcc(self,motifs=None):   ### Reads SLiMSearch results into data dictionary
     '''
     Reads SLiMSearch occurrence results into self.dict['Occ'].
     >> motifs:list [None] = Motif names to keep, matched against the first element of each occurrence key.
     Reads '<ResFile>.csv' and populates self.dict['Occ'] as {motif:{type:{gene:[occurrence data]}}}. Type is 'Rev' or
     'Scr' for motif names ending in 'rev' or 'scram' (the suffix plus the preceding character is stripped), else 'ELM'.
     If no motifs are given, a message is logged and self.dict['Occ'] is simply emptied without reading the file.
     Any error is caught and passed to self.log.errorLog().
     '''
     try:### ~ [0] Check input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not motifs:
             # Nothing could pass the motif filter below, so do not bother reading the results file
             self.printLog('#OCC','Cannot process occurrences for No motifs!')
             self.dict['Occ'] = {}
             return
         ### ~ [1] Read ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         occfile = '%s.csv' % self.info['ResFile']
         delimit = rje.delimitFromExt(filename=occfile)
         datakeys = string.split('Seq,Desc,Start_Pos,End_Pos,Cons,HomNum,GlobID,LocID,Hyd,SA',',')
         data = rje.dataDict(self,occfile,mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=datakeys)
         self.dict['Occ'] = {}
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (mx,ox,otot) = (0,0.0,len(data))    # mx = motifs stored; ox/otot = percentage progress
         for occ in data:
             self.progLog('\r#OCC','Processing occurrences (%d motifs): %.2f%%' % (mx,ox/otot)); ox += 100.0
             if string.atoi(data[occ]['HomNum']) < self.stat['MinHom']: continue
             # The dictionary key is the mainkeys values joined by the file delimiter
             (motif,seq,start,end) = string.split(occ,delimit)
             if motif not in motifs: continue
             try:
                 gene = rje.matchExp(r'gene:(\S+)\]',data[occ]['Desc'])[0]
                 golinks = self.ensGO(gene)
                 self.deBug('%s:%s' % (gene,golinks))
                 if not golinks: continue
             except: continue    # Best effort: occurrences without a parsable gene are skipped
             # Split randomised variants from the real motif: strip the suffix and its separator character
             if motif[-3:] == 'rev': (motif,mtype) = (motif[:-4],'Rev')
             elif motif[-5:] == 'scram': (motif,mtype) = (motif[:-6],'Scr')
             else: mtype = 'ELM'
             if motif not in self.dict['Occ']: self.dict['Occ'][motif] = {}; mx += 1
             if mtype not in self.dict['Occ'][motif]: self.dict['Occ'][motif][mtype] = {}
             if gene not in self.dict['Occ'][motif][mtype]: self.dict['Occ'][motif][mtype][gene] = []
             self.dict['Occ'][motif][mtype][gene].append(data[occ])
         self.printLog('\r#OCC','Processed %s occurrences: %d motifs with GO-links' % (rje.integerString(otot),mx))
     except: self.log.errorLog(rje_zen.Zen().wisdom())
Пример #2
0
    def run(self,setup=True):  ### Main Run Method
        '''
        Main Run Method. Runs each stage of the pipeline in turn; errors are logged and then re-raised.
        >> setup:bool [True] = Sets up headers and reads in existing data if present.
        '''
        try:
            ### ~ Setup & start the output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if setup: self.setup()
            headers = self.list['Headers']
            cardout = self.info['CardOut']
            delimit = rje.delimitFromExt(filename=cardout)
            if os.path.exists(self.info['EnsLoci']):
                # Extra EnsLoci fields are only added (in place) when an EnsLoci file exists
                headers.extend([field for field in ['EnsLoci','EnsDesc'] if field not in headers])
            rje.delimitedFileOutput(self,cardout,headers,delimit,rje_backup=True)

            ### ~ Processing stages, in order ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.ensLoci()                          # Read EnsLoci for incorporation
            self.processGenes(self.list['Genes'])   # Parse data from GeneCards website and/or previously read aliases
            self.interactiveUpdate()
            self.addEnsLoci()                       # Add EnsEMBL EnsLoci data
            self.outputCards()                      # Output GeneCards data

        except:
            self.log.errorLog('Apocalyptic error with GeneCards.run()')
            raise
Пример #3
0
 def outputCards(self):  ### Outputs cards to delimited file
     '''
     Outputs cards to delimited file.
     Each entry of self.dict['GeneCard'] that passes the Restrict/Purify options is given 'Alias' and 'Species' values
     and passed to rje.delimitedFileOutput() for the self.info['CardOut'] file, which is opened in append mode.
     If both Purify and Restrict are set, aliases in self.list['Genes'] are first replaced (in place) by their symbols.
     The output file handle is always closed, even if an error occurs during output.
     '''
     ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     genelist = self.list['Genes']
     cards = self.dict['GeneCard']
     if self.opt['Purify'] and self.opt['Restrict']:
         for gene in genelist[0:]:    # Iterate over a copy as genelist is modified
             symbol = cards[gene]['Symbol']
             if symbol not in [gene,'!FAILED!']:  # Replace with symbol
                 genelist.remove(gene)
                 if symbol not in genelist: genelist.append(symbol)
     delimit = rje.delimitFromExt(filename=self.info['CardOut'])
     (noens,noloci,ox) = (0,0,0)
     CARDOUT = open(self.info['CardOut'],'a')
     ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     try:
         for gene in rje.sortKeys(cards):
             card = cards[gene]
             if self.opt['Restrict'] and gene not in genelist: continue
             elif self.opt['Purify'] and card['Symbol'] not in [gene,'!FAILED!']: continue
             self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox)); ox += 1
             card['Alias'] = gene
             card['Species'] = self.info['Species']
             rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,card)
             if card['Symbol'] == gene:   # Not an alias
                 if 'EnsEMBL' not in card or not card['EnsEMBL']: noens += 1
                 if 'EnsLoci' not in card or not card['EnsLoci']: noloci += 1
     finally: CARDOUT.close()
     # NOTE(review): this reports the size of the gene list rather than the number of cards written (ox) - confirm
     self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (len(self.list['Genes']),self.info['CardOut']))
     self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
Пример #4
0
 def loadFeatures(self,ftfile):  ### Loads features from given file
     '''
     Loads features from given delimited file into self.dict['Features'].
     >> ftfile:str = Name of features file. Nothing is done if '' or 'none'; a missing file is logged as an error.
     The file must have (case-insensitive) feature, start, end and description fields, where start/end may instead be
     named ft_start/ft_end. The first field is used as the ID. Each description is added to self.dict['Features'][ID]
     as {'Type':feature,'Start':int,'End':int,'Desc':description}. Any error is caught and passed to self.errorLog().
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if ftfile in ['','none']: return
         if not os.path.exists(ftfile): return self.printLog('#ERR','Features file "%s" missing' % ftfile)
         delimit = rje.delimitFromExt(filename=ftfile)
         ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         FTFILE = open(ftfile,'r')
         try: headers = rje.readDelimit(FTFILE.readline(),delimit)
         finally: FTFILE.close()
         mainkeys = [headers[0]]
         hmap = {}   # Maps lower case field name onto actual header
         for h in headers: hmap[h.lower()] = h
         pos = ''    # Leader for start/end positions
         if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
         for h in ['feature','%sstart' % pos,'%send' % pos,'description']:
             if h not in hmap: return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile))
             mainkeys.append(hmap[h])
         descfield = hmap['description']     # Actual header, which might differ in case from 'description'
         mainkeys.remove(descfield)
         ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Request the description using the same header name that is used to read it back below
         ftdata = rje.dataDict(self,ftfile,mainkeys,[descfield],delimit,headers,lists=True)
         (mx,mtot,fx) = (0.0,len(ftdata),0)
         for mainkey in rje.sortKeys(ftdata):
             self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot))
             mx += 100.0
             (ftid,ft,start,end) = string.split(mainkey,delimit)
             if ftid == mainkeys[0]: continue    # Skip the header line
             if ftid not in self.dict['Features']: self.dict['Features'][ftid] = []
             for desc in ftdata[mainkey][descfield]:
                 fx += 1
                 self.dict['Features'][ftid].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc})
         self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile))
     except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
Пример #5
0
 def loadFeatures(self, ftfile):  ### Loads features from given file
     '''
     Loads features from given delimited file into self.dict['Features'].
     >> ftfile:str = Name of features file. Nothing is done if '' or 'none'; a missing file is logged as an error.
     The file must have (case-insensitive) feature, start, end and description fields, where start/end may instead be
     named ft_start/ft_end. The first field is used as the ID. Each description is added to self.dict['Features'][ID]
     as {'Type':feature,'Start':int,'End':int,'Desc':description}. Any error is caught and passed to self.errorLog().
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if ftfile in ['', 'none']: return
         if not os.path.exists(ftfile):
             return self.printLog('#ERR',
                                  'Features file "%s" missing' % ftfile)
         delimit = rje.delimitFromExt(filename=ftfile)
         ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         FTFILE = open(ftfile, 'r')
         try:
             headers = rje.readDelimit(FTFILE.readline(), delimit)
         finally:
             FTFILE.close()
         mainkeys = [headers[0]]
         hmap = {}  # Maps lower case field name onto actual header
         for h in headers:
             hmap[h.lower()] = h
         pos = ''  # Leader for start/end positions
         if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
         for h in [
                 'feature',
                 '%sstart' % pos,
                 '%send' % pos, 'description'
         ]:
             if h not in hmap:
                 return self.printLog(
                     '#ERR', 'No %s field detected in "%s" features file' %
                     (h, ftfile))
             mainkeys.append(hmap[h])
         # Actual header, which might differ in case from 'description'
         descfield = hmap['description']
         mainkeys.remove(descfield)
         ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Request the description using the same header name that is used to read it back below
         ftdata = rje.dataDict(self,
                               ftfile,
                               mainkeys, [descfield],
                               delimit,
                               headers,
                               lists=True)
         (mx, mtot, fx) = (0.0, len(ftdata), 0)
         for mainkey in rje.sortKeys(ftdata):
             self.progLog(
                 '\r#FT',
                 'Loading features from %s: %.2f%%' % (ftfile, mx / mtot))
             mx += 100.0
             (ftid, ft, start, end) = string.split(mainkey, delimit)
             if ftid == mainkeys[0]: continue  # Skip the header line
             if ftid not in self.dict['Features']:
                 self.dict['Features'][ftid] = []
             for desc in ftdata[mainkey][descfield]:
                 fx += 1
                 self.dict['Features'][ftid].append({
                     'Type': ft,
                     'Start': int(start),
                     'End': int(end),
                     'Desc': desc
                 })
         self.printLog(
             '\r#FT', 'Loaded %s features for %s IDs from %s' %
             (rje.integerString(fx),
              rje.integerString(len(self.dict['Features'])), ftfile))
     except:
         self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
Пример #6
0
 def run(self):  ### Main run method
     '''
     Main run method.
     Loads sequences (plus occurrence data if an OccFile is given), calculates the statistics listed in PlotStat for
     each sequence and calls self.output() for each sequence. Any error is caught and passed to self.errorLog().
     '''
     try:### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         basefile = self.info['Basefile']
         if basefile.lower() in ['','none']: self.info['Basefile'] = ''
         elif not basefile.endswith('.'): self.info['Basefile'] = basefile + '.'    # Basefile is used as a prefix
         self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
         self.list['PlotFT'] = ' '.join(self.list['PlotFT']).upper().split()
         if self.info['OccFile'].lower() not in ['','none']:
             self.info['Delimit'] = rje.delimitFromExt(filename=self.info['OccFile'])
             self.dict['OccData'] = {}   # {seq:{dataset:[occurrence data]}}
             occfields = ['Seq','Dataset','Pattern','Start_Pos','End_Pos']
             occdata = rje.dataDict(self,self.info['OccFile'],list(occfields),list(occfields))
             for okey in rje.sortKeys(occdata):
                 entry = occdata[okey]
                 seqname = entry.pop('Seq')
                 seqocc = self.dict['OccData'].setdefault(seqname,{})
                 setname = entry.pop('Dataset')
                 seqocc.setdefault(setname,[]).append(entry)
             occx = rje.integerString(len(occdata))
             seqx = rje.integerString(len(self.dict['OccData']))
             self.printLog('#OCC','Loaded data for %s occurrences in %s sequences' % (occx,seqx))
             # Restrict the sequence list to sequences with occurrences
             self.obj['SeqList'].autoFilter(['GoodSeq=%s' % ','.join(rje.sortKeys(self.dict['OccData']))])
         ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['PlotStat'] = ' '.join(self.list['PlotStat']).lower().split()
         usecons = 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']
         if usecons: slimcalc = rje_slimcalc.SLiMCalc(self.log,self.cmd_list)
         seqdict = self.obj['SeqList'].seqNameDic()
         for name in rje.sortKeys(seqdict):
             if self.opt['OccOnly'] and name not in self.dict['OccData']: continue
             seq = seqdict[name]
             sequence = seq.getSequence(gaps=False)
             seq.dict['PlotStat'] = {}
             if 'sa' in self.list['PlotStat']:
                 seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence,returnlist=True)
             if 'hyd' in self.list['PlotStat']:
                 seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence,returnlist=True)
             if 'dis' in self.list['PlotStat']:
                 seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
             if usecons:
                 slimcalc.relConListFromSeq(seq,slimcalc.stat['RelConWin'],store=True)
                 try:    # Move the stored conservation lists into the plot statistics
                     seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                     seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                 except: self.printLog('#CONS','No conservation stats for %s' % name)
             self.printLog('#STAT','PlotStats calculated for %s' % name)
             for stat in seq.dict['PlotStat']:
                 # Cons_Rel is never passed through plotWin()
                 if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0:
                     seq.dict['PlotStat'][stat] = self.plotWin(seq.dict['PlotStat'][stat])
                 seq.dict['PlotStat'][stat] = self.convertStat(seq.dict['PlotStat'][stat])
             self.printLog('#STAT','PlotStats converted for %s' % name)
             ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             if name in self.dict['OccData']:
                 for dataset in self.dict['OccData'][name]:     # One output file per dataset
                     ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'],dataset,seq.info['AccNum'])
                     self.output(seq,ofile,self.dict['OccData'][name][dataset])
             else:
                 ofile = '%s%s.plot.txt' % (self.info['Basefile'],seq.info['AccNum'])
                 self.output(seq,ofile)
         return
     except: self.errorLog(rje_zen.Zen().wisdom())
Пример #7
0
 def restOutput(self,outfmt=None,maxparsesize=0,asjson=False):    ### Returns rest output for outfmt
     '''
     Returns rest output for outfmt.
     >> outfmt:str [None] = REST output key. The value of self.getStrLC('Rest') is used if not given.
     >> maxparsesize:int [0] = Maximum size (bytes) of an output file to return. No limit is applied unless > 0.
     >> asjson:bool [False] = Passed on to self.jsonText(). Comma/tab delimited files are returned as a list of rows,
         each converted with json.dumps().
     << Output text for outfmt, or a message explaining why there is none.
     '''
     if not outfmt: outfmt = self.getStrLC('Rest')
     if not outfmt: return self.jsonText('No REST output',asjson)
     if outfmt in self.dict['Output']:
         rfile = string.split(self.dict['Output'][outfmt],'\n')[0]   # First line of stored output may be a file name
         if rje.exists(rfile):
             fext = string.split(rfile,'.')[-1]
             if fext in ['png']:     # Return the file name rather than trying to read image data as text
                 self.debug(rfile)
                 return self.jsonText(rfile,asjson)
             nbytes = os.path.getsize(rfile)
             if nbytes > maxparsesize > 0:   # Too large to parse
                 otext = '%s is too large to return (%s > %s)' % (os.path.basename(rfile),rje.humanByteSize(nbytes),rje.humanByteSize(maxparsesize))
                 try: jobid = self.dict['Output']['jobid']
                 except: jobid = None
                 resturl = '%sretrieve&jobid=%s&rest=%s[&password=X]' % (self.getStr('RestURL'),jobid,outfmt)
                 if not jobid or outfmt == self.getStrLC('Rest'): return self.jsonText('ERROR: %s' % (otext),asjson)
                 else: return self.jsonText('%s in full output. Try %s.' % (otext,resturl),asjson)
             delimit = rje.delimitFromExt(filename=rfile,write=False)
             RFILE = open(rfile,'r')
             try:
                 if asjson and delimit in [',','\t']:
                     jtext = [json.dumps(rje.readDelimit(rline,delimit)) for rline in RFILE.readlines()]
                     return '[%s]' % string.join(jtext,',\n        ')
                 #!# Add json parsing of fasta files?
                 outtxt = RFILE.read()
             finally: RFILE.close()
             if not outtxt.endswith('\n'): outtxt += '\n'
             return self.jsonText(outtxt,asjson)
         elif asjson and outfmt in self.dict['Outfile']:
             pass    #!# Sort out json formatting here based on file extension!
         return self.dict['Output'][outfmt]      # Stored output is not a file: returned as it is
     elif outfmt in ['parse','format']:
         intro = '<pre>%s</pre>\n\n' % self.restOutput('intro')
         return self.jsonText(intro,asjson)
     elif outfmt in ['default','full']: return self.jsonText(self.restFullOutput(maxparsesize),asjson)
     elif outfmt in ['restkeys','outputs']: return string.join(self.list['RestKeys']+[''],'\n')
     return self.jsonText('No %s output generated.' % outfmt,asjson)
Пример #8
0
    def mapPhosByBLAST(self,fasfile):   ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''
        BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).
        >> fasfile:str = Sequence file, loaded as the query sequences and used as the BLAST input file.
        Each mapped site is output to the self.info['PhosRes'] file with an Evidence of 'Self', 'ID' or 'Hom' and each
        query alignment is saved in the PhosALN directory. Any error is caught and passed to self.log.errorLog().
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F']
            qseqlist = rje_seq.SeqList(self.log,scmd)
            qdict = qseqlist.seqNameDic()   # Used below to find the query sequence object from the BLAST search name
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            # Default results file name is based on the input file
            if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name','Pos','AA','PELM','PELMPos','Evidence']
            delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes']))
            # Called with headers only (no data) and rje_backup=True - presumably starts a new results file
            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self,ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F'])
            pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile})
            pblast.setStat({'HitAln':pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key']
            # IDSim and HomSim values below 1.0 are multiplied by 100 (NB. this permanently changes self.stat) and
            # negative values are set to zero. They are compared to the percentage qvh below.
            for g in ['ID','Hom']:
                if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True,log=True)    # BLAST
            pblast.readBLAST(gablam=True)               # Read in
            # Searches are removed from pblast.search as they are processed
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []     # Sequences whose sites are given 'ID' (rather than 'Hom') evidence
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F'])
                aln.seq = [qseq]
                pdict = {}      # Dictionary of {hseq:[poslist]}
                rdict = {qseq:0}      # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos)
                    # Self-hit: positions are stored under the query sequence, which is already in the alignment
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False):
                            self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
                    gdict = hit.globalFromLocal(qlen)
                    # Query GABLAM statistic as a percentage of query length
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    # Hits below the HomSim cut-off are neither aligned nor mapped
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']: idlist.append(hseq)
                    rdict[hseq] = 0
                aln.muscleAln()   #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']))
                # Extra all-gap 'PhosAln' sequence is added last: this is the aln.seq[-1] passed to self.addPhos() below
                aln._addSeq('PhosAln','-' * qseq.seqLen())
                aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])
                ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # NOTE(review): Python 2 print statement to stdout - looks like leftover debugging output
                print '>>>\n', aln.seq, pdict.keys(), rdict.keys()
                # Move along the alignment columns. rdict counts the non-gap residues of each sequence so far, i.e.
                # the 1-based position in the ungapped sequence, which is what is compared with the pdict positions.
                for a in range(qseq.seqLen()):
                    if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1
                    for hseq in pdict:
                        if hseq.info['Sequence'][a] == '-': continue
                        if hseq != qseq: rdict[hseq] += 1   # Query counter was already incremented above
                        # Sites are only mapped where the query has the same character in this column
                        if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][a] == hseq.info['Sequence'][a]:  # Phosphosite
                            pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qseq.info['Sequence'][a],
                                     'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'}
                            if hseq == qseq: pdata['Evidence'] = 'Self'
                            elif hseq in idlist: pdata['Evidence'] = 'ID'
                            rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata)
                            self.addPhos(aln.seq[-1],a,pdata['Evidence'])
                ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                aln.saveFasta()


            # Align hits for each > X %ID
            # Map phosphosites onto alignment and output #
            
            return
        except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
Пример #9
0
 def run(self):  ### Main run method
     '''
     Main run method.
     Loads sequences (plus occurrence data if an OccFile is given), calculates the statistics listed in PlotStat for
     each sequence and calls self.output() for each sequence. Any error is caught and passed to self.errorLog().
     '''
     try:  ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         basefile = self.info['Basefile']
         if basefile.lower() in ['', 'none']: self.info['Basefile'] = ''
         elif not basefile.endswith('.'): self.info['Basefile'] = basefile + '.'  # Basefile is used as a prefix
         self.obj['SeqList'] = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
         self.list['PlotFT'] = ' '.join(self.list['PlotFT']).upper().split()
         if self.info['OccFile'].lower() not in ['', 'none']:
             self.info['Delimit'] = rje.delimitFromExt(filename=self.info['OccFile'])
             self.dict['OccData'] = {}  # {seq:{dataset:[occurrence data]}}
             occfields = ['Seq', 'Dataset', 'Pattern', 'Start_Pos', 'End_Pos']
             occdata = rje.dataDict(self, self.info['OccFile'], list(occfields), list(occfields))
             for okey in rje.sortKeys(occdata):
                 entry = occdata[okey]
                 seqname = entry.pop('Seq')
                 seqocc = self.dict['OccData'].setdefault(seqname, {})
                 setname = entry.pop('Dataset')
                 seqocc.setdefault(setname, []).append(entry)
             occx = rje.integerString(len(occdata))
             seqx = rje.integerString(len(self.dict['OccData']))
             self.printLog('#OCC', 'Loaded data for %s occurrences in %s sequences' % (occx, seqx))
             # Restrict the sequence list to sequences with occurrences
             goodseq = 'GoodSeq=%s' % ','.join(rje.sortKeys(self.dict['OccData']))
             self.obj['SeqList'].autoFilter([goodseq])
         ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['PlotStat'] = ' '.join(self.list['PlotStat']).lower().split()
         usecons = 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']
         if usecons: slimcalc = rje_slimcalc.SLiMCalc(self.log, self.cmd_list)
         seqdict = self.obj['SeqList'].seqNameDic()
         for name in rje.sortKeys(seqdict):
             if self.opt['OccOnly'] and name not in self.dict['OccData']: continue
             seq = seqdict[name]
             sequence = seq.getSequence(gaps=False)
             seq.dict['PlotStat'] = {}
             if 'sa' in self.list['PlotStat']:
                 seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence, returnlist=True)
             if 'hyd' in self.list['PlotStat']:
                 seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence, returnlist=True)
             if 'dis' in self.list['PlotStat']:
                 seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
             if usecons:
                 slimcalc.relConListFromSeq(seq, slimcalc.stat['RelConWin'], store=True)
                 try:  # Move the stored conservation lists into the plot statistics
                     seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                     seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                 except: self.printLog('#CONS', 'No conservation stats for %s' % name)
             self.printLog('#STAT', 'PlotStats calculated for %s' % name)
             for stat in seq.dict['PlotStat']:
                 # Cons_Rel is never passed through plotWin()
                 if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0:
                     seq.dict['PlotStat'][stat] = self.plotWin(seq.dict['PlotStat'][stat])
                 seq.dict['PlotStat'][stat] = self.convertStat(seq.dict['PlotStat'][stat])
             self.printLog('#STAT', 'PlotStats converted for %s' % name)
             ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             if name in self.dict['OccData']:
                 for dataset in self.dict['OccData'][name]:  # One output file per dataset
                     ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'], dataset, seq.info['AccNum'])
                     self.output(seq, ofile, self.dict['OccData'][name][dataset])
             else:
                 ofile = '%s%s.plot.txt' % (self.info['Basefile'], seq.info['AccNum'])
                 self.output(seq, ofile)
         return
     except: self.errorLog(rje_zen.Zen().wisdom())
Пример #10
0
    def mapPhosByBLAST(
        self, fasfile
    ):  ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology)
        '''
        BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).
        >> fasfile:str = Sequence file, loaded as the query sequences and used as the BLAST input file.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scmd = self.cmd_list + [
                'seqin=%s' % fasfile, 'autoload=T', 'autofilter=F'
            ]
            qseqlist = rje_seq.SeqList(self.log, scmd)
            # Used below to find the query sequence object from the BLAST search name
            qdict = qseqlist.seqNameDic()
            ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            basefile = rje.baseFile(fasfile)
            # Default results file name is based on the input file
            if self.info['PhosRes'].lower() in ['', 'none']:
                self.info['PhosRes'] = '%s.phosres.tdt' % basefile
            headers = ['Name', 'Pos', 'AA', 'PELM', 'PELMPos', 'Evidence']
            delimit = rje.getDelimit(
                self.cmd_list,
                rje.delimitFromExt(filename=self.info['PhosRes']))
            # Called with headers only (no data) and rje_backup=True - presumably starts a new results file
            rje.delimitedFileOutput(self,
                                    self.info['PhosRes'],
                                    headers,
                                    delimit,
                                    rje_backup=True)
            ppath = rje.makePath('PhosALN')
            rje.mkDir(self, ppath)
            ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            pblast = rje_blast.BLASTRun(self.log,
                                        self.cmd_list + ['formatdb=F'])
            pblast.setInfo({
                'Name': '%s.p.blast' % rje.baseFile(fasfile),
                'DBase': self.info['PELMFas'],
                'InFile': fasfile
            })
            pblast.setStat({'HitAln': pblast.stat['OneLine']})
            pblast.opt['Complexity Filter'] = False
            pblast.formatDB(force=False)
            ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            gkey = 'GABLAMO ID'  #x# % self.info['GABLAMO Key']
            # IDSim and HomSim values below 1.0 are multiplied by 100 (NB. this permanently changes self.stat) and
            # negative values are set to zero. They are compared to the percentage qvh below.
            for g in ['ID', 'Hom']:
                if self.stat['%sSim' % g] < 1.0:
                    self.stat['%sSim' % g] *= 100.0
                self.stat['%sSim' % g] = max(0.0, self.stat['%sSim' % g])

            ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pblast.blast(use_existing=True, log=True)  # BLAST
            pblast.readBLAST(gablam=True)  # Read in
            # Searches are removed from pblast.search as they are processed
            while pblast.search:
                ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                search = pblast.search.pop(0)
                qseq = qdict[search.info['Name']]
                idlist = []  # Sequences that pass the IDSim (and species) test below
                qlen = qseq.aaLen()
                hitdict = search.hitSeq(self.obj['SeqList'])
                aln = rje_seq.SeqList(
                    self.log, self.cmd_list + ['autoload=F', 'autofilter=F'])
                aln.seq = [qseq]
                pdict = {}  # Dictionary of {hseq:[poslist]}
                rdict = {qseq: 0}  # Dictionary of {hseq:res}
                for hit in search.hit[0:]:
                    hseq = hitdict[hit]
                    pdict[hseq] = []
                    for pos in rje.sortKeys(
                            self.dict['PhosphoSites'][hseq.info['AccNum']]):
                        pdict[hseq].append(pos)
                    # Self-hit: positions are stored under the query sequence, which is already in the alignment
                    if hit.info['Name'] == search.info['Name']:
                        if qseq.getSequence(case=False,
                                            gaps=False) != hseq.getSequence(
                                                case=False, gaps=False):
                            self.log.errorLog(
                                'Major problem: Search/Hit sequence mismatch for same sequence "%s"'
                                % hit.info['Name'])
                        idlist.append(qseq)
                        pdict[qseq] = pdict.pop(hseq)
                        continue
                    gdict = hit.globalFromLocal(qlen)
                    # Query GABLAM statistic as a percentage of query length
                    qvh = float(100 * gdict['Query'][gkey]) / float(qlen)
                    # Hits below the HomSim cut-off are not added to the alignment
                    if qvh < self.stat['HomSim']:
                        pdict.pop(hseq)
                        continue
                    aln.seq.append(hseq)
                    if (qseq.sameSpec(hseq) or not self.opt['UseSpec']
                        ) and qvh >= self.stat['IDSim']:
                        idlist.append(hseq)
                    rdict[hseq] = 0
                    # NOTE(review): this excerpt ends here, without the rest of the loop or the closing except clause
Пример #11
0
    def picsi(self):    ### Cleans up cross-species search results
        '''
        Cleans up cross-species search results.

        Reads the delimited summary file named by self.info['SumFile'], keyed on the 'search' and 'prot_hit_num'
        fields, and for each search:
        - normalises peptides (I->L and Q->K) and folds peptides that are substrings of a longer peptide from the
          same search into that longer peptide;
        - removes peptides of self.info['QrySpec'] hits from hits of other species;
        - counts unique peptides per hit and classes each hit as REJECT, UNIQUE, NR or REDUNDANT.
        Results are written to '<SumFile basefile>.clean.tdt'. Nothing is returned; any exception is caught and
        reported through self.errorLog().
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datafile = self.info['SumFile']
            delimit = rje.delimitFromExt(filename=datafile)
            data = {}       # search:{prot:{field:value}}
            pep2prot = {}   # search:{peptide:[hits]}
            id2prot = {}    # search:{id:hit}
            prot2desc = {}  # prot:description
            fullpeplist = {}    # search:[peptides]
            pepcon = {}     # search:{pep:longer pep}
            speclist = []   # List of species codes
            ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True)
            for ikey in rje.sortKeys(indata):
                (search,hitid) = string.split(ikey,delimit)
                prot = indata[ikey]['prot_acc'][0]
                desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','')
                if desc[3:7] == 'Name': desc = desc[9:]
                prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc))
                rawpep = string.join(indata[ikey]['pep_seq'],'|')
                # Converted peptides treat I/L and Q/K as equivalent residues
                pepconv = string.replace(rawpep,'I','L')
                pepconv = string.replace(pepconv,'Q','K')
                peplist = rje.sortUnique(string.split(pepconv,'|'))
                indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(rawpep,'|')),'|')
                if search not in data:
                    data[search] = {}
                    pep2prot[search] = {}
                    id2prot[search] = {}
                    fullpeplist[search] = []
                    pepcon[search] = {}
                fullpeplist[search] += peplist
                id2prot[search][hitid] = prot
                spec = string.split(prot,'_')[1]
                if spec not in speclist: speclist.append(spec)
                data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':hitid,'desc':desc,'spec':spec,
                                      'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                      'pep_rem':0}
                # Fall back on the last '__'-delimited element of the hit name if no sequence object is available
                try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
                except: data[search][prot]['accnum'] = string.split(prot,'__')[-1]
                for pep in peplist:
                    if pep not in pep2prot[search]:
                        pep2prot[search][pep] = []
                    pep2prot[search][pep].append(prot)
            ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for search in fullpeplist:
                fullpeplist[search] = rje.sortUnique(fullpeplist[search])
                # Iterate over a copy: peptides contained in another peptide are removed from the live list
                for pep in fullpeplist[search][0:]:
                    for pep2 in fullpeplist[search]:
                        if pep != pep2 and pep in pep2:
                            pepcon[search][pep] = pep2
                            fullpeplist[search].remove(pep)
                            break
                # Follow conversion chains (pep -> longer -> longest) so each peptide maps to its final target.
                # Targets are always strictly longer than their source, so the chain must terminate.
                for pep in pepcon[search]:
                    while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
                self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search))
                #self.deBug(pepcon[search])
                #self.deBug(rje.sortKeys(pep2prot[search]))
                pp = 0; pm = 0  # Counts of converted peptides added to / removed from hits
                for prot in data[search]:
                    for pep in data[search][prot]['conpep'][0:]:
                        if pep in pepcon[search]:
                            newpep = pepcon[search][pep]
                            if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1
                            data[search][prot]['conpep'].remove(pep); pm += 1
                            if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                            if pep in pep2prot[search]: pep2prot[search].pop(pep)
                    data[search][prot]['pep_con'] = len(data[search][prot]['conpep'])
                self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm))
            ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for search in pep2prot:
                ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Peptides found in a query-species hit are stripped from all hits of other species
                remx = 0
                for prot in data[search]:
                    if data[search][prot]['spec'] != self.info['QrySpec']: continue
                    for pep in data[search][prot]['conpep']:
                        for prot2 in pep2prot[search][pep][0:]:
                            if data[search][prot2]['spec'] == self.info['QrySpec']: continue
                            pep2prot[search][pep].remove(prot2)
                            data[search][prot2]['conpep'].remove(pep)
                            data[search][prot2]['pep_rem'] += 1; remx += 1
                self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec']))
                ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Hits left with fewer than two peptides no longer count towards peptide uniqueness
                for prot in data[search]:
                    if len(data[search][prot]['conpep']) < 2:
                        for pep in data[search][prot]['conpep']:
                            #if pep in pep2prot[search] and prot in pep2prot[search][pep]:
                            pep2prot[search][pep].remove(prot)
                ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ux = 0
                for pep in pep2prot[search]:
                    #self.deBug(pep)
                    if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
                self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search))
                ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
                rx = 0  # Number of hits classed as REJECT or REDUNDANT
                for prot in data[search]:
                    #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue
                    data[search][prot]['pep_red'] = 0   # Redundant peptides found in proteins with unique peptides
                    data[search][prot]['pep_nr'] = 0    # Redundant peptides found only in proteins without unique peptides
                    for pep in data[search][prot]['conpep']:
                        if pep2prot[search][pep] == [prot]: continue
                        upep = False
                        for prot2 in pep2prot[search][pep]:
                            if data[search][prot2]['pep_uniq']: upep = True; break
                        if upep: data[search][prot]['pep_red'] += 1     # Redundant peptide found in unique protein
                        else: data[search][prot]['pep_nr'] += 1         # Redundant peptide NOT found in unique protein
                    if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1
                    elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE'
                    elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR'
                    else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1
                    summary[data[search][prot]['class']] += 1
                self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search))
                for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x))

            ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            speclist.sort()
            species = {}
            for spec in speclist:
                # NOTE(review): spec and the SpecTDT path are interpolated unquoted into a shell command; this is
                # only safe while both come from trusted input. Any failure (e.g. too few ':' fields) gives '?'.
                try:
                    grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read()
                    species[spec] = string.split(grep,':')[-4]
                    self.printLog('#SPEC','%s = %s' % (spec,species[spec]))
                except: species[spec] = '?'

            ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
            headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
            if self.dict['Acc2Seq']: headers.insert(3,'cluster')
            rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True)
            for search in rje.sortKeys(data):
                if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search])
                for prot in rje.sortKeys(data[search]):
                    # Descriptions of the form 'gi:<num>...[<species>]' carry their own species name
                    gimatch = rje.matchExp(r'^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])
                    if gimatch: data[search][prot]['species'] = gimatch[1]
                    else: data[search][prot]['species'] = species[data[search][prot]['spec']]
                    rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot])

        except: self.errorLog('Errg')