예제 #1
0
 def makeGOFile(self):   ### Maps GO to sequences and outputs table for R analysis
     '''
     Maps GO to sequences and outputs table for R analysis.

     Writes a delimited table named '<ResFile>.goer.tdt' with one row per stored occurrence of each gene, repeated
     for every GO term (from self.ensGO(gene)) that is shared by at least MinOcc genes of that motif/type.
     Side effect: occurrence types with fewer than MinOcc entries are popped from self.dict['Occ'][motif].
     A motif is only output if 'ELM' and at least one other type remain after that filter.
     Any exception is handed to self.log.errorLog() rather than raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         outfile = '%s.goer.tdt' % self.info['ResFile']
         headers = ['GOID','Motif','Type','Gene','Cons','HomNum','GlobID','LocID','Hyd','SA']
         rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)    # Header-only call (no datadict)
         ### ~ [2] ~ Work through dictionary and output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         allocc = self.dict['Occ']
         mtot = len(allocc)
         mx = -100.0     # Incremented before each progress message, so the first motif reports 0.0%
         for motif in rje.sortKeys(allocc):
             mx += 100.0
             self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|CheckSeq)         ' % (outfile,(mx/mtot),motif))
             motifocc = allocc[motif]
             ## ~ [2a] ~ Check MinOcc in terms of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for occtype in rje.sortKeys(motifocc):  # Sorted copy of keys, so popping in the loop is safe
                 if len(motifocc[occtype]) < self.stat['MinOcc']: motifocc.pop(occtype)
             if not ('ELM' in motifocc and len(motifocc) >= 2): continue
             for occtype in motifocc:
                 ## ~ [2b] ~ Map GO terms and check MinOcc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Check%s) ' % (outfile,(mx/mtot),motif,occtype))
                 godict = {}     # Temp dictionary of {GOID:[Genes]}
                 for gene in motifocc[occtype]:
                     for go in self.ensGO(gene):
                         godict.setdefault(go,[]).append(gene)
                 self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|OccGO%s) ' % (outfile,(mx/mtot),motif,occtype))
                 for go in rje.sortKeys(godict):
                     if len(godict[go]) < self.stat['MinOcc']: godict.pop(go)
                 ## ~ [2c] ~ Output remaining GO terms occurrences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.progLog('\r#OUT','Generating %s output: %.1f%% (%s|Output%s)' % (outfile,(mx/mtot),motif,occtype))
                 for go in rje.sortKeys(godict):
                     for gene in godict[go]:
                         for occdict in motifocc[occtype][gene]:
                             basedict = {'GOID':'GO:%s' % go,'Motif':motif,'Type':occtype,'Gene':gene}
                             datadict = rje.combineDict(basedict,occdict)
                             rje.delimitedFileOutput(self,outfile,headers,datadict=datadict)
             self.printLog('#OUT','Output for %s %s complete.' % (motif,rje.sortKeys(self.dict['Occ'][motif])),screen=False)
         self.printLog('\r#OUT','Generating %s output complete!         ' % (outfile))
     except: self.log.errorLog(rje_zen.Zen().wisdom())
예제 #2
0
    def saveFasta(self):  ### Outputs parsed PPI datasets in Fasta format
        '''
        Outputs parsed PPI datasets in Fasta format.

        Makes an 'HPRD_Datasets/' directory under self.info['OutDir'], saves the full sequence list via
        self.obj['SeqList'].saveFasta(), then writes one '<gene>_hprd.fas' file per key of self.dict['PPI']
        containing the 'Seq' entries of its interactors. Files are only written for non-empty sequence lists.
        Any exception is reported through self.log.errorLog() rather than raised.
        '''
        try:
            ### Setup: output directory for the per-protein datasets ###
            datpath = self.info['OutDir'] + rje.makePath('HPRD_Datasets/')
            rje.mkDir(self, datpath)
            ## Check Seqs: report (stdout + deBug) any PPI key without a 'Seq' entry ##
            for p1 in rje.sortKeys(self.dict['PPI']):
                # NOTE(review): original author flags a KeyError here - presumably when p1 is missing from
                # self.dict['HPRD'] altogether; confirm and guard if so.
                if 'Seq' not in self.dict['HPRD'][p1]:  #!# KeyError #!#
                    print p1, self.dict['HPRD'][p1]
                    self.deBug('No Seq for %s' % p1)

            ### All sequences: saved with the SeqList defaults ###
            self.obj['SeqList'].saveFasta()
            ### Output PPI Datasets: one fasta file per PPI key ###
            for p1 in rje.sortKeys(self.dict['PPI']):
                mylist = []
                for p2 in self.dict['PPI'][p1]:
                    if self.opt['AllIso']:
                        # 'Seq' is concatenated onto the list here - presumably a list of isoform sequences
                        mylist += self.dict['HPRD'][p2]['Seq']
                    else:
                        # 'Seq' is appended as a single item here
                        mylist.append(self.dict['HPRD'][p2]['Seq'])
                sfile = '%s%s_hprd.fas' % (datpath,
                                           self.dict['HPRD'][p1]['gene'])
                if mylist:
                    self.obj['SeqList'].saveFasta(seqs=mylist, seqfile=sfile)
            self.log.printLog('#FAS', 'HPRD PPI fasta output complete.')
        except:
            self.log.errorLog('Error in HPRD.saveFasta()',
                              printerror=True,
                              quitchoice=False)
예제 #3
0
 def ddi(self):  ### Domain-domain interactions
     '''
     Domain-domain interactions.

     For every domain pair listed in self.dict['DDI'], removes from self.dict['PPI'] each hub->spoke link where
     the hub carries one domain of the pair and the spoke carries the other (checked in both directions, using
     the membership lists in self.dict['Domain']). Hubs that are empty-named or left without interactors are
     then dropped from self.dict['PPI']. Returns None; does nothing if either DDI or Domain data is empty.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ddx = 0     # Running count of removed interactions
         ddimap = self.dict['DDI']
         (dx,dtot) = (0.0,len(ddimap))
         if not ddimap: return
         domains = self.dict['Domain']
         if not domains: return
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(ddimap):
             self.progLog('\r#DDI','Screening domain-domain interactions: %.1f%%; %s removed' % ((dx/dtot),rje.integerString(ddx)))
             dx += 100
             if dom not in domains:
                 self.printLog('#DOM','No sequences with "%s" domains' % dom)
                 continue
             for partner in ddimap[dom]:
                 if partner not in domains: continue
                 # Screen dom-hubs against partner-spokes first, then the reverse direction
                 for (hubdom,spokedom) in ((dom,partner),(partner,dom)):
                     for hub in domains[hubdom]:
                         if hub not in self.dict['PPI']: continue
                         for spoke in self.dict['PPI'][hub][0:]:     # Copy: the list is edited in the loop
                             if spoke not in domains[spokedom]: continue
                             ddx += 1
                             self.dict['PPI'][hub].remove(spoke)
         self.printLog('\r#DDI','Screening domain-domain interactions complete: %s removed.' % (rje.integerString(ddx)))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ppi = self.dict['PPI']
         hx = len(ppi)
         for hub in rje.sortKeys(ppi):
             if not hub or not ppi[hub]:
                 ppi.pop(hub)
                 self.printLog('#DDI','No %s interactions left after DDI removed' % hub,screen=False)
         self.printLog('#PPX','%s of %s PPI hubs remain after DDI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.ddi()',quitchoice=True)
예제 #4
0
 def dpi(self):  ### Domain-protein interactions
     '''
     Domain-protein interactions.

     For each domain in self.dict['Domain'], pools the interactors of all hubs carrying that domain and keeps
     those that appear more than once in the pool. These links are removed from self.dict['PPI'] (including the
     reciprocal spoke->hub link where present) and the accession numbers of the interactors found in
     self.dict['Seq'] are written to 'SLiMPID_DPI/<domain>.dpi.acc'. Names missing from self.dict['Seq'] are
     reported once as "bad". Hubs that are empty-named or left without interactors are finally dropped.
     Returns None; does nothing if there is no Domain data.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         domains = self.dict['Domain']
         if not domains: return
         outdir = 'SLiMPID_DPI'
         rje.mkDir(self, outdir)
         dpi = {}  # Dictionary of {domain:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(domains):
             ## ~ [2a] Pool interactors (with redundancy) of every hub carrying this domain ~~~~~~~~~~~~~~~~ ##
             pooled = []
             for hub in domains[dom]:
                 if hub in self.dict['PPI']:
                     pooled.extend(self.dict['PPI'][hub])
             # Must have 2+ domain interactions: drop anything pooled exactly once
             shared = [spoke for spoke in pooled if pooled.count(spoke) != 1]
             ## ~ [2b] Remove the domain-mediated links from the PPI data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for hub in domains[dom]:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:  # Copy: the list is edited in the loop
                     if spoke not in shared: continue
                     self.dict['PPI'][hub].remove(spoke)
                     if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]:
                         self.dict['PPI'][spoke].remove(hub)
             dpi[dom] = rje.sortUnique(shared, False, False)
             ## ~ [2c] Convert names to accession numbers and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             acc = []
             for name in dpi[dom]:
                 if not name: continue
                 if name in self.dict['Seq']:
                     acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname:
                     badname.append(name)
             accfile = '%s/%s.dpi.acc' % (outdir, dom)
             open(accfile, 'w').write(string.join(acc, '\n'))
             self.printLog('#DPI', '%s domain => %d interactors' % (dom, len(acc)))
         if badname:
             badname.sort()
             badtxt = '%d "bad" protein names: %s' % (len(badname), string.join(badname, '; '))
             self.printLog('#BAD', badtxt)
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ppi = self.dict['PPI']
         hx = len(ppi)
         for hub in rje.sortKeys(ppi):
             if not hub or not ppi[hub]:
                 ppi.pop(hub)
                 self.printLog('#DPI', 'No %s PPI left after DPI removed' % hub, screen=False)
         remaining = rje.integerString(len(self.dict['PPI']))
         self.printLog('#PPX', '%s of %s PPI hubs remain after DPI removed' % (remaining, rje.integerString(hx)))
     except:
         self.errorLog('Problem with SLiMPID.dpi()', quitchoice=True)
예제 #5
0
    def readPELM(self): ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
        '''
        Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.

        Loads self.info['PELM'] with rje.dataDict (keyed on 'acc' and 'position'), fills
        self.dict['PhosphoSites'] as {acc:{pos:{'aa':code}}} and logs warnings for inconsistent sequences or
        residues. One sequence object per accession is then added to a new SeqList, named from the matching
        UniProt entry where one can be used ('<acc>_UNK__<acc>' otherwise). The SeqList is optionally filtered,
        saved to self.info['PELMFas'] and stored, with the UniProt object, in self.obj.
        Any exception is reported through self.log.errorLog() rather than raised.
        '''
        try:### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            data = rje.dataDict(self,self.info['PELM'],mainkeys=['acc','position'])
            seqdict = {}    # Dictionary of Acc:Sequence (values are replaced by sequence objects in [3b])

            ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pdict = self.dict['PhosphoSites']
            for dkey in data:
                ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # The key is split on whitespace - assumes dataDict joins the two mainkeys that way; TODO confirm
                (acc,pos) = string.split(dkey)
                pos = string.atoi(pos)
                if acc not in pdict: pdict[acc] = {}
                if pos not in pdict[acc]: pdict[acc][pos] = {}
                ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # First sequence seen for an accession is kept; later differences are only warned about
                if acc not in seqdict: seqdict[acc] = data[dkey]['sequence']
                elif seqdict[acc] != data[dkey]['sequence']: self.log.printLog('#ERR','Warning. Sequence mismatch for %s' % acc)
                # Likewise, the first residue code recorded for a site is kept
                if 'aa' not in pdict[acc][pos]: pdict[acc][pos]['aa'] = data[dkey]['code']
                elif pdict[acc][pos]['aa'] != data[dkey]['code']: self.log.printLog('#ERR','Warning. PhosphoSite mismatch for %s at pos %d: %s not %s' % (acc,pos,data[dkey]['code'],pdict[acc][pos]['aa']))
                # pos is treated as 1-based: the residue code is compared with sequence[pos-1]
                if data[dkey]['code'] != seqdict[acc][(pos-1):pos]: self.log.printLog('#ERR','Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s' % (acc,pos,data[dkey]['code'],seqdict[acc][pos-1:pos]))

            ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acclist = rje.sortKeys(seqdict)
            pelmuni = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt entry
            unidict = pelmuni.accDict(acclist)        # Dictionary of {acc:UniProtEntry}
            pelmseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None'])            # SeqList object
            ## ~ [3b] Add one sequence for each AccNum and update seqdict  ~~~~~~~~~~~~~~~~~~~~~~~~ ##
            #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
            for acc in acclist:     #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
                sequence = seqdict[acc]
                try:
                    # Anything after the first '-' in acc is ignored for the UniProt lookup
                    uni = unidict[string.split(acc,'-')[0]]
                    desc = uni.obj['Sequence'].info['Description']
                    name = '%s__%s %s' % (uni.obj['Sequence'].info['ID'],acc,desc)
                    if sequence != uni.obj['Sequence'].info['Sequence']:
                        self.log.printLog('#WARNING','Sequence mismatch for UniProt entry %s' % acc)
                except:
                    # Any failure above (e.g. no UniProt entry) falls back to a placeholder name
                    self.log.errorLog('Problem with %s' % acc)
                    name = '%s_UNK__%s' % (acc,acc)             #!# Add sequences where UniProt missing #!#
                seqdict[acc] = pelmseq._addSeq(name,sequence)
            ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if self.opt['FilterSeq']:
                pelmseq.autoFilter()
                # Drop accessions whose sequence object did not survive the filter
                for acc in acclist:
                    if seqdict[acc] not in pelmseq.seq: seqdict.pop(acc)
                acclist = rje.sortKeys(seqdict)
            ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Saved if the file is missing, if Interactive < 0, or if the user agrees to overwrite
            if not os.path.exists(self.info['PELMFas']) or self.stat['Interactive'] < 0 or rje.yesNo('%s exists: overwrite?' % self.info['PELMFas']):
                pelmseq.saveFasta(seqfile=self.info['PELMFas'])
            self.obj['SeqList'] = pelmseq
            self.obj['UniProt'] = pelmuni
        except: self.log.errorLog('Problem during PhosphoSeq.readPELM')
예제 #6
0
 def setup(self):    ### Main class setup method. Makes sumfile if necessary.
     '''Main class setup method. Makes sumfile if necessary.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
         if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
         if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
         self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
         if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
             if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                 return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
         mapgi = False
         ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for mfile in self.list['ResFiles']:
             bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
             bud.info['Name'] = mfile
             bud.readMascot()
             self.dict['Searches'][mfile] = bud.dict['Hits']
             protacclist = rje.sortKeys(bud.dict['Hits'])
             for protacc in protacclist:
                 if rje.matchExp('gi\|(\d+)',protacc): mapgi = True
             accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
             self.debug(accfile)
             open(accfile,'w').write(string.join(protacclist,'\n'))
             self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
         ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         #if mapgi:
         #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
         #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
         ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
         self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
         ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sumhead = string.split('search,prot_hit_num,prot_acc,prot_desc,pep_seq',',')
         rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
         for mfile in rje.sortKeys(self.dict['Searches']):
             bud = self.dict['Searches'][mfile]
             for protacc in rje.sortKeys(bud)[0:]:
                 protname = bud[protacc]['prot_acc']
                 protdesc = bud[protacc]['prot_desc']
                 if rje.matchExp('gi\|(\d+)',protacc):
                     gi = rje.matchExp('gi\|(\d+)',protacc)[0]
                     try:
                         protname = self.dict['Acc2Seq'][gi].shortName()
                         protdesc = self.dict['Acc2Seq'][gi].info['Description']
                     except: protname = 'gi_UNK__%s' % gi
                 #x#print protname, protdesc, bud[protacc]
                 for pep in bud[protacc]['Peptides']:
                     data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                             'pep_seq':pep,'prot_hit_num':bud[protacc]['prot_hit_num']}
                     rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
예제 #7
0
 def ddi(self):  ### Domain-domain interactions
     '''
     Domain-domain interactions.

     Screens self.dict['PPI'] against the domain pairs in self.dict['DDI']: a hub->spoke link is removed when
     the hub is listed under one domain of a pair in self.dict['Domain'] and the spoke under the other. Both
     directions of each pair are screened. Afterwards, hubs with an empty name or no remaining interactors are
     popped from self.dict['PPI']. Returns None; nothing is done if DDI or Domain data is empty.
     '''
     def _screen(hubdom, spokedom):  ### Removes spokedom spokes from hubdom hubs; returns number removed
         removed = 0
         for hub in self.dict['Domain'][hubdom]:
             if hub in self.dict['PPI']:
                 for spoke in self.dict['PPI'][hub][0:]:  # Iterate a copy as the list is edited
                     if spoke in self.dict['Domain'][spokedom]:
                         removed += 1
                         self.dict['PPI'][hub].remove(spoke)
         return removed

     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ddx = 0
         dx = 0.0
         dtot = len(self.dict['DDI'])
         if not (self.dict['DDI'] and self.dict['Domain']): return
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(self.dict['DDI']):
             progress = 'Screening domain-domain interactions: %.1f%%; %s removed' % ((dx / dtot), rje.integerString(ddx))
             self.progLog('\r#DDI', progress)
             dx += 100
             if dom in self.dict['Domain']:
                 for ddi in self.dict['DDI'][dom]:
                     if ddi not in self.dict['Domain']: continue
                     ddx += _screen(dom, ddi)    # Hubs with dom, spokes with its partner domain
                     ddx += _screen(ddi, dom)    # ... and the reverse direction
             else:
                 self.printLog('#DOM', 'No sequences with "%s" domains' % dom)
         summary = 'Screening domain-domain interactions complete: %s removed.' % (rje.integerString(ddx))
         self.printLog('\r#DDI', summary)
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             keep = hub and self.dict['PPI'][hub]
             if keep: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#DDI', 'No %s interactions left after DDI removed' % hub, screen=False)
         left = rje.integerString(len(self.dict['PPI']))
         self.printLog('#PPX', '%s of %s PPI hubs remain after DDI removed' % (left, rje.integerString(hx)))
     except:
         self.errorLog('Problem with SLiMPID.ddi()', quitchoice=True)
예제 #8
0
 def run(self):  ### Main run method
     '''
     Main run method.

     Loads sequences (and, if self.info['OccFile'] is set, motif occurrence data grouped by sequence and
     dataset), calculates the per-residue statistics named in self.list['PlotStat'] for each sequence and
     calls self.output() to write one plot file per sequence (or per sequence/dataset where occurrence data
     exists). Returns None. Any exception is passed to self.errorLog() rather than raised.
     '''
     try:### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Basefile is used as a filename prefix: blank, or guaranteed to end with '.'
         if self.info['Basefile'].lower() in ['','none']: self.info['Basefile'] = ''
         elif self.info['Basefile'][-1] != '.': self.info['Basefile'] += '.'
         self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
         # Join then re-split on whitespace and upper-case, so feature types can be matched exactly
         self.list['PlotFT'] = string.split(string.join(self.list['PlotFT']).upper())
         if self.info['OccFile'].lower() not in ['','none']:
             self.info['Delimit'] = rje.delimitFromExt(filename=self.info['OccFile'])
             self.dict['OccData'] = {}   # Rebuilt as {seq:{dataset:[occurrence data dictionaries]}}
             occdata = rje.dataDict(self,self.info['OccFile'],['Seq','Dataset','Pattern','Start_Pos','End_Pos'],['Seq','Dataset','Pattern','Start_Pos','End_Pos'])
             for key in rje.sortKeys(occdata):
                 # 'Seq' and 'Dataset' are popped as they become the keys of the nested dictionary
                 seq = occdata[key].pop('Seq')
                 if seq not in self.dict['OccData']: self.dict['OccData'][seq] = {}
                 dataset = occdata[key].pop('Dataset')
                 if dataset not in self.dict['OccData'][seq]: self.dict['OccData'][seq][dataset] = []
                 self.dict['OccData'][seq][dataset].append(occdata[key])
             self.printLog('#OCC','Loaded data for %s occurrences in %s sequences' % (rje.integerString(len(occdata)),rje.integerString(len(self.dict['OccData']))))
             # Filter the sequence list using the names that have occurrence data
             self.obj['SeqList'].autoFilter(['GoodSeq=%s' % string.join(rje.sortKeys(self.dict['OccData']),',')])
         ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['PlotStat'] = string.split(string.join(self.list['PlotStat']).lower())
         # slimcalc is only created (and only used below) when conservation stats are requested
         if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']: slimcalc = rje_slimcalc.SLiMCalc(self.log,self.cmd_list)
         seqdict = self.obj['SeqList'].seqNameDic()
         for name in rje.sortKeys(seqdict):
             if self.opt['OccOnly'] and not name in self.dict['OccData']: continue
             seq = seqdict[name]
             sequence = seq.getSequence(gaps=False)
             seq.dict['PlotStat'] = {}
             if 'sa' in self.list['PlotStat']: seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence,returnlist=True)
             if 'hyd' in self.list['PlotStat']: seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence,returnlist=True)
             if 'dis' in self.list['PlotStat']: seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
             if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']:
                 slimcalc.relConListFromSeq(seq,slimcalc.stat['RelConWin'],store=True)
                 try:
                     # The results are moved out of seq.list; a missing list is only logged
                     seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                     seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                 except: self.printLog('#CONS','No conservation stats for %s' % name)
             self.printLog('#STAT','PlotStats calculated for %s' % name)
             for stat in seq.dict['PlotStat']:
                 # Every stat except Cons_Rel goes through self.plotWin() when PlotWin >= 0; all are converted
                 if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0: seq.dict['PlotStat'][stat] = self.plotWin(seq.dict['PlotStat'][stat])
                 seq.dict['PlotStat'][stat] = self.convertStat(seq.dict['PlotStat'][stat])
             self.printLog('#STAT','PlotStats converted for %s' % name)                
             ## ~ [3] Output Data (still inside the per-sequence loop) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if name in self.dict['OccData']:
                 # One output file per dataset with occurrences in this sequence
                 for dataset in self.dict['OccData'][name]:
                     ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'],dataset,seq.info['AccNum'])
                     self.output(seq,ofile,self.dict['OccData'][name][dataset])
             else: self.output(seq,'%s%s.plot.txt' % (self.info['Basefile'],seq.info['AccNum']))
         return
     except: self.errorLog(rje_zen.Zen().wisdom())
예제 #9
0
def setupCustomScores(
    callobj, statlist=None, scorelist=None, scoredict=None
):  ### Sets up Custom Scores using existing statlist
    """
    Sets up Custom Scores using existing statlist.

    Each custom score is checked in order: it is rejected (logged via callobj.log.errorLog and removed from
    scorelist and scoredict) if its name is already in statlist or if rje.formula() cannot validate its
    formula against the current statlist. Accepted names are appended to statlist, so later scores may use
    earlier ones. Containers passed in by the caller are modified in place; omitted ones are created fresh
    for each call (never shared between calls).
    >> callobj:RJE_Object [None] = calling object for Error Messages etc.
    >> statlist:list of stats that are allowed for custom score. Generally column headers for output.
    >> scorelist:list of Custom Score Names in order they were read in (may use prev. scores).
       If empty, the sorted keys of scoredict are used.
    >> scoredict:dictionary of Custom Scores: {Name:Formula}
    << (statlist,scorelist,scoredict):(list,list,dictionary) of acceptable Custom Scores ([Stats],[Names],{Name:Formula})
       The same 3-tuple (in its current state) is returned if an error occurs.
    """
    ### Replace missing arguments with new containers (mutable defaults would persist between calls) ###
    if statlist is None:
        statlist = []
    if scorelist is None:
        scorelist = []
    if scoredict is None:
        scoredict = {}
    try:
        ### Setup Custom Scores ###
        if not scorelist:
            scorelist = rje.sortKeys(scoredict)
        for new in scorelist[0:]:  # Copy, as rejected scores are removed from scorelist during the loop
            if new in statlist:
                problem = 'Score "%s" exists: custom score cannot be made.' % (new)
            elif not rje.formula(callobj, formula=scoredict[new], varlist=statlist[0:], check=True, calculate=False):
                problem = 'Custom score "%s" cannot be made.' % (new)
            else:
                statlist.append(new)  # Accepted: available to the formulae of subsequent scores
                continue
            callobj.log.errorLog(problem, printerror=False)
            scorelist.remove(new)
            scoredict.pop(new)
        return (statlist, scorelist, scoredict)  ### Returns same things given ###
    except:
        callobj.log.errorLog("Problem during rje_scoring.setupCustomScores()", quitchoice=True)
        return (statlist, scorelist, scoredict)  # Same shape as a normal return, so callers can still unpack
예제 #10
0
 def loadFeatures(self,ftfile):  ### Loads features from given file
     '''
     Loads features from given file.

     The first column of the delimited file is used as the sequence ID. The file must also have 'feature',
     'description' and start/end columns (matched case-insensitively; 'ft_start'/'ft_end' are used instead of
     'start'/'end' if either is present). Each feature is appended to self.dict['Features'][ID] as a
     {'Type','Start','End','Desc'} dictionary with integer Start and End.
     >> ftfile:str = Name of features file. '' or 'none' means no file.
     << Returns None, or the value of printLog() if the file is missing or lacks a required field.
        Any exception is passed to self.errorLog() rather than raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if ftfile in ['','none']: return
         if not os.path.exists(ftfile): return self.printLog('#ERR','Features file "%s" missing' % ftfile)
         delimit = rje.delimitFromExt(filename=ftfile)
         ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         FTFILE = open(ftfile,'r')
         try: headers = rje.readDelimit(FTFILE.readline(),delimit)
         finally: FTFILE.close()     # Close explicitly rather than relying on garbage collection
         mainkeys = [headers[0]]
         hmap = {}   # Maps lower case header to the header as it appears in the file
         for h in headers: hmap[h.lower()] = h
         pos = ''    # Leader for start/end positions
         if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
         for h in ['feature','%sstart' % pos,'%send' % pos,'description']:
             if h not in hmap: return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile))
             mainkeys.append(hmap[h])
         desckey = hmap['description']
         mainkeys.remove(desckey)    # Description is data, not part of the key (it was only checked above)
         ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # The data key uses the header's actual case, matching the ftdata lookup below
         # (presumably dataDict matches datakeys against headers exactly - TODO confirm)
         ftdata = rje.dataDict(self,ftfile,mainkeys,[desckey],delimit,headers,lists=True)
         (mx,mtot,fx) = (0.0,len(ftdata),0)
         for mainkey in rje.sortKeys(ftdata):
             self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot))
             mx += 100.0
             (ftid,ft,start,end) = string.split(mainkey,delimit)
             if ftid == mainkeys[0]: continue    # Skip the header line if it was loaded as data
             if ftid not in self.dict['Features']: self.dict['Features'][ftid] = []
             for desc in ftdata[mainkey][desckey]:
                 fx += 1
                 self.dict['Features'][ftid].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc})
         self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile))
     except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
예제 #11
0
 def output(self,seq,outfile,occdata=[]):   ### Output to file
     '''
     Output to file.

     Writes a plain text file of newline-joined, tab-separated records for one sequence: a header section
     (Name, Sequence, Output, RE, TrueELMs, Description), one 'Plot' line per entry of seq.dict['PlotStat'],
     'Region' lines for features of seq.obj['Entry'] whose type is in self.list['PlotFT'], and one 'Motif'
     line per occurrence in occdata.
     >> seq:Sequence object to output
     >> outfile:str = Name of output file. The 'Output' record is this name minus its final extension.
     >> occdata:list of occurrence dictionaries with 'Pattern', 'Start_Pos' and 'End_Pos' keys
     << Returns None. Nothing is written if self.opt['OccOnly'] is set and occdata is empty.
     '''
     try:### ~ [1] ~ Basic Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.opt['OccOnly'] and not occdata: return
         odata = []
         odata.append('Name\t%s' % (seq.shortName()))
         odata.append('Sequence\t%s' % (seq.getSequence(gaps=False)))
         outbase = string.join(string.split(outfile,'.')[:-1],'.')     # Filename without final extension
         odata.append('Output\t%s' % (outbase))
         odata.append('RE\t%s' % (string.join(self.list['PlotRE'],',')))
         odata.append('TrueELMs\tY')
         odata.append('Description\t%s' % (seq.info['Description']))
         odata.append('')
         ### ~ [2] ~ PlotStats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         plotstat = seq.dict['PlotStat']
         for plot in rje.sortKeys(plotstat):
             odata.append('Plot\t%s\t%s' % (plot,string.join(plotstat[plot],', ')))
         if plotstat: odata.append('')      # Blank separator only if something was plotted
         ### ~ [3] ~ PlotFT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         entry = seq.obj['Entry']
         if entry:
             for ft in entry.list['Feature']:
                 if ft['Type'] not in self.list['PlotFT']: continue
                 odata.append('Region\t%s %s\t%s:%s' % (ft['Type'],ft['Desc'],ft['Start'],ft['End']))
             odata.append('')
         ### ~ [4] ~ MotifOcc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if occdata:
             for occ in occdata:
                 odata.append('Motif\t%s\t%s:%s' % (occ['Pattern'],occ['Start_Pos'],occ['End_Pos']))
         ### ~ [5] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         outtext = string.join(odata,'\n')
         open(outfile,'w').write(outtext)
         self.printLog('#PLOT','SeqPlot output saved as %s' % (outfile))
     except: self.errorLog(rje_zen.Zen().wisdom())
예제 #12
0
 def setup(self):    ### Main class setup method.
     '''Main class setup method.'''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['DB'] = rje_db.Database(self.log,self.cmd_list+['tuplekeys=T'])
         if self.baseFile().lower() in ['','none']: self.baseFile('%s.vs.%s.Q%d' % (rje.baseFile(self.getStr('MutPileup'),True),rje.baseFile(self.getStr('WTPileup'),True),self.getInt('QCut')))
         if not self.force() and os.path.exists('%s.fdr.tdt' % self.baseFile()): return
         ### ~ [2] Look for/process WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.force() or not os.path.exists('%s.WT.tdt' % self.baseFile()): self.parsePileup('WT',self.getStr('WTPileup'))
         ### ~ [3] Generate Reference sequences and Major Alleles (by locus) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         refseq = {}; rx = 0
         majors = {}
         locus = None
         WTDATA = open('%s.WT.tdt' % self.baseFile(),'r'); wx = 0
         for line in WTDATA:
             self.progLog('\r#WT','Reading WT data: Reference seq length = %s nt' % (rje.iStr(rx)),rand=0.01)
             data = rje.readDelimit(line); wx += 1
             if data[0] == 'Locus': continue
             else:
                 if data[0] != locus: locus = data[0]; refseq[locus] = ''; majors[locus] = []
                 pos = int(data[1])
                 while (pos - 1) > len(refseq[locus]): refseq[locus] += '?'; rx += 1
                 while (pos - 1) > len(majors[locus]): majors[locus].append('-')
                 refseq[locus] += data[2]; majors[locus].append(data[5]); rx += len(data[2])
         WTDATA.close()
         self.printLog('\r#WT','%s lines read from WT data: Reference seq length = %s nt' % (rje.iStr(wx),rje.iStr(rx)))
         for locus in rje.sortKeys(majors):
             if len(majors[locus]) != len(refseq[locus]): self.errorLog('%s WTMajor versus RefSeq length mismatch!' % locus,printerror=False); raise ValueError
         self.dict['WTMajor'] = majors
         self.dict['RefSeq'] = refseq
         ### ~ [3] Look for/process Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.force() or not os.path.exists('%s.Mut.tdt' % self.baseFile()): self.parsePileup('Mut',self.getStr('MutPileup'),True)
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
예제 #13
0
 def ppi(self):  ### Remaining protein-protein interactions
     '''
     Outputs the remaining protein-protein interactions as one accession number file per hub.
     For each hub in self.dict['PPI'] (in sorted order), interactor names found in self.dict['Seq'] are converted to
     their info['AccNum'] values and written, one per line, to SLiMPID_PPI/<gene>.ppi.acc, where <gene> is
     self.dict['Gene'][hub]. Empty names are skipped. Names missing from self.dict['Seq'] are collected and
     reported once in a single #BAD log line.
     << None. Returns immediately if self.dict['PPI'] is empty. Exceptions are passed to self.errorLog(), not raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['PPI']: return
         outdir = 'SLiMPID_PPI'
         rje.mkDir(self,outdir)
         badname = []    # Interactor names with no entry in self.dict['Seq'] (unique, in order first seen)
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for hub in rje.sortKeys(self.dict['PPI']):
             gene = self.dict['Gene'][hub]
             acc = []
             for name in self.dict['PPI'][hub]:
                 if not name: continue
                 if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname: badname.append(name)
             ACCFILE = open('%s/%s.ppi.acc' % (outdir,gene),'w')
             try: ACCFILE.write('\n'.join(acc))
             finally: ACCFILE.close()    # Explicit close: do not rely on garbage collection to flush the file
             self.printLog('#PPI','%s => %d individual interactors' % (gene,len(acc)))
         if badname:
             badname.sort()
             self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),'; '.join(badname)))
     except: self.errorLog('Problem with SLiMPID.ppi()',quitchoice=True)
예제 #14
0
 def processGenes(self,genelist):  ### Tries to extract data for genes in genelist
     '''
     Parses a GeneCard for every gene in genelist, then tidies up partially successful entries.
     >> genelist:list of gene identifiers, each of which is passed to self.parseCard()
     Stage 1 can be interrupted with Ctrl-C; the counts so far are logged and stage 2 still runs.
     Stage 2 walks self.dict['GeneCard']: an entry whose 'HPRD' value equals its own key is re-keyed with an 'HPRD'
     prefix, and a '!FAILED!' entry stored under an alias's symbol is replaced by that alias's data.
     '''
     ### ~ [1] Parse data from GeneCards (or existing data) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     self.deBug(self.list['Genes'])
     parsed = 0; failed = 0
     try:
         for gene in genelist:
             if self.parseCard(gene): parsed += 1
             else: failed += 1
             tally = (len(genelist),parsed,failed)
             self.log.printLog('\r#CARD','Parsing GeneCards for %d genes: %d parsed; %d failed.' % tally,newline=False,log=False)
         tally = (len(genelist),parsed,failed)
         self.log.printLog('\r#CARD','Parsing GeneCards for %d genes complete: %d parsed; %d failed.' % tally)
     except KeyboardInterrupt:   # Any other exception simply propagates to the caller
         tally = (len(genelist),parsed,failed)
         self.log.printLog('\r#CARD','Parsing GeneCards for %d genes stopped: %d parsed; %d failed.' % tally)
     ### ~ [2] Tidy for mixed success ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     cards = self.dict['GeneCard']
     progress = 0.0; cardnum = len(cards); fixed = 0
     for alias in rje.sortKeys(cards):
         self.log.printLog('\r#CARD','Checking and correcting partial successes: %.1f%%' % (progress/cardnum),newline=False,log=False)
         progress += 100.0
         card = cards[alias]
         if 'HPRD' in card and card['HPRD'] == alias:    # Entry is keyed by its own HPRD ID: re-key with prefix
             oldalias = alias
             alias = 'HPRD' + oldalias
             cards[alias] = cards.pop(oldalias)
         try: symbol = card['Symbol']
         except:
             self.log.errorLog('Problem with alias "%s"' % alias)
             continue
         if symbol not in cards: continue
         if cards[symbol]['Symbol'] == '!FAILED!':   # Symbol failed to parse but its alias succeeded: share data
             cards[symbol] = cards[alias]
             fixed += 1
     self.log.printLog('\r#CARD','Checking and correcting partial successes: %d entries corrected.' % (fixed))
예제 #15
0
 def makePPIDatasets(self):  ### Generate PPI datasets from pairwise data
     '''
     Writes one fasta file per PPI hub, containing the sequences of that hub's interactors ("spokes").
     A hub is skipped if it has fewer than three spokes, or if fewer than three of them have a sequence in
     self.dict['SeqDict']. Files are saved as YeastPPI/<hub>.fas using self.obj['SeqList'].saveFasta().
     Any exception is logged and then re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.mkDir(self,'YeastPPI/')
         seqdict = self.dict['SeqDict']
         ppi = self.dict['PPI']
         ### ~ [2] Parse data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         progress = 0.0; hubnum = len(ppi); filex = 0
         for hub in rje.sortKeys(ppi):
             self.progLog('\r#FAS','Generating %s PPI fasta files: %.2f' % (rje.integerString(filex),progress/hubnum))
             progress += 100.0
             spokes = ppi[hub]
             if len(spokes) < 3: continue
             seqs = [seqdict[spoke] for spoke in spokes if spoke in seqdict]    # Spokes without sequences are dropped
             if len(seqs) >= 3:
                 self.obj['SeqList'].saveFasta(seqs,rje.makePath('YeastPPI/%s.fas' % hub,wholepath=True),log=False)
                 filex += 1
         self.printLog('\r#FAS','Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(filex),rje.integerString(hubnum)))
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
예제 #16
0
 def loadHHPID(self):  ### Load HHPID interactions
     '''
     Loads the table named by the 'HHPID' setting as database table 'HHPID' and reformats it.
     Four unwanted fields are dropped, seven fields are renamed, the number of entries per 'Interaction' value is
     logged, and dropEntriesDirect() is then called with inverse=True for the 'binds', 'complexes with' and
     'interacts with' interaction types.
     << True on success; the return value of self.printLog() if no HHPID file is set; None if an error was logged.
     '''
     try:### ~ [1] Setup HHPID Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hhpid = self.getStr('HHPID')
         if hhpid.lower() in ['','none']: return self.printLog('#HHPID','No HHPID file to load')
         hdb = self.db().addTable(hhpid,mainkeys='auto',datakeys='All',name='HHPID')
         ## ~ [1a] Drop unwanted fields ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         for unwanted in ('#Tax ID 1','Tax ID 2','product accession.version 2','last update timestamp'):
             hdb.dropField(unwanted)
         ## ~ [1b] Rename remaining fields (old name, new name) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         renames = [('Gene ID 1','EntrezHIV'),
                    ('product accession.version 1','AccHIV'),
                    ('product name 1','HIV'),
                    ('Interaction short phrase','Interaction'),
                    ('Gene ID 2','Entrez'),
                    ('product name 2','Description'),
                    ('PubMed ID (PMID) list','PMID')]
         for (oldname,newname) in renames: hdb.renameField(oldname,newname)
         ## ~ [1c] Summarise and filter interaction types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         for itype in rje.sortKeys(hdb.index('Interaction')):
             entrynum = len(hdb.index('Interaction')[itype])
             self.printLog('#HHPID','%s => %s entries' % (itype,entrynum))
         hdb.dropEntriesDirect('Interaction',['binds','complexes with','interacts with'],inverse=True)
         return True
     except: self.errorLog('%s.loadHHPID error' % self)
예제 #17
0
 def save(self):  ### Saves parsed REST output to files
     '''
     Saves parsed REST output to files.
     The lower-case 'Rest' setting selects what is written:
     - a key of self.dict['Output']: only that output is saved, to its file in self.dict['Outfile'];
     - 'full' or 'text': self.restFullOutput() is saved to <RestOutDir><RestBase>.rest;
     - any other non-empty value: logged as unrecognised; the full output is saved only if self.i() >= 0 and the
       user answers yes to the prompt;
     - empty: every output in self.dict['Output'] is saved to its file in self.dict['Outfile'].
     Outputs without an entry in self.dict['Outfile'] trigger a warning, apart from 'intro'.
     << True if the full output was saved; False if it was declined; None after saving individual outputs.
     '''
     def _write(filename,text):  # Writes text to filename and always closes the handle, even if the write fails
         OUT = open(filename,'w')
         try: OUT.write(text)
         finally: OUT.close()
     ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     rest = self.getStrLC('Rest')
     rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
     rje.mkDir(self,self.getStr('RestOutDir'))
     outputs = rje.sortKeys(self.dict['Output'])
     ### ~ [2] Single named output or full output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     if rest in outputs: outputs = [rest]
     elif rest:
         if rest not in ['full','text']:
             self.printLog('#OUTFMT','REST output format "%s" not recognised.' % rest)
             if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'): return False
             rest = 'full'   # Accepted fallback is reported as the full output
         outfile = '%s.rest' % rbase
         _write(outfile,self.restFullOutput())   # Content is generated before the file is opened/truncated
         self.printLog('#OUT','%s: %s' % (rest,outfile))
         return True
     ### ~ [3] Individual outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     for rkey in outputs:
         if rkey in self.dict['Outfile']:
             outfile = self.dict['Outfile'][rkey]
             rje.backup(self,outfile)
             _write(outfile,self.dict['Output'][rkey])
             self.printLog('#OUT','%s: %s' % (rkey,outfile))
         elif rkey not in ['intro']: self.warnLog('No outfile parsed/generated for %s output' % rkey)
예제 #18
0
def setupCustomScores(callobj,statlist=None,scorelist=None,scoredict=None):   ### Sets up Custom Scores using existing statlist
    '''
    Sets up Custom Scores using existing statlist.
    >> callobj:RJE_Object = calling object for Error Messages etc. (its log.errorLog() is used)
    >> statlist:list [None] of stats that are allowed for custom score. Generally column headers for output.
    >> scorelist:list [None] of Custom Score Names in order they were read in (may use prev. scores)
    >> scoredict:dictionary [None] of Custom Scores: {Name:Formula}
    << (statlist,scorelist,scoredict):(list,list,dictionary) of acceptable Custom Scores ([Stats],[Names],{Name:Formula})
    Lists/dictionaries given by the caller are updated in place: accepted score names are appended to statlist and
    rejected ones are removed from scorelist and scoredict. Arguments left as None start as fresh empty objects,
    so nothing is shared between calls. If scorelist is empty, the sorted keys of scoredict are used instead.
    A score is rejected if its name is already in statlist, if it has no formula in scoredict, or if rje.formula()
    does not accept its formula. The same tuple is returned if an unexpected error is logged.
    '''
    if statlist is None: statlist = []
    if scorelist is None: scorelist = []
    if scoredict is None: scoredict = {}
    try:
        ### Setup Custom Scores ###
        if not scorelist: scorelist = rje.sortKeys(scoredict)
        for new in scorelist[0:]:   # Iterate over a copy: rejected scores are removed from scorelist in the loop
            if new in statlist: problem = 'Score "%s" exists: custom score cannot be made.' % (new)
            elif new not in scoredict: problem = 'Custom score "%s" cannot be made.' % (new)     # No formula given
            elif not rje.formula(callobj,formula=scoredict[new],varlist=statlist[0:],check=True,calculate=False):
                problem = 'Custom score "%s" cannot be made.' % (new)
            else:
                statlist.append(new)    # Accepted scores can be used by the formulae of later scores
                continue
            callobj.log.errorLog(problem,printerror=False)
            scorelist.remove(new)
            scoredict.pop(new,None)
        return (statlist,scorelist,scoredict)   ### Returns same things given ###
    except:
        callobj.log.errorLog('Problem during rje_scoring.setupCustomScores()',quitchoice=True)
        return (statlist,scorelist,scoredict)   # Keep the documented return shape so that callers can unpack it
예제 #19
0
    def complexFasta(self):  ### Outputs parsed complex datasets in Fasta format
        '''
        Saves one fasta file per parsed complex.
        For each key of self.dict['Complex'] (in sorted order), the 'Seq' data of every member is gathered from
        self.dict['HPRD'] and saved to <OutDir>HPRD_Complexes/<complex>_hprd.fas. With the 'AllIso' option, 'Seq' is
        treated as a list and all of its items are added; otherwise 'Seq' is added as a single item.
        Complexes that yield no sequences are not saved. Any exception is logged and then re-raised.
        '''
        try:
            ### Setup ###
            outpath = self.info['OutDir'] + rje.makePath('HPRD_Complexes/')
            rje.mkDir(self,outpath)
            ### Output PPI Datasets ###
            for cid in rje.sortKeys(self.dict['Complex']):
                seqs = []
                for member in self.dict['Complex'][cid]:
                    if self.opt['AllIso']: seqs.extend(self.dict['HPRD'][member]['Seq'])
                    else: seqs.append(self.dict['HPRD'][member]['Seq'])
                if not seqs: continue
                self.obj['SeqList'].saveFasta(seqs=seqs,seqfile='%s%s_hprd.fas' % (outpath,cid))
            self.log.printLog('#FAS','HPRD complex fasta output complete.')
        except:
            self.log.errorLog('Error in HPRD.complexFasta()',printerror=True,quitchoice=False)
            raise
예제 #20
0
 def codonUsageEntropyBias(self):   ### Calculate bias in Codon Usage using Entropy-based measure
     '''
     Calculate bias in Codon Usage using Entropy-based measure.
     For every entry of the 'Codons' table, and its partner in the 'Expected' table (matched on 'Seq'), the value
     of rje.entropyDict() over each amino acid's codons is summed across all amino acids of the 'Code' table index:
     - Bias / ExpBias = plain sums for the observed / expected entry;
     - WtBias / ExpWtBias = the same sums with each term multiplied by that amino acid's value in aafreq, i.e. the
       summed observed codon values after processing by rje.dictFreq().
     Results are added to a new 'Bias' table, which is then saved to file. Exceptions are passed to self.errorLog().
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         aacode = self.db('Code').index('AA')    # Used below as {amino acid:[codon fields of the Codons table]}
         cdb = self.db('Codons'); edb = self.db('Expected')
         ## ~ [1a] Setup bias table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         bdb = self.db().addEmptyTable('Bias',['Seq','Len','Bias','ExpBias','WtBias','ExpWtBias'],['Seq'])
         ### ~ [2] Calculate Frequencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         x = 0.0; etot = cdb.entryNum()
         for codentry in cdb.entries():
             self.progLog('\r#BIAS','Calculating Bias: %.2f%%' % (x/etot)); x += 100.0
             expentry = edb.data(codentry['Seq'])
             entry = {'Seq':codentry['Seq'],'Len':codentry['Len'],'Bias':0.0,'ExpBias':0.0,'WtBias':0.0,'ExpWtBias':0.0}
             aafreq = {}
             for aa in aacode:
                 aafreq[aa] = 0.0
                 for code in aacode[aa]: aafreq[aa] += codentry[code]
             rje.dictFreq(aafreq,total=False)
             for aa in aacode:
                 # Each entropy is needed for both the plain and the weighted sum: calculate it once only
                 obsent = rje.entropyDict(codentry,aacode[aa])
                 expent = rje.entropyDict(expentry,aacode[aa])
                 entry['Bias'] += obsent
                 entry['ExpBias'] += expent
                 entry['WtBias'] += (aafreq[aa] * obsent)
                 entry['ExpWtBias'] += (aafreq[aa] * expent)
             bdb.addEntry(entry)
         self.printLog('\r#BIAS','Codon Usage entropy bias calculated for %s entries' % rje.iStr(etot))
         bdb.saveToFile()
     except: self.errorLog('%s.codonUsageEntropyBias error' % self)
예제 #21
0
 def xmerProb(self,xmer,prefix=False):   ### Returns SCAP probability for count for given xmer from markov.dict tree
     '''
     Returns the SCAP value for the given xmer, calculated from the count tree of self.obj['Markov'].
     >> xmer:str = Xmer of interest
     >> prefix:bool [False] = Use Prefix tree rather than suffix tree
     << float = 1.0 if no value can be calculated (see below), else a ratio based on the tree counts.
     The tree is walked one character at a time. At each node, '=' holds that node's count and 'e' caches the sum
     over the alphabet of (child count / node count) squared. prex is the product of the two for the parent of the
     node currently reached. Note that the 'e' values are stored in the tree itself, so the tree is modified.
     - 1.0 is returned if the Markov 'Sorted' option is on and xmer is shorter than 'MaxXmer', if a character is
       missing from the tree and is either not in the alphabet or met before prex is non-zero, or if prex is zero
       at the end of the walk.
     - 0.5 / (0.05 * prex) is returned if an alphabet character is missing from the tree and prex is non-zero.
     - Otherwise, the count of the final node divided by prex is returned.
     '''
     ### ~ [1] ~ Choose tree ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     markov = self.obj['Markov']
     if markov.opt['Sorted'] and len(xmer) < markov.stat['MaxXmer']: return 1.0
     # Prefix tree is walked with the transformed xmer - presumably reversed, going by the helper's name
     if prefix: _sufdic = markov.pretree(); xmer = rje.strReverse(xmer)
     else: _sufdic = markov.suftree()
     # Sorted mode: all but the final character are passed through rje.strSort(); the final character stays last
     if markov.opt['Sorted']: xmer = rje.strSort(xmer[:-1]) + xmer[-1]
     prex = 0
     self.deBug('%s :: %s' % (xmer,rje.sortKeys(_sufdic)))
     ### ~ [2] ~ Find subtree ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     for x in range(len(xmer)):
         if xmer[x] in _sufdic.keys():
             # Calculate and cache 'e' for this node the first time that it is visited
             if not _sufdic.has_key('e'):
                 _sufdic['e'] = 0.0
                 for a in markov.list['Alphabet']:
                     if _sufdic.has_key(a):
                         fa = _sufdic[a]['='] / float(_sufdic['='])
                         _sufdic['e'] += (fa * fa)
             # prex must be taken from the current node before descending to the child for this character
             prex = _sufdic['='] * _sufdic['e']; _sufdic = _sufdic[xmer[x]]
         elif xmer[x] not in markov.list['Alphabet'] or not prex: return 1.0
         else: return 0.5 / (0.05 * prex)    # Arbitrary small number!
         self.deBug('%s %d [%s] :: %d = %s' % (xmer,x,xmer[x],prex,_sufdic))
     ### ~ [3] ~ Calculate SCAP value ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     if prex: return _sufdic['='] / prex
     else: return 1.0
예제 #22
0
 def processHHPID(self):  ### Process HHPID interactions
     '''
     Process HHPID interactions into an 'HHPIDMap' table.
     If <basefile>.HHPIDMap.tdt already exists, it is loaded with keys ['HIV','Gene'] and returned. Otherwise, the
     'HHPID' and 'GeneMap' tables are joined on their 'Entrez' fields, fields outside a fixed list are dropped, the
     table is compressed on ['HIV','Gene'] and saved to file, and the sorted 'AccHIV' index values are written, one
     per line, to <Basefile>.hivacc.
     << The HHPIDMap table object, or False if an error was logged.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         mapfile = '%s.HHPIDMap.tdt' % self.basefile()
         if rje.checkForFile(mapfile):   # Reuse the results of a previous run
             return self.db().addTable(mapfile,['HIV','Gene'],'All',name='HHPIDMap')
         hdb = self.db('HHPID')
         gdb = self.db('GeneMap')
         mdb = self.db().joinTables(name='HHPIDMap',join=[(hdb,'Entrez'),(gdb,'Entrez')],newkey=['#'],empties=False,keeptable=True)
         keepfields = ['#','AccHIV','EntrezHIV','HIV','Entrez','Gene','Symbol','UniProt','EnsEMBL','EnsLoci']
         for field in mdb.fields()[0:]:      # Loop over a copy as fields are dropped inside the loop
             if field not in keepfields: mdb.dropField(field)
         mdb.compress(['HIV','Gene'],default='str')
         mdb.dropField('#')
         mdb.saveToFile()
         ### ~ [2] Save viral accession numbers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ACCOUT = open('%s.hivacc' % self.getStr('Basefile'),'w')
         try: ACCOUT.write('%s\n' % '\n'.join(rje.sortKeys(mdb.index('AccHIV'))))
         finally: ACCOUT.close()     # Explicit close: do not rely on garbage collection to flush the file
         return mdb
     except:
         self.errorLog('%s.processHHPID error' % self)
         return False
예제 #23
0
 def runJobs(self):  ### Runs all the jobs in self.list['SubJobs']                                               #V1.0
     '''
     Starts a job on each available processor slot and then monitors the running jobs until none remain.
     Slots are numbered from the 'KeepFree' setting up to self.nprocs()-1 and each is started with self.nextJob().
     While self.dict['Running'] is not empty, the slot and PID of each running job is re-written to <logbase>.pid
     and self.endJob() is called for every slot whose job has finished, is marked 'WAIT', or could not be checked.
     Slots with no job data are removed. The method sleeps for 'SubSleep' seconds between monitoring cycles.
     '''
     ### ~ [1] ~ Start first set of jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     firstslot = self.getInt('KeepFree')     # Skip first node(s)
     for slot in range(firstslot,self.nprocs()): self.nextJob(slot)
     pidcheck = '%s.pid' % rje.baseFile(self.log.info['LogFile'])
     ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     while self.dict['Running']:
         PIDCHECK = open(pidcheck,'w')
         for j in rje.sortKeys(self.dict['Running']):
             job = self.dict['Running'][j]
             if not job:     # No more jobs
                 self.dict['Running'].pop(j)
                 continue
             try:
                 pid = job['PID']
                 PIDCHECK.write('%s: %s\n' % (j,pid))
                 if ('%s' % pid).split()[0] == 'WAIT': finished = True
                 else: finished = os.waitpid(pid,os.WNOHANG)[0] > 0     # First item is 0 while still running
             except: finished = True     # Any failure to check the job is treated as the job having ended
             if finished: self.endJob(j)     # subjob on processor j has finished: can replace with processing
         PIDCHECK.close()
         time.sleep(self.getInt('SubSleep'))
예제 #24
0
 def outputCards(self):  ### Outputs cards to delimited file
     '''
     Appends the parsed GeneCard data to the delimited file named by self.info['CardOut'].
     - With both 'Purify' and 'Restrict' on, self.list['Genes'] is first updated in place: each gene whose card has a
       different (and not '!FAILED!') symbol is replaced by that symbol.
     - 'Restrict' limits the output to genes in self.list['Genes'].
     - 'Purify' skips cards whose symbol is neither their own key nor '!FAILED!'.
     Every card that is output has its 'Alias' and 'Species' values set. Cards stored under their own symbol are also
     checked for missing/empty 'EnsEMBL' and 'EnsLoci' values, and the totals are logged.
     '''
     ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     genelist = self.list['Genes']
     if self.opt['Purify'] and self.opt['Restrict']:
         for gene in list(genelist):     # Loop over a copy as genelist is edited inside the loop
             symbol = self.dict['GeneCard'][gene]['Symbol']
             if symbol in [gene,'!FAILED!']: continue
             genelist.remove(gene)       # Replace with symbol
             if symbol not in genelist: genelist.append(symbol)
     delimit = rje.delimitFromExt(filename=self.info['CardOut'])
     CARDOUT = open(self.info['CardOut'],'a')
     ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     noens = 0; noloci = 0; ox = 0
     for gene in rje.sortKeys(self.dict['GeneCard']):
         card = self.dict['GeneCard'][gene]
         if self.opt['Restrict'] and gene not in genelist: continue
         if self.opt['Purify'] and card['Symbol'] not in [gene,'!FAILED!']: continue
         self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox)); ox += 1
         card['Alias'] = gene
         card['Species'] = self.info['Species']
         rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,card)
         if card['Symbol'] != gene: continue     # Aliases are not counted in the EnsEMBL/EnsLoci checks
         if not ('EnsEMBL' in card and card['EnsEMBL']): noens += 1
         if not ('EnsLoci' in card and card['EnsLoci']): noloci += 1
     CARDOUT.close()
     self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (len(self.list['Genes']),self.info['CardOut']))
     self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
예제 #25
0
 def addLinks(self,nested): ### Adds href aname links to definitions.
     '''
     Adds href aname links to definitions.
     >> nested:dict = (sub)tree of glossary terms. The '=' key holds the raw definition text of a term; every other
        key except '+' holds a nested dictionary, which is processed recursively.
     For each raw definition, a linked version is generated and stored under the '+' key of the same dictionary:
     - with the 'HRef' option, words containing <url> are converted into external <a href="..."> links;
     - runs of words that match a term with a definition in self.dict['Glossary'] become internal links
       (<a href="#key">), or <scaps> elements if 'HTMLStyle' is 'tab'. A term is not linked to itself;
     - text of the form " _word(s)_" is converted to italics.
     Exceptions are passed to self.errorLog() and not raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Final characters that may be removed from a word when trying to match it to a glossary term
         endstrip = [')','.',',',':',';','!']
         if self.getBool('Plurals'): endstrip.append('s')
         for term in rje.sortKeys(nested):
             if term == '=':
                 linkdef = []    # Words/links of the new definition, joined with spaces at the end
                 # Space added after each '(' so that a leading bracket is split off as its own word (reversed below)
                 rawdef = string.split(string.replace(nested['='],'(','( '))
                 while rawdef:
                     glossary = self.dict['Glossary']    # Each new match starts again at the top of the glossary
                     ## ~ External links: <url>, <url>extra or <url>[link text]extra ~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                     if self.getBool('HRef') and rje.matchExp('<(\S+)>',rawdef[0]):
                         safetynet = rawdef[0:]  # Copy of the remaining words, restored if URL parsing fails
                         url = rje.matchExp('<(\S+)>',rawdef[0])[0]
                         # Convert first word to "[link text]extra" form; the URL itself is used if no text was given
                         if rje.matchExp('<(\S+)>\[(\S+)',rawdef[0]): rawdef[0] = '[%s' % rje.matchExp('<(\S+)>\[(\S+)',rawdef[0])[1]
                         elif rje.matchExp('<(\S+)>(\S+)',rawdef[0]): rawdef[0] = '[%s]%s' % (url,rje.matchExp('<(\S+)>(\S+)',rawdef[0])[1])
                         else: rawdef[0] = '[%s]' % url
                         try:
                             # Link text may contain spaces: merge following words until the closing bracket is found
                             while ']' not in rawdef[0]: rawdef[0] = '%s %s' % (rawdef[0],rawdef.pop(1))
                             (linktext,linkextra) = rje.matchExp('\[(.+)\](\S*)',rawdef.pop(0))
                             if url[:3] not in ['htt','ftp']: url = 'http://%s' % url
                             linkdef.append('<a href="%s">%s</a>%s' % (url,linktext,linkextra))
                             continue
                         except:
                             self.errorLog('Problem parsing URL from "%s"' % nested['='])
                             rawdef = safetynet  # Fall through and process the original word(s) as normal text
                     ## ~ Plain words: not a glossary term, with or without the final character ~~~~~~~~~~~~~~~ ##
                     if rawdef[0].lower() not in glossary:
                         if rawdef[0].lower()[:-1] not in glossary or rawdef[0].lower()[-1] not in endstrip:
                             linkdef.append(rawdef.pop(0)); continue
                     ## ~ Glossary terms: descend the nested glossary one word at a time ~~~~~~~~~~~~~~~~~~~~~~ ##
                     akey = []; alink = []   # Matched glossary keys and the original words that matched them
                     while rawdef and (rawdef[0].lower() in glossary or rawdef[0].lower()[:-1] in glossary):
                         # Exact matches with a definition, or that lead on to the next word, are tried before
                         # matches that need the final character to be removed
                         if rawdef[0].lower() in glossary and '=' in glossary[rawdef[0].lower()]: rterm = rawdef[0].lower()
                         elif len(rawdef) > 1 and rawdef[0].lower() in glossary and (rawdef[1].lower() in glossary[rawdef[0].lower()] or rawdef[1].lower()[:-1] in glossary[rawdef[0].lower()]): rterm = rawdef[0].lower()
                         elif rawdef[0].lower()[-1] in endstrip and rawdef[0].lower()[:-1] in glossary: rterm = rawdef[0].lower()[:-1]
                         elif rawdef[0].lower() in glossary: rterm = rawdef[0].lower()
                         else: break
                         glossary = glossary[rterm]
                         akey.append(rterm)
                         alink.append(rawdef.pop(0))
                     akey = string.join(akey,'_')
                     if '=' in glossary:     # Matched words end at a term that has its own definition
                         alink = string.join(alink)
                         if nested == glossary: linkdef.append(alink)    # No link from a term to itself
                         elif self.getStr('HTMLStyle') != 'tab':
                             # Final punctuation (but not a plural 's') is placed outside the link
                             if alink[-1] in endstrip and alink[-1] != 's': linkdef.append('<a href="#%s">%s</a>%s' % (akey,alink[:-1],alink[-1]))
                             else: linkdef.append('<a href="#%s">%s</a>' % (akey,alink))
                         else:
                             if alink[-1] in endstrip and alink[-1] != 's': linkdef.append('<scaps>%s</scaps>%s' % (alink[:-1],alink[-1]))
                             else: linkdef.append('<scaps>%s</scaps>' % (alink))
                     else:
                         # Only part of a term matched: keep first word as text and return the rest for re-processing
                         linkdef.append(alink[0])
                         rawdef = alink[1:] + rawdef
                 nested['+'] = string.replace(string.join(linkdef),'( ','(')
                 # Convert each " _text_" into " <i>text</i>", one match at a time
                 while rje.matchExp(' _([^_]+)_',nested['+']):
                     italics = rje.matchExp(' _([^_]+)_',nested['+'])[0]
                     nested['+'] = string.replace(nested['+'],' _%s_' % italics,' <i>%s</i>' % italics)
                 #self.deBug(nested)
             elif term != '+': self.addLinks(nested[term])   # Sub-terms: recurse into the nested dictionary
     except: self.errorLog('%s.addLinks error' % self)
예제 #26
0
    def saveMutations(self):    ### Outputs parsed mutations into a delimited file
        '''
        Writes all parsed mutations to the tab-delimited file omim_mutations.tdt.
        One row is output per gene and sub-ID of self.dict['Mutations'], in sorted order. Each (disease,mutation) pair
        has its mutation string split into three non-digit characters, a number and three more non-digit characters,
        which are output as WildAA, Pos and MutAA. OMIM_ID is the '; '-joined self.dict['Records'] list for the gene.
        Exceptions are passed to self.log.errorLog() and not raised.
        '''
        try:### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            outfile = 'omim_mutations.tdt'
            headers = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease']
            rje.delimitedFileOutput(self,outfile,headers,'\t',rje_backup=True)   # Call without data, before any rows
            ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for gene in rje.sortKeys(self.dict['Mutations']):
                genemut = self.dict['Mutations'][gene]
                for subid in rje.sortKeys(genemut):
                    (disease,mutation) = genemut[subid]
                    (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                    datadict = {'OMIM_ID':'; '.join(self.dict['Records'][gene])}
                    datadict['SubID'] = subid
                    datadict['Gene'] = gene
                    datadict['Pos'] = pos
                    datadict['WildAA'] = wild
                    datadict['MutAA'] = mut
                    datadict['Disease'] = disease
                    rje.delimitedFileOutput(self,outfile,headers,'\t',datadict)
            self.log.printLog('#OUT','OMIM Mutation output to %s complete' % outfile)
        except: self.log.errorLog(rje_zen.Zen().wisdom())
예제 #27
0
 def fpi(self):  ### Family-protein interactions
     '''
     Family-protein interactions.
     For each PPI hub whose family in self.dict['Fam'] has 2+ members, the interactors of all family members are
     pooled and those occurring at least twice in the pool are kept as the family's interactors. These are removed
     from the PPI lists of the family members (along with the reciprocal interactions), and their accession
     numbers are written, one per line, to SLiMPID_FPI/<gene>.fpi.acc, where <gene> is self.dict['Gene'][qry].
     Names missing from self.dict['Seq'] are reported in a single #BAD log line. Finally, hubs that have no
     interactors left (or have an empty name) are removed from self.dict['PPI'].
     << None. Returns immediately if self.dict['Domain'] is empty. Exceptions are passed to self.errorLog().
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_FPI'
         rje.mkDir(self,outdir)
         fpi = {}            # Dictionary of {family:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for qry in rje.sortKeys(self.dict['PPI']):
             try:
                 fam = self.dict['Fam'][qry]
                 if len(fam) < 2: continue
             except: self.errorLog('Problem with "%s" protein family' % qry); continue
             ## ~ [2a] Pool family interactors and keep those seen 2+ times ~~~~~~~~~~~~~~~~~~~~ ##
             pooled = []
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 pooled += self.dict['PPI'][hub]     # Add with redundancy
             tally = {}
             for spoke in pooled: tally[spoke] = tally.get(spoke,0) + 1
             fpi[qry] = [spoke for spoke in pooled if tally[spoke] > 1]  # Must have 2+ family interactions
             ## ~ [2b] Remove family interactions from the individual PPI data ~~~~~~~~~~~~~~~~~ ##
             shared = set(fpi[qry])
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:     # Loop over a copy as the list is edited
                     if spoke in shared:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
             fpi[qry] = rje.sortUnique(fpi[qry],False,False)
             ## ~ [2c] Output accession numbers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             acc = []
             gene = self.dict['Gene'][qry]
             for name in fpi[qry]:
                 if not name: continue
                 if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname: badname.append(name)
             ACCFILE = open('%s/%s.fpi.acc' % (outdir,gene),'w')
             try: ACCFILE.write('\n'.join(acc))
             finally: ACCFILE.close()    # Explicit close: do not rely on garbage collection to flush the file
             self.printLog('#FPI','%s family => %d interactors' % (gene,len(acc)))
         if badname:
             badname.sort()
             self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),'; '.join(badname)))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#FPI','No %s PPI left after FPI removed' % hub)
         self.printLog('#PPX','%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.fpi()',quitchoice=True)
예제 #28
0
 def domainFasta(self):    ### Outputs parsed domain and domain PPI datasets in Fasta format
     '''
     Outputs parsed domain data as delimited tables and domain PPI datasets in Fasta format.
     1. <OutDir>HPRD.domains.tdt gets one Domain/HPRD/Gene row for each protein of each domain in
        self.dict['Domains'], and <OutDir>HPRD.domsource.tdt gets one Domain/Source row per domain source.
     2. For each domain, the interactors (from self.dict['PPI']) of all proteins with that domain are pooled without
        redundancy and sorted, and their 'Seq' data from self.dict['HPRD'] is saved to
        <OutDir>HPRD_Domain_Datasets/<domain>_hprd.fas. With the 'AllIso' option, 'Seq' is treated as a list and all
        of its items are added. Domains with no interactors are logged and no file is saved for them.
     Any exception is logged and then re-raised.
     '''
     try:
         ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         headers = ['Domain','HPRD','Gene']
         dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
         rje.delimitedFileOutput(self,dfile,headers,'\t')
         sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
         shead = ['Domain','Source']
         rje.delimitedFileOutput(self,sfile,shead,'\t')
         domains = rje.sortKeys(self.dict['Domains'])    # Sorted once and used for both output stages
         dx = 0.0; dtot = len(domains)
         for domain in domains:
             self.log.printLog('\r#DOM','HPRD Domain output (%s): %.1f%%' % (dfile,dx/dtot),newline=False,log=False)
             dx += 100.0
             for hid in self.dict['Domains'][domain]:
                 datadict = {'Domain':domain,'HPRD':hid,'Gene':self.dict['HPRD'][hid]['gene']}
                 rje.delimitedFileOutput(self,dfile,headers,'\t',datadict)
             for source in self.dict['DomainSource'][domain]:
                 datadict = {'Domain':domain,'Source':source}
                 rje.delimitedFileOutput(self,sfile,shead,'\t',datadict)
         self.log.printLog('\r#DOM','HPRD Domain output (%s): %s domains.' % (dfile,rje.integerString(dtot)))
         ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         datpath = self.info['OutDir'] + rje.makePath('HPRD_Domain_Datasets/')
         rje.mkDir(self,datpath)
         for domain in domains:
             ## Generate a list of all interactors with domain-containing proteins ##
             partners = set()    # A set removes redundancy without repeated list searches
             for p1 in self.dict['Domains'][domain]:
                 if p1 in self.dict['PPI']: partners.update(self.dict['PPI'][p1])
             plist = sorted(partners)
             ## Generate Sequence list and output ##
             mylist = []
             for p in plist:
                 if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq']
                 else: mylist.append(self.dict['HPRD'][p]['Seq'])
             sfile = '%s%s_hprd.fas' % (datpath,domain)
             if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
             else: self.log.printLog('#DOM','No PPI partners for domain "%s"' % domain)
         self.log.printLog('\r#DOM','HPRD Domain fasta output complete.')
     except:
         self.log.errorLog('Error in HPRD.domainFasta()',printerror=True,quitchoice=False)
         raise
예제 #29
0
 def loadFeatures(self,ftfile):  ### Loads features from given file
     '''
     Loads features from given file into self.dict['Features'].
     >> ftfile:str = name of delimited features file; '' or 'none' means that there is nothing to load.
     The first field of the file is used as the ID. The file must also have 'feature', 'start', 'end' and
     'description' fields (any case), where the positions are named 'ft_start' and 'ft_end' if either of those
     fields is present. Each description is added to the ID's list in self.dict['Features'] as a dictionary:
     {'Type':feature,'Start':int,'End':int,'Desc':description}.
     << None; or the return value of self.printLog() if the file or a required field is missing.
     Exceptions are passed to self.errorLog() and not raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if ftfile in ['','none']: return
         if not os.path.exists(ftfile): return self.printLog('#ERR','Features file "%s" missing' % ftfile)
         delimit = rje.delimitFromExt(filename=ftfile)
         ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         FTFILE = open(ftfile,'r')
         try: headers = rje.readDelimit(FTFILE.readline(),delimit)
         finally: FTFILE.close()     # Only the header line is needed here
         mainkeys = [headers[0]]
         hmap = {}       # {lower case header:header as found in file}
         for h in headers: hmap[h.lower()] = h
         pos = ''    # Leader for start/end positions
         if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
         for h in ['feature','%sstart' % pos,'%send' % pos,'description']:
             if h not in hmap: return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile))
             mainkeys.append(hmap[h])
         mainkeys.remove(hmap['description'])    # Description is required but is data rather than part of the key
         ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # NOTE(review): datakeys uses the literal 'description' but values are read back below using the header as
         # found in the file (hmap['description']) - confirm that rje.dataDict() copes with headers of a different case
         ftdata = rje.dataDict(self,ftfile,mainkeys,['description'],delimit,headers,lists=True)
         (mx,mtot,fx) = (0.0,len(ftdata),0)
         for mainkey in rje.sortKeys(ftdata):
             self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot))
             mx += 100.0
             (acc,ft,start,end) = mainkey.split(delimit)     # Key = the four mainkeys fields joined by delimit
             if acc == mainkeys[0]: continue     # Skip key whose ID is the name of the ID field, i.e. the header row
             if acc not in self.dict['Features']: self.dict['Features'][acc] = []
             for desc in ftdata[mainkey][hmap['description']]:
                 fx += 1
                 self.dict['Features'][acc].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc})
         self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile))
     except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
예제 #30
0
 def makeChildren(self): ### Goes through GO dictionary and adds 'child_terms' to dictionary
     '''
     Goes through GO dictionary and adds 'child_terms' to dictionary.
     Every entry of the dictionary returned by self.go() is given a fresh 'child_terms' list. Each GO ID is then
     appended to the 'child_terms' of every parent listed under the relationship keys in self.list['ParentTerms'],
     and the child lists are sorted. Parents missing from the GO dictionary are reported with self.errorLog().
     Any exception is logged and re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         godict = self.go()
         goids = rje.sortKeys(godict)    # Keys are not added or removed below, so one sorted list serves all loops
         for goid in goids:
             godict[goid]['child_terms'] = []
         done = 0.0
         steps = len(godict) * 2         # Two passes over the GO IDs: linking then sorting
         ### ~ [2] ~ Add children ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for goid in goids:
             self.progLog('#GO','Making GO children: %.2f%%' % (done/steps))
             done += 100.0
             goterm = godict[goid]
             for reltype in self.list['ParentTerms']:
                 if reltype not in goterm: continue
                 for parent in goterm[reltype]:
                     if parent in godict:
                         godict[parent]['child_terms'].append(goid)
                     else:
                         self.errorLog('%s %s ID "%s" missing!' % (goid,reltype,parent),printerror=False)
         for goid in goids:
             self.progLog('#GO','Making GO children: %.2f%%' % (done/steps))
             done += 100.0
             godict[goid]['child_terms'].sort()
         self.printLog('\r#GO','Making GO children complete.')
     except:
         self.errorLog('Major problem with GO.makeChildren()')
         raise
예제 #31
0
 def dpi(self):  ### Domain-protein interactions
     '''
     Domain-protein interactions.
     For each domain in self.dict['Domain'], the PPI lists of the domain's hubs are pooled and any interactor
     ("spoke") found two or more times in the pool is classed as a domain-protein interaction (DPI). DPI are
     removed from self.dict['PPI'] in both directions and the accession numbers of the DPI spokes are written to
     SLiMPID_DPI/<domain>.dpi.acc (one per line). Spoke names missing from self.dict['Seq'] are reported as "bad"
     names. Finally, hubs left without interactors (and any empty hub name) are removed from self.dict['PPI'].
     Nothing is done if self.dict['Domain'] is empty. Errors are reported via self.errorLog(quitchoice=True).
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_DPI'
         rje.mkDir(self,outdir)
         ppi = self.dict['PPI']
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(self.dict['Domain']):
             hubs = self.dict['Domain'][dom]
             ## ~ [2a] Pool interactors of all hubs (with redundancy) and count occurrences ~~~~~ ##
             pooled = []
             for hub in hubs:
                 if hub in ppi: pooled += ppi[hub]
             spokex = {}     # {spoke: number of occurrences in pool}
             for spoke in pooled: spokex[spoke] = spokex.get(spoke,0) + 1
             domdpi = [spoke for spoke in pooled if spokex[spoke] > 1]   # Must have 2+ domain interactions
             dpiset = set(domdpi)    # Constant-time membership tests for the removal step
             ## ~ [2b] Remove DPI from the PPI data (both directions) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for hub in hubs:
                 if hub not in ppi: continue
                 for spoke in ppi[hub][0:]:      # Iterate over a copy: the list is edited in the loop
                     if spoke not in dpiset: continue
                     ppi[hub].remove(spoke)
                     if spoke in ppi and hub in ppi[spoke]: ppi[spoke].remove(hub)
             ## ~ [2c] Convert to accession numbers and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             domdpi = rje.sortUnique(domdpi,False,False)
             acc = []
             for name in domdpi:
                 if not name: continue
                 if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname: badname.append(name)
             ACCFILE = open('%s/%s.dpi.acc' % (outdir,dom),'w')
             try: ACCFILE.write('\n'.join(acc))
             finally: ACCFILE.close()    # Make sure the data is flushed and the handle released
             self.printLog('#DPI','%s domain => %d interactors' % (dom,len(acc)))
         if badname:
             badname.sort()
             self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),'; '.join(badname)))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(ppi)
         for hub in rje.sortKeys(ppi):
             if hub and ppi[hub]: continue
             ppi.pop(hub)
             self.printLog('#DPI','No %s PPI left after DPI removed' % hub,screen=False)
         self.printLog('#PPX','%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(ppi)),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.dpi()',quitchoice=True)
예제 #32
0
 def setup(self):    ### Sets up headers and reads in existing data if present
     '''
     Sets up headers and reads in existing data if present.
     - Output headers start as ['Alias','Species'] plus the module-level gc_headers list and are extended with any
       additional headers found in the alternative source files. The result is stored in self.list['Headers'].
     - self.readHGNC() is called before any alternative source is read.
     - Each file in self.list['AltSource'] is read with rje.dataDict() and merged into self.dict['GeneCard'] under
       the upper-case gene key and also under the gene's 'Symbol' (if present and not '!FAILED!').
     - If the 'Update' option is set and the self.info['CardOut'] file exists, that file is added as a final source,
       its genes are added to self.list['Genes'] and the 'Append' option is switched off.
     Any exception is logged and re-raised.
     '''
     try:
         ### ~ Setup Basic Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         #X#headers = ['Alias','Species','Symbol','HGNC','Entrez','UniProt','EnsEMBL','HPRD','OMIM','EnsLoci','Desc']
         headers = ['Alias','Species'] + gc_headers  # All other headers added from altsource list
         ### ~ Read in data from existing files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.readHGNC()
         # NOTE: this appends to self.list['AltSource'] in place, so the list is changed for later users too
         if self.opt['Update'] and os.path.exists(self.info['CardOut']): self.list['AltSource'].append(self.info['CardOut'])
         for altsource in self.list['AltSource']:
             sourcefile = rje.makePath(altsource,True)
             if not os.path.exists(sourcefile):
                 # Missing source: report (user may choose to quit) and move on to the next source
                 self.log.errorLog('Alternative source "%s" missing!' % sourcefile,printerror=False,quitchoice=True)
                 continue
             update = rje.dataDict(self,sourcefile,getheaders=True,ignore=['#'])
             # The 'Headers' entry is removed so that only gene keys remain in update
             for h in update.pop('Headers'):
                 if h not in headers:
                     headers.append(h)
             self.log.printLog('#DATA','Read GeneCards data for %d genes.' % (len(update)))
             for gene in rje.sortKeys(update):     # Each source will overwrite data from the file before
                 ## ~ Convert to Upper Case for consistency ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if gene != gene.upper() and gene.upper() in update: continue    # Only use upper case one!
                 elif gene != gene.upper():
                     # Re-key the entry in upper case. Safe because the loop iterates over a sorted key list.
                     update[gene.upper()] = update.pop(gene)
                     gene = gene.upper()
                 if gene == '!FAILED!': continue
                 ## ~ Update main dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if self.opt['Update'] and altsource == self.info['CardOut'] and gene not in self.list['Genes']: self.list['Genes'].append(gene)
                 if gene in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][gene],update[gene])
                 else: self.dict['GeneCard'][gene] = update[gene]    # Stored by reference: shares the update[gene] dictionary
                 ## ~ Temp Debugging ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if gene in self.list['TestGenes']:
                     print gene
                     print update[gene]
                     self.deBug(self.dict['GeneCard'][gene])
                 ## ~ Check Aliases etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'Symbol' in self.dict['GeneCard'][gene]: self.dict['GeneCard'][gene]['Symbol'] = self.dict['GeneCard'][gene]['Symbol'].upper()
                 if 'Symbol' in update[gene] and update[gene]['Symbol'] != '!FAILED!':
                     # Data is also stored under the gene symbol. Existing symbol data is combined with overwrite=False.
                     symbol = update[gene]['Symbol']
                     if symbol in self.dict['GeneCard']: rje.combineDict(self.dict['GeneCard'][symbol],update[gene],overwrite=False,replaceblanks=True)
                     else: self.dict['GeneCard'][symbol] = update[gene]
                 self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])),newline=False,log=False)
                 # Flag gene identifiers that contain whitespace
                 if len(string.split(gene)) > 1: print '!!!', gene, '!!!'
         ### ~ Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])))
         self.list['Headers'] = headers[0:]     # Stored as a copy of the local list
         if self.opt['Update']: self.opt['Append'] = False
         #x#if 'TASP1' in self.dict['GeneCard']: self.deBug(self.dict['GeneCard']['TASP1'])
         #x#else: self.deBug(rje.sortKeys(self.dict['GeneCard']))
     except:
         self.log.errorLog('Problem during GeneCards.setup()')
         raise
예제 #33
0
 def setup(self,gtext=''):    ### Main class setup method. gtext will over-ride input file.
     '''
     Main class setup method. gtext will over-ride input file.
     >> gtext:str [''] = Glossary text with one "Term<delimiter>Definition" entry per line. If empty, the glossary
         is loaded from the 'InFile' setting instead.
     << Returns True if setup was successful, else False (after logging the error).
     Sets self.obj['HTML'], self.list['Terms'] (unless already populated) and self.dict['Glossary']. The latter is a
     nested dictionary keyed on the successive lower-case words of each term, with the definition stored under '='.
     The 'TermSplit' setting gives the delimiter for text input; the names 'tab', 'space', 'comma', 'period (.)'
     and 'colon' are converted to the corresponding character.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['HTML'] = rje_html.HTML(self.log,self.cmd_list)
         ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('InFile')))
         if self.getStr('OutFile').lower() in ['','none']: self.str['OutFile'] = '%s.html' % self.basefile()
         ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         interms = []        # Terms in input order (text input only)
         glossary = None     # {Term:{'Definition':str}}; None until loaded
         if not gtext:
             infile = self.getStr('InFile')
             try:    # A table starting with a "Term" header is read with dataDict(), unless input order must be kept
                 if not self.getBool('KeepOrder'):
                     INFILE = open(infile,'r')
                     try: firstline = INFILE.readline()
                     finally: INFILE.close()
                     if firstline[:4] == 'Term':
                         glossary = rje.dataDict(self,infile,mainkeys=['Term'],datakeys=['Term','Definition'])
             except:
                 self.errorLog('Problem reading input as dataDict(). Will try as text.')
                 glossary = None
             if glossary is None:
                 # The file text is parsed below rather than through a recursive setup() call, so that an empty
                 # file gives an empty glossary instead of recursing without end.
                 INFILE = open(infile,'r')
                 try: gtext = INFILE.read()
                 finally: INFILE.close()
         if glossary is None:
             delimit = self.getStr('TermSplit')
             named = {'tab':'\t','space':' ','comma':',','period (.)':'.','colon':':'}
             delimit = named.get(delimit.lower(),delimit)
             glossary = {}
             for line in gtext.split('\n'):
                 splitline = line.split(delimit)
                 # A sentence-ending full stop should not leave an empty final field
                 if delimit == '.' and (splitline[-1] in ['',' ']): splitline = splitline[:-1]
                 if not splitline: continue
                 (term,definition) = (splitline[0],delimit.join(splitline[1:]))
                 if term == 'Term' and not glossary: continue    # Header line
                 if term:
                     glossary[term] = {'Definition':definition}
                     interms.append(term)
         ## ~ [1c] Restrict and/or order terms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.list['Terms']:
             for term in list(glossary.keys()):     # Copy of keys: entries are removed inside the loop
                 if term not in self.list['Terms']: glossary.pop(term)
         elif self.getBool('KeepOrder'): self.list['Terms'] = interms
         else: self.list['Terms'] = rje.sortKeys(glossary)
         for term in glossary: glossary[term] = glossary[term]['Definition']
         ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         nested = {}
         for term in glossary:
             tdict = nested
             for word in term.lower().split():
                 if word not in tdict: tdict[word] = {}
                 tdict = tdict[word]
             tdict['='] = glossary[term]
         self.dict['Glossary'] = nested
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
예제 #34
0
 def sgd2sp(self):   ### Reformats yeast sequence names and outputs new data for GOPHER
     '''
     Reformats yeast sequence names and outputs new data for GOPHER.
     UniProt entries are read from <Basefile>.dat if it exists and the 'Force' option is off; otherwise they are
     extracted for the UniProt accessions of the 'XRef' table and saved to that file. Each sequence with SpecCode
     'YEAST' is then renamed using its XRef entries (matched on the 'EnsG' field) and, where the UniProt sequence
     is identical, given a UniprotID__AccNum identifier, recorded in self.dict['Rename'] as {SGD:AccNum}.
     <Basefile>.ygob.fas (all sequences) and <Basefile>.yeast.fas (yeast only) are saved unless already present,
     and self.list['YeastSeq'] is set from the yeast sequences. Any exception is logged and re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = self.obj['SeqList']
         uniprot = rje_uniprot.UniProt(self.log,self.cmd_list+['datout=None'])
         xdb = self.db('XRef')
         self.dict['Rename'] = {}
         ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         datfile = '%s.dat' % self.info['Basefile']
         if not os.path.exists(datfile) or self.opt['Force']:
             uniprot.readUniProt(clear=True,acclist=rje.sortKeys(xdb.index('UniProt')),cleardata=False)
             uniprot.saveUniProt(datfile)
         else:
             uniprot.readUniProt(datfile,clear=True,cleardata=False)
         ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniseq = {}     # {AccNum: UniProt sequence object}
         for entry in uniprot.entries():
             useq = entry.obj['Sequence']
             uniseq[useq.info['AccNum']] = useq
         self.printLog('\r#USEQ','%s UniProt Sequences extracted (%s Ensembl AccNum)' % (rje.iStr(len(uniseq)), rje.iStr(len(xdb.index('UniProt')))))
         ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         yeast = []      # List of YEAST sequence objects
         done = 0.0
         total = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#SEQ','Reformatting sequence names: %.2f%%' % (done/total))
             done += 100.0
             if seq.info['SpecCode'] != 'YEAST': continue
             yeast.append(seq)
             sgd = seq.info['AccNum']
             newname = seq.info['Name']
             try:
                 for xref in xdb.indexEntries('EnsG',sgd):
                     acc = xref['UniProt']
                     if not acc:     # No UniProt mapping: annotate and try the next cross-reference
                         newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (seq.info['Name'],xref['Gene'],xref['EnsG'],xref['SGD'])
                         continue
                     newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (seq.info['Name'],xref['Gene'],xref['EnsG'],xref['SGD'],acc)
                     if acc not in uniseq:
                         self.printLog('\r#UNIERR','Unable to find UniProt sequence %s (%s)' % (acc,sgd))
                         continue
                     if uniseq[acc].info['Sequence'] != seq.info['Sequence']:
                         self.printLog('\r#SEQERR','%s sequence <> %s sequence' % (sgd,acc))
                         continue
                     # Identical sequence: replace the first word of the name with the UniProt identifier
                     words = newname.split()
                     words[0] = '%s__%s' % (xref['UniprotID'],acc)
                     newname = ' '.join(words)
                     self.dict['Rename'][sgd] = acc
                     break
             except:
                 self.errorLog('%s problem' % sgd)
             seq.info['Name'] = newname
             seq.extractDetails(gnspacc=True)
         self.printLog('\r#SEQ','Reformatting sequence names complete.')
         ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         ygobfas = '%s.ygob.fas' % self.info['Basefile']
         if not rje.exists(ygobfas):
             seqlist.saveFasta(seqfile=ygobfas)
         yeastfas = '%s.yeast.fas' % self.info['Basefile']
         if not rje.exists(yeastfas):
             seqlist.saveFasta(seqs=yeast,seqfile=yeastfas)
         self.list['YeastSeq'] = seqlist.accList(yeast)
     except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
예제 #35
0
 def summaryScores(self,rankdb=None,sumstr='taxasum',minsum='MinSum'):   ### Generates summary scores from rank table.
     '''
     Generates summary scores from rank table.
     >> rankdb:Table [None] = Table of per-entry taxonomic assignments. The 'taxamap' table is used if not given.
         Each entry needs a 'boot' value and a '|'-separated taxon list for each rank field.
     >> sumstr:str ['taxasum'] = Name of the summary table to create.
     >> minsum:str ['MinSum'] = Name of the numeric setting giving the minimum count for a taxon to be kept;
         taxa with smaller counts are pooled into "Other".
     For each of genus, family, order, class and phylum, every entry contributes 1/len(taxa) to the count of
     each of its taxa and boot/len(taxa) to the bootstrap-weighted count. Taxa in self.list['TaxFilter'] are
     ignored. Counts, percentages and mean bootstrap values are added to the summary table, which is then ranked
     and saved. Errors are reported via self.errorLog() and not re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         db = self.db()
         if not rankdb: rankdb = self.db('taxamap')
         sumdb = db.addEmptyTable(sumstr,['rank','taxon','count','bootwt','meanboot','perc','wtperc'],['rank','taxon'])
         ranks = ['genus','family','order','class','phylum']
         mincount = self.getNum(minsum)  # Looked up once rather than for every taxon
         ### ~ [2] Normalise to reduced levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for rank in ranks:
             self.printLog('\r#RANK','Normalising %s data.' % rank)
             taxsum = {}; ranksum = 0.0  # Summed counts for taxa and rank total
             taxwt = {}; wtsum = 0.0     # Bootstrap-weighted summed counts for taxa and rank total
             bootsum = {}; bootx = {}    # Sum and count of bootstrap values for mean boot numbers
             for entry in rankdb.entries():
                 taxa = entry[rank].split('|')
                 for taxon in taxa:
                     if taxon in self.list['TaxFilter']: continue
                     if taxon not in taxsum:
                         taxsum[taxon] = 0.0; taxwt[taxon] = 0.0
                         bootsum[taxon] = 0.0; bootx[taxon] = 0
                     taxsum[taxon] += 1.0 / len(taxa)
                     ranksum += 1.0 / len(taxa)
                     # float() guards against integer division truncating the weight if boot is stored as an integer
                     taxweight = float(entry['boot'])
                     bootsum[taxon] += taxweight; bootx[taxon] += 1
                     taxwt[taxon] += taxweight / len(taxa)
                     wtsum += taxweight / len(taxa)
             ## ~ Pool low count taxa into "Other" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             otherx = 0
             for taxon in rje.sortKeys(taxsum):
                 if taxon == 'Other': continue
                 if taxsum[taxon] < mincount:
                     if 'Other' not in taxsum:
                         taxsum['Other'] = 0.0
                         taxwt['Other'] = 0.0
                         bootsum['Other'] = 0.0
                         bootx['Other'] = 0
                     taxsum['Other'] += taxsum.pop(taxon)
                     taxwt['Other'] += taxwt.pop(taxon)
                     bootsum['Other'] += bootsum.pop(taxon)
                     bootx['Other'] += bootx.pop(taxon)
                     otherx += 1
             self.printLog('#MINSUM','%s %s taxa converted to "Other" (count < minsum=%s)' % (rje.iStr(otherx),rank,mincount))
             for taxon in taxsum:
                 # All weights can be zero (every boot value 0): report 0% rather than dividing by zero.
                 # ranksum cannot be zero here because every taxon present has added a positive share to it.
                 wtperc = 0.0
                 if wtsum: wtperc = 100.0*taxwt[taxon]/wtsum
                 sumdb.addEntry({'rank':rank,'taxon':taxon,'count':rje.dp(taxsum[taxon],1),
                                 'perc':rje.sf(100.0*taxsum[taxon]/ranksum),
                                 'bootwt':rje.dp(taxwt[taxon],1),'meanboot':rje.dp(bootsum[taxon]/bootx[taxon],3),
                                 'wtperc':rje.sf(wtperc)})
         ## ~ [2a] Rank taxa by counts such that highest is Rank 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         sumdb.rankFieldByIndex('rank','count',rev=True,absolute=True,lowest=True)
         sumdb.rankFieldByIndex('rank','bootwt',rev=True,absolute=True,lowest=True)
         ## ~ [2b] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         sumdb.saveToFile()
     except: self.errorLog('%s.summaryScores error' % self.prog())
예제 #36
0
 def pileUpFDR(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
     '''
     Calculates statistics of genetic differences from parsed PileUp Tables.
     Reads <basefile>.pdiff.tdt, taking the final whitespace-separated field of each line as its p-value, and
     records the file position of every line with p <= 0.05. Reading stops at the first line without a numeric
     final field. For each p-value in ascending order, FDR = p * npos / (number of lines with p-value <= p), where
     npos is the total number of non-'?' positions in self.dict['RefSeq']. The significant lines are written to
     <basefile>.fdr.tdt with an extra 'p.FDR' field.
     Nothing is done if the output exists and self.force() is not set.
     << Returns None. Errors are reported via self.errorLog() and not re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         fdrfile = '%s.fdr.tdt' % self.baseFile()
         if not self.force() and os.path.exists(fdrfile): return
         sigpval = {}    # pval:[fpos]
         npos = 0
         for locus in rje.sortKeys(self.dict['RefSeq']):
             refseq = self.dict['RefSeq'][locus]
             npos += len(refseq) - refseq.count('?')
         ### ~ [1] Parse out stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r')
         try:    # Input handle is closed whatever happens below
             headers = SAMSIG.readline().split() + ['p.FDR']
             fpos = SAMSIG.tell(); fline = SAMSIG.readline(); px = 0
             while fline:
                 self.progLog('\r#SIG','Reading Pvalues: %s p <= 0.05...' % rje.iStr(px))
                 try: pval = float(fline.split()[-1])
                 except (ValueError, IndexError): break      # Blank line or non-numeric final field ends the data
                 if pval <= 0.05:
                     if pval not in sigpval: sigpval[pval] = []
                     sigpval[pval].append(fpos); px += 1
                 fpos = SAMSIG.tell(); fline = SAMSIG.readline()
             self.printLog('\r#SIG','Reading Pvalues complete: %s p <= 0.05.' % rje.iStr(px))
             ### ~ [2] Calculate FDR and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             SAMFDR = open(fdrfile,'w')
             try:    # Output handle is closed (and flushed) whatever happens below
                 rje.writeDelimit(SAMFDR, headers)
                 px = 0; sx = 0.0; stot = len(sigpval)
                 for pval in rje.sortKeys(sigpval):
                     self.progLog('\r#FDR','Calculating FDR: %.2f%%' % (sx/stot)); sx += 100.0
                     px += len(sigpval[pval])    # Cumulative number of lines with p-value <= pval
                     if pval: fdr = (pval * npos) / px
                     else: fdr = 0.0
                     for fpos in sigpval[pval]:
                         SAMSIG.seek(fpos)       # Re-read the original line from its stored position
                         rje.writeDelimit(SAMFDR,rje.readDelimit(SAMSIG.readline())+[rje.expectString(fdr)])
             finally: SAMFDR.close()
         finally: SAMSIG.close()
         self.printLog('\r#FDR','%s FDR lines output to %s' % (rje.iStr(px),fdrfile))
     except: self.errorLog('%s.pileUpFDR() error' % (self)); return None
예제 #37
0
 def powerGO(self,numbers,sig=0.01,samples='all',total='Total',countkey='counts',ignore=[]):  ### Special GO power calculation for GO slim set
     '''
     Special GO power calculation for GO slim set.
     >> numbers:dictionary of {Sample:Count}
     >> sig:float [0.01] = Desired significance level to achieve. Currently uncorrected. Add Bonf/FDR with time.
     >> samples:str ['all'] = Whether sig must be achievable for 'any' or 'all' samples.
     >> total:str ['Total'] = Sample containing Total counts to compare against
     >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
     - self.go(id)[countkey] = {Sample:count}
     >> ignore:list of Samples to ignore from calculation (not modified by this method)
     << returns a list of GO IDs that meet criteria. An empty list is returned if an error occurs (error logged).
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         N = numbers[total]          # Total count for calculating expectations/probabilities
         skipped = list(ignore) + [total]    # Samples that are not assessed (built once, not per sample)
         nlist = []                  # List of counts for subsamples to be assessed
         for sample in numbers:
             if sample not in skipped: nlist.append(numbers[sample])
         nlist = rje.sortUnique(nlist,xreplace=False,num=True)
         ### ~ [2] ~ Generate Power Range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         plist = []                  # List of acceptable Total counts for subset
         nx = 0.0
         for i in range(1,N+1):      # Look at all possible levels of occurrence
             self.progLog('#POW','Calculating GO term power: %.1f%%' % (nx/N))
             nx += 100.0
             ok = 0
             p = float(i) / N        # Probability of each gene having this term
             for n in nlist:         # Look at each subset
                 k1 = min(i,n)       # Want to look at largest possible count for sample-term pairing
                 k2 = max(0,n-(N-i)) # Smallest possible count, for the likelihood of under-representation
                 # NOTE(review): rje.binomial(k,n,p) is presumably the probability of k or more successes - confirm
                 if rje.binomial(k1,n,p,callobj=self) <= sig: ok += 1
                 elif (1 - rje.binomial(k2+1,n,p,callobj=self)) <= sig: ok += 1
                 if ok and samples == 'any': break
             if (ok and samples == 'any') or ok == len(nlist): plist.append(i)
         self.printLog('\r#POW','Calculation of GO term power complete.',log=False)
         self.deBug(nlist)
         ### ~ [3] ~ Generate GO Slim ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         powered = set(plist)        # Set gives constant-time lookup for each of the GO terms below
         terms = []
         (ix,itot) = (0.0,len(self.go()))
         for goid in rje.sortKeys(self.go()):
             self.progLog('#POW','Assessing terms for power: %.1f%% (%s terms)' % (ix/itot,rje.iLen(terms)))
             ix += 100.0
             if self.go(goid)[countkey][total] in powered: terms.append(goid)
         self.printLog('\r#POW','Assessed terms for statistical power, p <= %s: %s GO terms' % (sig,rje.iLen(terms)))
         #!# Add correction terms #!#
         self.deBug(terms)
         return terms
     except: self.errorLog('Major problem with GO.powerGO()')
     return []
예제 #38
0
    def saveFasta(self):    ### Outputs parsed PPI datasets in Fasta format
        '''Outputs parsed PPI datasets in Fasta format.'''
        try:
            ### Setup ###
            datpath = self.info['OutDir'] + rje.makePath('HPRD_Datasets/')
            rje.mkDir(self,datpath)
            ## Check Seqs ##
            for p1 in rje.sortKeys(self.dict['PPI']):
                if 'Seq' not in self.dict['HPRD'][p1]:      #!# KeyError #!#
                    print p1, self.dict['HPRD'][p1]
                    self.deBug('No Seq for %s' % p1)

            ### All sequences ###
            self.obj['SeqList'].saveFasta()
            ### Output PPI Datasets ###
            for p1 in rje.sortKeys(self.dict['PPI']):
                mylist = []
                for p2 in self.dict['PPI'][p1]:
                    if self.opt['AllIso']: mylist += self.dict['HPRD'][p2]['Seq']
                    else: mylist.append(self.dict['HPRD'][p2]['Seq'])
                sfile = '%s%s_hprd.fas' % (datpath,self.dict['HPRD'][p1]['gene'])
                if mylist: self.obj['SeqList'].saveFasta(seqs=mylist,seqfile=sfile)
            self.log.printLog('#FAS','HPRD PPI fasta output complete.')
        except: self.log.errorLog('Error in HPRD.saveFasta()',printerror=True,quitchoice=False)
예제 #39
0
    def saveMutations(
            self):  ### Outputs parsed mutations into a delimited file
        '''
        Outputs parsed mutations into a delimited file.
        One line is written to omim_mutations.tdt (tab-delimited) for each entry of
        self.dict['Mutations'][gene][subid], which holds a (disease, mutation) pair. The mutation must contain
        three non-digit characters, a number and three more non-digit characters (e.g. "ARG123GLY"), which are
        output as WildAA, Pos and MutAA. Mutations that cannot be parsed are logged and skipped.
        Errors are reported via self.log.errorLog() and not re-raised.
        '''
        try:  ### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = [
                'OMIM_ID', 'SubID', 'Gene', 'Pos', 'WildAA', 'MutAA', 'Disease'
            ]
            outfile = 'omim_mutations.tdt'
            rje.delimitedFileOutput(self,
                                    outfile,
                                    headers,
                                    '\t',
                                    rje_backup=True)

            ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for gene in rje.sortKeys(self.dict['Mutations']):
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                    (disease, mutation) = self.dict['Mutations'][gene][subid]
                    # Raw string keeps the regular expression escapes away from Python's own escape handling
                    parsed = rje.matchExp(r'(\D\D\D)(\d+)(\D\D\D)', mutation)
                    if not parsed:
                        # One unrecognised mutation should not abort the whole output
                        self.log.printLog(
                            '#ERR', 'Cannot parse mutation "%s" (%s %s): not output' %
                            (mutation, gene, subid))
                        continue
                    (wild, pos, mut) = parsed
                    datadict = {
                        'OMIM_ID': '; '.join(self.dict['Records'][gene]),
                        'SubID': subid,
                        'Gene': gene,
                        'Pos': pos,
                        'WildAA': wild,
                        'MutAA': mut,
                        'Disease': disease
                    }
                    rje.delimitedFileOutput(self, outfile, headers, '\t',
                                            datadict)
            self.log.printLog('#OUT',
                              'OMIM Mutation output to %s complete' % outfile)
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
예제 #40
0
 def test(self): ### Development method
     '''
     Development method.
     Calls self.readGO() and self.mapEnsGO(), then writes one line per gene-GO pairing in self.dict['EnsGO'] to
     test.go.tdt, giving the gene, the GO ID and the 'type' and 'name' of that ID from self.dict['GO'].
     '''
     self.readGO()
     self.mapEnsGO()
     outfile = 'test.go.tdt'
     fields = ['EnsG','GO_ID','GO_Type','GO_Desc']
     rje.delimitedFileOutput(self,outfile,fields,rje_backup=True)     # Creates the file with headers
     ensgo = self.dict['EnsGO']
     (done,total) = (0.0,len(ensgo))
     for gene in rje.sortKeys(ensgo):
         self.progLog('\r#ENSGO','Compiling %s: %.2f%%' % (outfile,done/total))
         done += 100.0
         for goid in ensgo[gene]:
             row = {'EnsG':gene, 'GO_ID':goid}
             goterm = self.dict['GO'][goid]
             row['GO_Type'] = goterm['type']
             row['GO_Desc'] = goterm['name']
             rje.delimitedFileOutput(self,outfile,fields,datadict=row)
     self.printLog('\r#ENSGO','Compiling %s all done: %s genes.' % (outfile,rje.integerString(total)))
예제 #41
0
 def addEnsLoci(self):   ### Adds EnsLoci data to Gene Information
     '''
     Adds EnsLoci data to Gene Information.
     For each gene in self.dict['GeneCard']:
     - a missing or empty 'EnsEMBL' value is filled from self.dict['UniEns'] using the gene's 'UniProt' value;
     - if the 'EnsEMBL' value is in self.dict['EnsLoci'], the 'EnsLoci' and 'EnsDesc' values are added from
       self.dict['EnsLoci'] and self.dict['EnsDesc'].
     Nothing is done if self.dict['EnsLoci'] is empty.
     '''
     if not self.dict['EnsLoci']: return
     genecards = self.dict['GeneCard']
     ex = 0      # Number of genes with EnsLoci data added
     for gene in rje.sortKeys(genecards):
         card = genecards[gene]
         ## ~ Try to map EnsEMBL gene via UniProt if missing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'EnsEMBL' not in card or not card['EnsEMBL']:
             if 'UniProt' in card and card['UniProt'] in self.dict['UniEns']:
                 card['EnsEMBL'] = self.dict['UniEns'][card['UniProt']]
         ## ~ Add EnsLoci data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'EnsEMBL' in card and card['EnsEMBL'] in self.dict['EnsLoci']:
             ex += 1
             ensg = card['EnsEMBL']
             card['EnsLoci'] = self.dict['EnsLoci'][ensg]
             card['EnsDesc'] = self.dict['EnsDesc'][ensg]
         # EnsEMBL genes might be missing as they might be pseudogenes etc., so no error is reported for them.
         self.log.printLog('\r#ENS','Adding EnsLoci data: %d of %d genes' % (ex,len(genecards)),newline=False,log=False)
     self.log.printLog('\r#ENS','Added EnsLoci data for %d of %d genes' % (ex,len(genecards)))
예제 #42
0
 def makeGOGenes(self,gokey='EnsGO'):  ### Makes a dictionary of {GO:[Genes]}
     '''
     Makes a dictionary of {GO:[Genes]}.
     >> gokey:str ['EnsGO'] = Key of the {Gene:[GO IDs]} mapping in self.dict. The gene lists are stored under the
         same key in each self.dict['GO'][GO ID] dictionary, so the key cannot be one of the reserved GO data keys.
     Problems (including a reserved or missing gokey) are reported via self.errorLog() and not re-raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         reserved = ['name','is_a','part_of','type','child_terms']
         if gokey in reserved:
             self.errorLog('Cannot have "%s" as GOGenes key - reserved for GO' % gokey)
             raise ValueError
         if gokey not in self.dict:
             self.errorLog('"%s" mappings missing!' % gokey)
             raise ValueError
         ### ~ [2] ~ Process GO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         genego = self.dict[gokey]
         done = 0.0
         total = len(genego)
         newx = 0    # Number of GO IDs given a gene list
         for gene in rje.sortKeys(genego):
             self.progLog('\r#GENES','Making GO Gene lists: %.1f%%' % (done/total))
             done += 100.0
             for goid in genego[gene]:
                 goterm = self.dict['GO'][goid]
                 if gokey not in goterm:
                     goterm[gokey] = []
                     newx += 1
                 goterm[gokey].append(gene)
         self.printLog('\r#GENES','Making GO Gene lists complete: %s GO IDs with genes' % rje.integerString(newx))
     except:
         self.errorLog('Major problem with GO.makeGOGenes()')
예제 #43
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method. Builds three tables in self.db():
     - 'Code' = Codon to AA mappings from rje_sequence.genetic_code, indexed on AA.
     - 'Codons' = Per-sequence output of rje_sequence.codons() plus 'Len' (sum of the values). Saved to file.
     - 'NT' = Per-sequence output of rje_sequence.aaFreq() for C/A/G/U (T replaced with U) plus 'Len' and 'X|pos'
         fields, which sum the 'Codons' values by the base found at codon position 1, 2 and 3. Saved to file.
     If Basefile is blank/none, it is first set from the SeqList name and passed on to the DB object.
     << returns True if setup completed, else False (error is logged, not raised).
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = self.obj['SeqList']
         if self.getStr('Basefile').lower() in ['','none']:
             self.str['Basefile'] = rje.baseFile(seqlist.getStr('Name'))
             self.obj['DB'].setInfo({'Basefile':self.str['Basefile']})
         ## ~ [1a] Genetic Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         cdb = self.db().addEmptyTable('Code',['Codon','AA'],['Codon'])
         for codon in rje_sequence.genetic_code: cdb.addEntry({'Codon':codon,'AA':rje_sequence.genetic_code[codon]})
         cdb.index('AA')
         ### ~ [2] Calculate Codon Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         codons = rje.sortKeys(rje_sequence.genetic_code)
         db = self.db().addEmptyTable('Codons',['Seq','Len']+codons,['Seq'])
         sx = 0.0; seqx = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#COD','Calculating codon usage: %.2f%%' % (sx/seqx)); sx += 100.0
             entry = rje_sequence.codons(seq.getSequence(),{})
             entry['Len'] = sum(entry.values())      # Must be calculated before the non-numeric Seq field is added
             entry['Seq'] = seq.getStr('AccNum')
             db.addEntry(entry)
         self.printLog('\r#COD','Codon usage calculated for %s sequences' % rje.iStr(seqx))
         db.fillBlanks(blank=0,fillempty=True)       # Every codon field is needed for the position sums below
         db.saveToFile()
         ### ~ [3] Calculate NT Count Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         bases = ['C','A','G','U']
         nt = bases[0:]      # Field order: C,A,G,U then C|1,A|1,G|1,U|1,C|2,...
         for i in [1,2,3]:
             for n in bases: nt.append('%s|%d' % (n,i))
         ndb = self.db().addEmptyTable('NT',['Seq','Len']+nt,['Seq'])
         sx = 0.0; seqx = seqlist.seqNum()
         for seq in seqlist.seqs():
             self.progLog('\r#NT','Calculating NT Counts: %.2f%%' % (sx/seqx)); sx += 100.0
             entry = rje_sequence.aaFreq(string.replace(seq.getSequence(),'T','U'),{'C':0,'A':0,'G':0,'U':0},False)
             entry['Len'] = sum(entry.values())
             entry['Seq'] = seq.getStr('AccNum')
             centry = db.data(entry['Seq'])          # Codons table entry for the same sequence
             for i in [1,2,3]:
                 for n in bases: entry['%s|%d' % (n,i)] = 0
             for codon in codons:
                 for i in [1,2,3]:
                     n = codon[i-1]                  # Base at codon position i
                     entry['%s|%d' % (n,i)] += centry[codon]
             ndb.addEntry(entry)
         self.printLog('\r#NT','NT Counts calculated for %s sequences' % rje.iStr(seqx))
         ndb.saveToFile()
         return True     # Setup successful: previously fell through and returned None, which tests as failure
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
예제 #44
0
 def loadHHPID(self):    ### Load HHPID interactions
     '''
     Load HHPID interactions into an 'HHPID' table of self.db(), using the file named by the 'HHPID' string setting.
     Unwanted fields are dropped, the remaining HHPID fields are renamed, the number of entries per Interaction type
     is logged and the table is then filtered on the binds / complexes with / interacts with Interaction types.
     NOTE(review): inverse=True presumably means entries *not* matching those types are the ones dropped - confirm
     against rje_db dropEntriesDirect().
     << returns True once loaded. With no HHPID file set, returns the result of self.printLog(). Errors are logged
        with self.errorLog() and nothing is returned.
     '''
     try:### ~ [1] Setup HHPID Table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getStr('HHPID').lower() in ['','none']:
             return self.printLog('#HHPID','No HHPID file to load')
         hdb = self.db().addTable(self.getStr('HHPID'),mainkeys='auto',datakeys='All',name='HHPID')
         ## ~ [1a] Tidy up fields ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         dropfields = ['#Tax ID 1','Tax ID 2','product accession.version 2','last update timestamp']
         for field in dropfields: hdb.dropField(field)
         renames = [('Gene ID 1','EntrezHIV'),
                    ('product accession.version 1','AccHIV'),
                    ('product name 1','HIV'),
                    ('Interaction short phrase','Interaction'),
                    ('Gene ID 2','Entrez'),
                    ('product name 2','Description'),
                    ('PubMed ID (PMID) list','PMID')]
         for (oldname,newname) in renames: hdb.renameField(oldname,newname)
         ## ~ [1b] Report and filter interaction types ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         for itype in rje.sortKeys(hdb.index('Interaction')):
             inum = len(hdb.index('Interaction')[itype])
             self.printLog('#HHPID','%s => %s entries' % (itype, inum))
         keeptypes = ['binds','complexes with','interacts with']
         hdb.dropEntriesDirect('Interaction',keeptypes,inverse=True)
         return True
     except: self.errorLog('%s.loadHHPID error' % self)
예제 #45
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method. Loads hard-coded Pingu PPI and EnsEMBL sequence files and populates:
     - self.dict['PPI'] = {hub:[spokes]} after filtering. A spoke is kept if its evidence matches one of the types
         read from hybrid.txt, or if no other spoke of the hub (ignoring hub and spoke themselves) is also a
         partner of that spoke. Hubs left without any spokes are removed.
     - self.dict['Gene2Seq'] / self.dict['Seq2Gene'] = Spoke gene <-> SpokeSeq mappings (Seq2Gene uses the part of
         SpokeSeq before the first '__').
     - self.obj['SeqList'] and self.dict['SeqObj'] = Loaded sequences and seqNameDic('Max') lookup.
     NOTE(review): fullppi[spoke] is looked up directly, so this assumes every spoke is also a hub in the pairwise
     data (the logged ppi totals are halved, which suggests symmetrical data) - confirm.
     << returns True if setup completed, else False (error is logged, not raised).
     '''
     try:### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
         self.progLog('\r#PPI','Loading pairwise data...')
         pairwise = rje.dataDict(self,ppipairwise,['Hub','Spoke'],['Spoke','SpokeSeq','Evidence'])
         gene2seq = {}
         seq2gene = {}
         fullppi = {}        # {hub:{spoke:evidence}}
         ptot = len(pairwise)
         ppix = 0
         for (pi,pair) in enumerate(rje.sortKeys(pairwise)):
             self.progLog('\r#PPI','Processing full pairwise PPI: %.2f%%' % (100.0 * pi / ptot))
             [hub,spoke] = pair.split('\t')
             if spoke not in gene2seq:
                 sseq = pairwise[pair]['SpokeSeq']
                 gene2seq[spoke] = sseq
                 seq2gene[sseq.split('__')[0]] = spoke
             if hub not in fullppi: fullppi[hub] = {}
             if spoke not in fullppi[hub]:
                 fullppi[hub][spoke] = pairwise.pop(pair)['Evidence']   # Entry removed from pairwise once stored
                 ppix += 1
         self.printLog('\r#PPI','Processed full pairwise PPI: %s genes; %s ppi.' % (rje.integerString(len(fullppi)),rje.integerString(ppix/2)))
         ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
         goodppi = self.loadFromFile(goodppifile,chomplines=True)
         self.dict['PPI'] = {}
         ptot = len(fullppi)
         fppix = ppix        # Unfiltered total for final log message
         ppix = 0
         for (hi,hub) in enumerate(fullppi):
             self.progLog('\r#PPI','Filtering complexes: %.2f%% (%s hubs; %s ppi)' % (100.0 * hi / ptot,rje.integerString(len(self.dict['PPI'])),rje.integerString(ppix)))
             kept = self.dict['PPI'][hub] = []
             for spoke in fullppi[hub]:
                 evidence = fullppi[hub][spoke]
                 ## Accepted evidence type: keep without checking for shared partners ##
                 if any(rje.matchExp(':(%s)($|\|)' % ptype, evidence) for ptype in goodppi):
                     kept.append(spoke)
                     continue
                 ## Otherwise reject if any other spoke of this hub is also a partner of spoke ##
                 shared = any(other not in (hub,spoke) and other in fullppi[spoke] for other in fullppi[hub])
                 if not shared: kept.append(spoke)
             ppix += len(kept)
             if not kept: self.dict['PPI'].pop(hub)
         self.printLog('\r#PPI','Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' % (rje.integerString(len(fullppi)),rje.integerString(len(self.dict['PPI'])),rje.integerString(fppix/2),rje.integerString(ppix/2)))
         ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
         scmd = ['accnr=F','seqnr=F','seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
         seqlist = rje_seq.SeqList(self.log,scmd)
         self.obj['SeqList'] = seqlist
         self.dict['SeqObj'] = seqlist.seqNameDic('Max')
         self.dict['Gene2Seq'] = gene2seq
         self.dict['Seq2Gene'] = seq2gene
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
예제 #46
0
 def readSLiMSearch(self):   ### Reads SLiMSearch results into data dictionary
     '''
     Reads SLiMSearch results into data dictionary.
     Expects ResFile.summary.csv and ResFile.csv, where ResFile is self.info['ResFile']. Motifs whose summary N_Occ
     is at least self.stat['MinOcc'] are passed on to self.readSLiMSearchOcc().
     << returns the result of self.errorLog() if either file is missing. Otherwise nothing is returned and any error
        is logged, not raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         resfile = self.info['ResFile']
         sumfile = '%s.summary.csv' % resfile
         occfile = '%s.csv' % resfile
         if not os.path.exists(sumfile):
             return self.errorLog('No Summary file "%s"!' % sumfile,printerror=False)
         if not os.path.exists(occfile):
             return self.errorLog('No Occurrence file "%s"!' % occfile,printerror=False)
         ### ~ [2] Read Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         esum = rje.dataDict(self,sumfile,mainkeys=['Motif'],datakeys='All',getheaders=False)
         # List of motifs with enough occurrences, in sorted motif order
         occmotifs = [motif for motif in rje.sortKeys(esum)
                      if not string.atoi(esum[motif]['N_Occ']) < self.stat['MinOcc']]
         self.printLog('#MOTIF','%d motifs with N_Occ >= MinOcc (%d)' % (len(occmotifs),self.stat['MinOcc']))
         ### ~ [3] Read Occurrences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.readSLiMSearchOcc(occmotifs)
     except: self.log.errorLog(rje_zen.Zen().wisdom())
예제 #47
0
 def expectedCodonUsage(self):     ### Calculate expected codon usage from Frequency data
     '''
     Calculate expected codon usage from Frequency data.
     Reads the 'Code', 'Codons' and 'NT' tables of self.db() and generates and saves three further tables:
     - 'NTPos' = Copy of 'NT', reshaped on 'Pos' and compressed across sequences, with each entry then converted
         using rje.dictFreq().
     - 'Freq' = Per-sequence codon values rescaled within each amino acid (no. synonymous codons * codon / amino
         acid total, so equal usage of synonymous codons gives 1.0), combined with the sequence's NT entry.
     - 'Expected' = Copy of 'Codons' in which each codon value is replaced by the amino acid total apportioned
         according to the product of the 'NTPos' entry '3' values for the codon's three bases.
     Errors are caught and reported with self.errorLog(); nothing is returned.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # aacode is iterated below as {AA:[Codons]}
         aacode = self.db('Code').index('AA')
         nt = ['C','A','G','U']; codons = rje.sortKeys(rje_sequence.genetic_code)
         cdb = self.db('Codons'); ndb = self.db('NT')
         ## ~ [1a] Position-specific NT table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         nsumdb = self.db().copyTable(ndb,'NTPos',replace=True)
         nsumdb.dropField('Len')
         # Plain C/A/G/U fields become 'X|All' so that they match the 'X|pos' naming of the positional fields
         for n in ['C','A','G','U']: nsumdb.renameField(n,'%s|All' % n)
         # NOTE(review): presumably converts the 'X|pos' fields into one entry per 'Pos' value with C/A/G/U fields,
         # then sums these over all sequences - confirm against rje_db reshapeLong() and compress()
         nsumdb.reshapeLong('Pos',reshape=['C','A','G','U'])
         nsumdb.compress(['Pos'],{'Pos':'str','Seq':'str'},default='sum')
         nsumdb.dropField('Seq'); nsumdb.addField('Total')
         for entry in nsumdb.entries():
             # Non-NT fields are removed before rje.dictFreq() is applied; Pos is restored afterwards
             # NOTE(review): rje.dictFreq() presumably converts the values to frequencies in place and adds a
             # 'Total' key (a 'Total' key is popped after the dictFreq() call further down) - confirm
             pos = entry.pop('Pos'); entry.pop('Total')
             rje.dictFreq(entry)
             entry['Pos'] = pos
         nsumdb.saveToFile()
         # The entry with key '3' supplies the values used for all three bases of every codon in [2]
         # NOTE(review): presumably this is the third codon position - confirm
         nexentry = nsumdb.data('3')
         fdb = self.db().addEmptyTable('Freq',['Seq','Len']+nt+codons+['Total'],['Seq'])
         edb = self.db().copyTable(cdb,'Expected',replace=True)
         ### ~ [2] Calculate Frequencies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         x = 0.0; etot = cdb.entryNum()
         for oldentry in cdb.entries():
             self.progLog('\r#FREQ','Calculating Frequencies: %.2f%%' % (x/etot)); x += 100.0
             # NOTE(review): rje.combineDict({},X) is presumably used to make a copy, leaving the Codons and NT
             # table entries unchanged - confirm
             entry = rje.combineDict({},oldentry)
             # Codon 'Len' becomes 'Total' in the Freq entry; 'Len' is later taken from the NT entry
             seq = entry['Seq']; entry['Total'] = entry.pop('Len')
             # exentry values are reassigned below
             # NOTE(review): assumes data() returns the Expected table's own entry, so that this updates the table
             exentry = edb.data(seq)
             ntentry = rje.combineDict({},ndb.data()[seq])
             ntentry.pop('Seq'); ntentry.pop('Len')
             rje.dictFreq(ntentry)
             ntentry['Len'] = ntentry.pop('Total')
             for aa in aacode:
                 # ax = sum of codon values for this amino acid; ex = sum of the expected NT products
                 ax = 0.0; ex = 0.0
                 for codon in aacode[aa]:
                     ax += entry[codon]
                     exentry[codon] = nexentry[codon[0]] * nexentry[codon[1]] * nexentry[codon[2]]
                     ex += exentry[codon]
                 for codon in aacode[aa]:
                     # Rescale by number of synonymous codons: equal usage of each codon gives 1.0
                     if ax: entry[codon] = len(aacode[aa]) * entry[codon] / ax
                     else: entry[codon] = 0.0
                     # NOTE(review): ex is not checked, so an all-zero set of products would raise
                     # ZeroDivisionError here and end the method via the except below
                     exentry[codon] = ax * (exentry[codon] / ex)
             fdb.addEntry(rje.combineDict(entry,ntentry))
         self.printLog('\r#Freq','Frequencies calculated for %s entries' % rje.iStr(etot))
         fdb.saveToFile(); edb.saveToFile()
     except: self.errorLog('%s.expectedCodonUsage error' % self)
예제 #48
0
 def reduceGO(self):   ### Reduce GO terms to those with enough sequences
     '''
     Reduce GO terms to those with enough sequences.
     Each GO ID of the GO object (self.obj['GO']) whose 'EnsGO' gene list has fewer than self.stat['MinOcc'] or more
     than self.stat['MaxGenes'] genes is removed from the GO dictionary and from the GO list of each of its genes.
     Genes left without any GO IDs are removed from the mapping. A problem with a single GO ID is logged and the
     remaining IDs are still processed.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         goobj = self.obj['GO']
         (minocc,maxocc) = (self.stat['MinOcc'],self.stat['MaxGenes'])
         gokey = 'EnsGO'
         ### ~ [2] ~ Reduce GO ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for goid in rje.sortKeys(goobj.dict['GO']):
             try:
                 idgenes = goobj.getGOGenes(goid,gokey)
                 self.deBug('%s: %s' % (goid,idgenes))
                 if minocc <= len(idgenes) <= maxocc: continue   # Within limits: keep
                 goobj.dict['GO'].pop(goid)
                 for gene in idgenes:
                     genegolist = goobj.dict[gokey][gene]
                     genegolist.remove(goid)
                     if not genegolist: goobj.dict[gokey].pop(gene)  # Gene has no GO IDs left
             except: self.errorLog('GOER.reduceGO(%s) problem' % goid)
     except: self.errorLog(rje_zen.Zen().wisdom())
예제 #49
0
 def topTerms(self,slimx=20,parents=False,total='Total',countkey='counts'):  ### Selects top terms for GO slim set
     '''
     Selects top terms for GO slim set.
     Within each GO domain (cc/bp/mf), terms are added in batches of equal count, highest count first, until at
     least slimx terms are selected or the domain runs out of terms. Unless parents=True, any selected term that is
     a parent of another selected term is removed after each batch, which can trigger further batches.
     >> slimx:int [20] = Desired min. number of terms for each GO domain.
     >> parents:bool [False] = Whether parents and children both allowed in list
     >> total:str ['Total'] = Sample containing Total counts for assessment
     >> countkey:str ['counts'] = Key identifying count dictionary for each GO term and 'total' count sample
     - self.go(id)[countkey] = {Sample:count}
     << returns a list of GO IDs that meet criteria. Returns an empty list if an error occurs (error is logged).
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         terms = []                          # List of terms
         dom = {'cc':{},'bp':{},'mf':{}}     # Dictionary of {domain:{count:[IDs]}}
         for goid in self.go():
             goterm = self.go(goid)
             count = goterm[countkey][total]
             domain = goterm['type']
             if count not in dom[domain]: dom[domain][count] = [goid]
             else: dom[domain][count].append(goid)
         ### ~ [2] ~ Generate Top Terms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.deBug(dom)
         for domain in dom:
             dterms = []                     # Terms for this domain only
             # Lowest count selected for *this* domain. Reset for each domain so that a domain without terms is
             # reported as >= 0, rather than re-using a count from elsewhere (or failing if there are no terms).
             mincount = 0
             dkeys = rje.sortKeys(dom[domain])   # Counts: the highest remaining count is taken from the end
             (dx,dtot) = (0.0,len(dkeys))
             while dkeys and len(dterms) < slimx: # Keep looping
                 self.deBug('%s: %s' % (domain,dterms))
                 self.progLog('#TOP','Generating top %d %s terms: %.1f%%' % (slimx,domain,dx/dtot))
                 dx += 100.0
                 mincount = dkeys.pop()      # Remove from list
                 dterms += dom[domain][mincount]     # Add terms to term list
                 if parents: continue        # Don't care if parents and children all mixed up
                 for goid in dterms[0:]:     # Loop over a copy as dterms is modified
                     if goid not in dterms: continue             # Previously-removed parent
                     for par in self.parents(goid):              # Check all parents
                         if par in dterms: dterms.remove(par)    # Remove parent term
             self.printLog('\r#TOP','Identified %s top %s terms: >= %s genes' % (rje.iLen(dterms),domain,rje.iStr(mincount)))
             terms += dterms                 # Found a stable list of terms
         self.deBug(terms)
         return terms
     except: self.errorLog('Major problem with GO.topTerms()')
     return []
예제 #50
0
 def output(self, seq, outfile, occdata=None):  ### Output to file
     '''
     Output SeqPlot data for a single sequence to file.
     >> seq:Sequence object = Supplies the name, ungapped sequence, description, PlotStat dictionary and (optional)
         Entry object, whose features are output as Regions if their Type is in self.list['PlotFT'].
     >> outfile:str = Name of output file. The text before the final '.' is written as the 'Output' line.
     >> occdata:list [None] = Motif occurrence dictionaries with 'Pattern', 'Start_Pos' and 'End_Pos' keys.
         None or an empty list means no occurrences.
     Nothing is written if self.opt['OccOnly'] is set and there are no occurrences. Errors are logged, not raised.
     '''
     try:  ### ~ [1] ~ Basic Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.opt['OccOnly'] and not occdata: return
         odata = [
             'Name\t%s' % (seq.shortName()),
             'Sequence\t%s' % (seq.getSequence(gaps=False)),
             'Output\t%s' %
             (string.join(string.split(outfile, '.')[:-1], '.')),
             'RE\t%s' % (string.join(self.list['PlotRE'], ',')),
             'TrueELMs\tY',
             'Description\t%s' % (seq.info['Description']),
             '',
         ]
         ### ~ [2] ~ PlotStats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for plot in rje.sortKeys(seq.dict['PlotStat']):
             odata.append(
                 'Plot\t%s\t%s' %
                 (plot, string.join(seq.dict['PlotStat'][plot], ', ')))
         if seq.dict['PlotStat']: odata.append('')
         ### ~ [3] ~ PlotFT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if seq.obj['Entry']:
             for ft in seq.obj['Entry'].list['Feature']:
                 if ft['Type'] in self.list['PlotFT']:
                     odata.append(
                         'Region\t%s %s\t%s:%s' %
                         (ft['Type'], ft['Desc'], ft['Start'], ft['End']))
             odata.append('')    # Blank line is added whenever there is an Entry, even without matching features
         ### ~ [4] ~ MotifOcc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if occdata:
             for occ in occdata:
                 odata.append(
                     'Motif\t%s\t%s:%s' %
                     (occ['Pattern'], occ['Start_Pos'], occ['End_Pos']))
         ### ~ [5] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         OUT = open(outfile, 'w')
         try: OUT.write(string.join(odata, '\n'))
         finally: OUT.close()    # Always release the file handle, even if the write fails
         self.printLog('#PLOT', 'SeqPlot output saved as %s' % (outfile))
     except:
         self.errorLog(rje_zen.Zen().wisdom())
예제 #51
0
 def checkSeqList(self):  ### Check sequence integrity
     '''
     Check sequence integrity.
     Identifies the keys of self.dict['PPI'] and the members of each pillar in self.list['Pillars'] that are missing
     from self.dict['SeqDict']. The missing IDs are counted in the log and written, one per line, to
     yeast.ppi.missing.txt and yeast.pillar.missing.txt (relative paths, overwritten on each call).
     Errors are logged and then re-raised.
     '''
     def saveMissing(filename, idlist):  ### Writes IDs to file, one per line
         OUT = open(filename, 'w')
         try: OUT.write(string.join(idlist, '\n'))
         finally: OUT.close()    # Always release the file handle, even if the write fails
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqdict = self.dict['SeqDict']
         ### ~ [2] Check PPI data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         missing = []
         self.progLog('\r#MISS',
                      'Checking PPI IDs: %d missing' % len(missing))
         for p in rje.sortKeys(self.dict['PPI']):
             if p not in seqdict:
                 missing.append(p)
                 self.progLog('\r#MISS',
                              'Checking PPI IDs: %d missing' % len(missing))
         self.printLog(
             '\r#MISS',
             'Checking PPI IDs complete: %d missing' % len(missing))
         saveMissing('yeast.ppi.missing.txt', missing)
         ### ~ [3] Check Pillar data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         missing = []
         self.progLog('\r#MISS',
                      'Checking Pillar IDs: %d missing' % len(missing))
         for pillar in self.list['Pillars']:
             for p in pillar:
                 if p not in seqdict:
                     missing.append(p)
                     self.progLog(
                         '\r#MISS',
                         'Checking Pillar IDs: %d missing' % len(missing))
         self.printLog(
             '\r#MISS',
             'Checking Pillar IDs complete: %d missing' % len(missing))
         saveMissing('yeast.pillar.missing.txt', missing)
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible