Пример #1
0
 def sgd2sp(self):  ### Reformats yeast sequence names and outputs new data for GOPHER
     '''
     Reformats yeast sequence names and outputs new data for GOPHER.

     Reads the UniProt entries for the accession numbers indexed in the 'XRef' table (from <Basefile>.dat if it
     exists and Force is off, otherwise by extraction, which is then saved to that file). Every YEAST sequence in
     self.obj['SeqList'] is then renamed using its XRef entry, taking the UniProt ID/AccNum as its new first name
     field when the UniProt sequence is identical.

     Side effects:
     - self.dict['Rename'] = {SGD AccNum: UniProt AccNum} for sequences matched to an identical UniProt sequence.
     - self.list['YeastSeq'] = accession list for the YEAST sequences.
     - <Basefile>.ygob.fas (all sequences) and <Basefile>.yeast.fas (YEAST only) are saved if not already present.

     Any error is logged and then re-raised.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         inseq = self.obj['SeqList']
         uni = rje_uniprot.UniProt(self.log, self.cmd_list + ['datout=None'])
         xref = self.db('XRef')
         self.dict['Rename'] = {}
         basefile = self.info['Basefile']
         ## ~ [1a] ~ Check or Make UniProt extraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         ufile = '%s.dat' % basefile
         if os.path.exists(ufile) and not self.opt['Force']:
             uni.readUniProt(ufile, clear=True, cleardata=False)
         else:
             uni.readUniProt(clear=True,
                             acclist=rje.sortKeys(xref.index('UniProt')),
                             cleardata=False)
             uni.saveUniProt(ufile)
         ## ~ [1b] ~ Make dictionary of UniProt sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniseq = {}  # {UniProt AccNum: sequence object}
         for entry in uni.entries():
             seq = entry.obj['Sequence']
             uniseq[seq.info['AccNum']] = seq
         self.printLog(
             '\r#USEQ',
             '%s UniProt Sequences extracted (%s Ensembl AccNum)' %
             (rje.iStr(len(uniseq)), rje.iStr(len(xref.index('UniProt')))))
         ### ~ [2] ~ Reformat sequences and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         yseq = []  # List of YEAST sequence objects
         # sx advances by 100 per sequence so that sx / stot is already a percentage
         (sx, stot) = (0.0, inseq.seqNum())
         for seq in inseq.seqs():
             self.progLog(
                 '\r#SEQ',
                 'Reformatting sequence names: %.2f%%' % (sx / stot))
             sx += 100.0
             if seq.info['SpecCode'] != 'YEAST': continue
             yseq.append(seq)
             sgd = seq.info['AccNum']
             newname = seq.info['Name']
             try:
                 for x in xref.indexEntries('EnsG', sgd):
                     acc = x['UniProt']
                     if not acc:  # No UniProt mapping: annotate only and try the next XRef entry
                         newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:-]' % (
                             seq.info['Name'], x['Gene'], x['EnsG'],
                             x['SGD'])
                         continue
                     newname = '%s [Gene:%s EnsG:%s SGD:%s AccNum:%s]' % (
                         seq.info['Name'], x['Gene'], x['EnsG'], x['SGD'],
                         acc)
                     if acc not in uniseq:
                         self.printLog(
                             '\r#UNIERR',
                             'Unable to find UniProt sequence %s (%s)' %
                             (acc, sgd))
                         continue
                     useq = uniseq[acc]
                     if useq.info['Sequence'] != seq.info['Sequence']:
                         self.printLog(
                             '\r#SEQERR',
                             '%s sequence <> %s sequence' % (sgd, acc))
                         continue
                     # Identical UniProt sequence: swap the first name field for ID__AccNum and stop looking.
                     # str methods replace the deprecated string.split()/string.join() module functions.
                     nsplit = newname.split()
                     nsplit[0] = '%s__%s' % (x['UniprotID'], acc)
                     newname = ' '.join(nsplit)
                     self.dict['Rename'][sgd] = acc
                     break
             except:
                 # A problem with one sequence is logged; whatever newname was reached is still applied below
                 self.errorLog('%s problem' % sgd)
             seq.info['Name'] = newname
             seq.extractDetails(gnspacc=True)
         self.printLog('\r#SEQ', 'Reformatting sequence names complete.')
         ## ~ [2a] ~ Save renamed sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         ygobfas = '%s.ygob.fas' % basefile
         yeastfas = '%s.yeast.fas' % basefile
         if not rje.exists(ygobfas):
             inseq.saveFasta(seqfile=ygobfas)
         if not rje.exists(yeastfas):
             inseq.saveFasta(seqs=yseq, seqfile=yeastfas)
         self.list['YeastSeq'] = inseq.accList(yseq)
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Пример #2
0
    def setup(self):  ### Main class setup method.
        '''
        Main class setup method. Creates the object for the program named by the 'Name' setting.
        The object is stored in self.obj['Prog'] and its program information in self.obj['ProgInfo'].
        << returns the program object if setup was successful, otherwise False.
        '''
        try:  ### ~ [1] ~ Setup Program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.obj['Prog'] = None
            prog = self.getStrLC('Name')
            if prog in mod:
                proginfo = mod[prog].makeInfo()
                self.obj['ProgInfo'] = proginfo
                self.printLog(
                    '#PROG', '%s V%s: %s' %
                    (proginfo.program, proginfo.version, proginfo.description))
                progcmd = rje.getCmdList([], info=proginfo) + self.cmd_list + ['newlog=F']
                out = rje.Out(cmd_list=progcmd)
                out.printIntro(proginfo)
                if self.getBool('Help'):
                    progcmd = mod[prog].cmdHelp(proginfo, out, ['help'] + progcmd)
                nopath = self.getStrLC('Rest') and not self.dev()
                tidycmd = rje.tidyArgs(progcmd, nopath=nopath, purgelist=purgelist)
                self.printLog('#CMD',
                              'Full %s CmdList: %s' % (proginfo.program, rje.argString(tidycmd)),
                              screen=False)
                ## ~ [1a] ~ Make self.obj['Prog'] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Each row is (recognised program names, function returning the class to instantiate).
                # The lambdas delay every module lookup until its row has actually been selected.
                progclasses = (
                    (('seqlist', 'rje_seqlist'), lambda: rje_seqlist.SeqList),
                    (('uniprot', 'rje_uniprot'), lambda: rje_uniprot.UniProt),
                    (('taxonomy', 'rje_taxonomy'), lambda: rje_taxonomy.Taxonomy),
                    (('tree', 'rje_tree'), lambda: rje_tree.Tree),
                    (('xref', 'rje_xref'), lambda: rje_xref.XRef),
                    (('seq', 'rje_seq'), lambda: rje_seq.SeqList),
                    (('mitab', 'rje_mitab'), lambda: rje_mitab.MITAB),
                    (('dbase', 'database'), lambda: rje_dbase.DatabaseController),
                    (('pydocs',), lambda: rje_pydocs.PyDoc),
                    (('ensembl', 'rje_ensembl'), lambda: rje_ensembl.EnsEMBL),
                    (('genbank', 'rje_genbank'), lambda: rje_genbank.GenBank),
                    (('extatic',), lambda: extatic.ExTATIC),
                    (('revert',), lambda: revert.REVERT),
                    (('fiesta',), lambda: fiesta.FIESTA),
                    (('gablam',), lambda: gablam.GABLAM),
                    (('gopher',), lambda: gopher.Gopher),
                    (('haqesac',), lambda: haqesac.HAQESAC),
                    (('multihaq',), lambda: multihaq.MultiHAQ),
                    (('pingu',), lambda: pingu.PINGU),
                    (('pacbio',), lambda: rje_pacbio.PacBio),
                    (('rje_zen', 'zen'), lambda: rje_zen.Zen),
                )
                for prognames, getclass in progclasses:
                    if prog in prognames:
                        self.obj['Prog'] = getclass()(self.log, progcmd)
                        break

            ### ~ [2] ~ Failure to recognise program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.obj['Prog']:
                return self.obj['Prog']  # Setup successful
            self.printLog(
                '#ERR', 'Program "%s" not recognised.' % self.getStr('Name'))
            if self.i() < 0:
                return False
            if rje.yesNo('Show SeqSuite help with program options?'):
                extracmd = cmdHelp(cmd_list=['help'])[1:]
                if extracmd:
                    self.cmd_list += extracmd
                    self._cmdList()
                    # Retry straight away if the extra commands changed the program name
                    if prog != self.getStrLC('Name'):
                        return self.setup()
            newname = rje.choice('Give program name (Blank or CTRL+C to quit)')
            self.setStr({'Name': newname})
            if not self.getStrLC('Name'):
                return False
            return self.setup()
        except KeyboardInterrupt:
            return False
        except SystemExit:
            raise
        except:
            self.errorLog('Problem during %s setup.' % self.prog())
            return False  # Setup failed
Пример #3
0
 def addRealUniProt(self, seq, udata, ftlist):  ### Updates feature list ft using real UniProt where possible and makes NR
     '''
     Updates feature list ft using real UniProt where possible and makes NR.
     >> seq:Sequence object = target of UniFake
     >> udata:UniProt Entry Data dictionary *Modified in place*
     >> ftlist:list of feature dictionaries to add to (and make NR) *Modified in place*

     Returns None. If self.list['UniReal'] is empty the method returns immediately and ftlist is left untouched
     (it is not made NR either). UniProt entries are read using the sequence short name, ID and AccNum. Data and
     features are only taken from them when 'uniprot' is in self.list['UniFake']. Errors go to self.errorLog().
     '''
     try:  ### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.list['UniReal']: return
         realuni = rje_uniprot.UniProt(self.log,
                                       self.cmd_list + ['datout=None'])
         realuni.readUniProt(
             clear=True,
             acclist=[seq.shortName(), seq.info['ID'], seq.info['AccNum']],
             cleardata=False)
         if not realuni.list['Entry']:
             self.printLog('#UNI',
                           'No Real AccNum for %s' % seq.shortName())
         sequence = seq.info['Sequence'][0:]
         seqlen = len(sequence)
         ### ~ [1] ~ Map and Add Features from actual UniProt entry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for entry in realuni.list['Entry']:
             if 'uniprot' not in self.list['UniFake']: break
             ## ~ [1a] ~ Merge selected data keys: real UniProt data goes in front of any existing data ~~~~~~~~ ##
             realdata = entry.dict['Data']
             for key in self.list['UniReal']:  #['AC','GN','RC','RX','CC','DR','PE','KW']:
                 # "in" replaces dict.has_key(), which no longer exists in Python 3
                 if key not in realdata: continue
                 if key in udata:
                     udata[key] = realdata[key] + udata[key]
                 else:
                     udata[key] = realdata[key][0:]
             ## ~ [1b] ~ Map features onto the target sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for ft in entry.list['Feature'][0:]:
                 ft_start = ft['Start']
                 ft_end = ft['End']
                 ft_seq = entry.obj['Sequence'].info['Sequence'][ft_start - 1:ft_end]
                 if ft_seq == sequence[ft_start - 1:ft_end]:  # Same residues at the same position
                     ftlist.append(ft)
                     continue
                 if not self.opt['FudgeFT']: continue
                 # Look for the feature sequence at increasing offsets either side of its original position.
                 # fudge ends up as the signed shift to apply, or 0 if both directions ran off the sequence.
                 fudge = 1
                 while fudge:
                     if ft_start - fudge < 1 and ft_end + fudge > seqlen:
                         fudge = 0
                         break
                     if ft_start - fudge >= 1 and ft_seq == sequence[
                             ft_start - 1 - fudge:ft_end - fudge]:
                         fudge = -fudge
                         break
                     if ft_end + fudge <= seqlen and ft_seq == sequence[
                             ft_start - 1 + fudge:ft_end + fudge]:
                         break
                     fudge += 1
                 if fudge:
                     ft['Start'] = ft_start + fudge
                     ft['End'] = ft_end + fudge
                     ftlist.append(ft)
         ### ~ [2] ~ Make FT list NR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Duplicates are popped from the front, so the last copy of each feature is the one that remains.
         # Feature dictionaries are not hashable, hence list.count() rather than a set.
         i = 0
         while i < len(ftlist):
             if ftlist.count(ftlist[i]) > 1: ftlist.pop(i)
             else: i += 1
     except:
         self.errorLog('UniFake.addRealUniProt error [%s]' %
                       seq.shortName())
Пример #4
0
 def uniFake(self, seqs=[], store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
     '''
     Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.
     >> seqs:list [] = Sequence objects to process. If given, they replace the sequences of self.obj['SeqList'].
         (The default list is only ever read, never modified, so sharing it between calls is harmless.)
     >> store:bool [False] = If True, the UniProt object is stored in seqlist.obj['UniProt'], keeps every entry and
         is saved once at the end. If False, each entry is appended to the output file as soon as it is made.

     For each sequence a temporary fasta file is written and, depending on the contents of self.list['UniFake'],
     disorder, PFam HMM, TMHMM and SignalP features are added before self.addRealUniProt() is called and the
     entry is converted to UniProt format. Problems with a single sequence are logged and processing moves on.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Lower-cased list of requested methods (used for the pfam, tmhmm and signalp tests below)
         unifake = ' '.join(self.list['UniFake']).lower().split()
         seqlist = self.obj['SeqList']
         if seqs: seqlist.seq = seqs
         else: seqs = seqlist.seq
         (sx, seqnum) = (0, seqlist.seqNum())
         ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = rje_uniprot.UniProt(self.log, self.cmd_list)  # UniProt object for saving data
         if self.info['DatOut'].lower() in ['', 'none']:
             self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
         datfile = self.info['DatOut']
         if os.path.exists(datfile): rje.backup(self, datfile)
         if store: seqlist.obj['UniProt'] = uniprot
         ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'pfam' in unifake:
             hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T'])
             hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
             if os.path.exists(hmmfile): rje.backup(self, hmmfile)
             hmm.list['HMM'] = [self.info['PFam']]
             hmm.opt['HMMPFam'] = True
         else:
             hmm = None
         ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'signalp' in unifake: tm = rje_tm.TM(self.log, self.cmd_list)
         else: tm = None
         ## ~ [1e] Pattern for EnsDat sequence names (raw string: same pattern, no invalid escapes) ~~~~~~~~~~~ ##
         ensdat_re = r'\[acc:(\S+) pep:(\S+) gene:(\S+)\]'
         ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in seqs:
             sx += 1
             name = seq.shortName()
             self.printLog(
                 '#SEQ', 'Processing %s (%s aa) %s...' %
                 (seq.shortName(), rje.integerString(seq.aaLen()),
                  seq.info['Description'][:50]))
             # Reset for every sequence so that the cleanup below never sees an unbound name or the
             # previous sequence's prefix if this sequence fails before its temp files are named.
             utmp = None
             try:
                 ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 utmp = 'tmp%s.%s' % (rje.randomString(5), seq.info['AccNum'])
                 # Close the file explicitly: the external programs below read it straight from disk
                 with open('%s.fas' % utmp, 'w') as fasfile:
                     fasfile.write('>%s\n%s\n' % (seq.shortName(), seq.info['Sequence']))
                 udata = {
                     'CC': ['-!- Features generated using unifake.py'],
                     'AC': []
                 }
                 if seq.info['SpecCode'] in ['Unknown', 'UNK']:
                     seq.info['SpecCode'] = self.info['SPCode']
                 #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]     #!# Check how well this works. Add spectable? #!#
                 ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 details = self.opt['EnsDat'] and rje.matchExp(ensdat_re, seq.info['Name'])
                 if details:
                     for alias in details[0:3]:  # acc, pep and gene
                         self.addAlias(seq.info['AccNum'], alias)
                     udata['GN'] = [details[2]]
                 for seqkey in [seq.shortName(), seq.info['AccNum']]:
                     if seqkey in self.dict['Aliases']:
                         udata['AC'].append('%s;' % '; '.join(self.dict['Aliases'][seqkey]))
                 ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ft = []  # List of features for sequence
                 for seqkey in [seq.shortName(), seq.info['AccNum'], seq.info['ID']]:
                     if seqkey in self.dict['Features']:
                         ft += self.dict['Features'][seqkey]
                 ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 # NOTE(review): this test (and the one in [2e]) uses the raw self.list['UniFake'], not the
                 # lower-cased unifake list used for pfam/tmhmm/signalp - confirm the case-sensitivity is intended.
                 if 'disorder' in self.list['UniFake']:
                     try:
                         seq.disorder()
                         dis = seq.obj['Disorder']
                         for region in dis.list['RegionDisorder']:
                             ft.append({
                                 'Type': 'DISORDER',
                                 'Desc': 'Predicted disorder: %s' % dis.info['Disorder'],
                                 'Start': region[0],
                                 'End': region[1]
                             })
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s > %.2f' % (
                                     ft[-1]['Desc'], dis.stat['IUCut'])
                         for fold in dis.list['RegionFold']:
                             ft.append({
                                 'Type': 'ORDER',
                                 'Desc': 'Predicted order: %s' % dis.info['Disorder'],
                                 'Start': fold[0],
                                 'End': fold[1]
                             })
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s <= %.2f' % (
                                     ft[-1]['Desc'], dis.stat['IUCut'])
                     except:
                         self.log.errorLog(
                             'UniFake disorder problem for %s.' % name)
                 ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if hmm:
                     try:
                         hmm.setInfo({
                             'SearchDB': '%s.fas' % utmp,
                             'HMMOut': '%s.hmm.out' % utmp
                         })  # This will be made for each sequence
                         hmm.search = []
                         hmm.list['HMMRes'] = [
                             hmm.hmmSearch(self.info['PFam'],
                                           outfile=hmm.info['HMMOut'])
                         ]  # Used in hmmTable
                         hmm.hmmTable(outfile=hmmfile, append=True)
                         if 'disorder' in self.list['UniFake']:
                             # individual (IUPRed) residue results
                             resdisorder = seq.obj['Disorder'].list['ResidueDisorder']
                         else:
                             resdisorder = []
                         if hmm.search:
                             udata['CC'].append(
                                 'PFam: HMMer PFam search vs %s (Modified %s)' %
                                 (self.info['PFam'],
                                  time.ctime(os.path.getmtime(self.info['PFam']))))
                         else:
                             udata['CC'].append(
                                 '-!- ERROR: PFam HMMer Search failure!')
                             out = {'Type': '!ERROR!', 'Name': name}
                             rje.delimitedFileOutput(
                                 self,
                                 hmmfile,
                                 ['Type', 'Name', 'Start', 'End', 'Eval', 'Score'],
                                 datadict=out)
                         for search in hmm.search:
                             for hit in search.hit:
                                 for aln in hit.aln:
                                     pfamft = {
                                         'Start': aln.stat['SbjStart'],
                                         'End': aln.stat['SbjEnd'],
                                         'Type': 'PFAM',
                                         'Desc': '%s PFam HMM Eval: %.2e; Score: %.1f' %
                                                 (search.info['Name'],
                                                  aln.stat['Expect'],
                                                  aln.stat['BitScore'])
                                     }
                                     if resdisorder:
                                         # Mean residue disorder over the hit decides PFAM versus DOMAIN
                                         region = resdisorder[aln.stat['SbjStart'] - 1:aln.stat['SbjEnd']]
                                         hmmdisorder = float(sum(region)) / len(region)
                                         pfamft['Desc'] = '%s; IUPRed: %.2f' % (
                                             pfamft['Desc'], hmmdisorder)
                                         if hmmdisorder < self.stat['DisDom']:
                                             pfamft['Type'] = 'DOMAIN'
                                     ft.append(pfamft)
                     except:
                         self.log.errorLog(
                             'UniFake PFam HMM problem for %s.' % name)
                 ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'tmhmm' in unifake:
                     try:
                         tmdat = os.popen(
                             '%s %s.fas -short' %
                             (self.info['TMHMM'], utmp)).readlines()
                         domlist = rje_tm.domainList(
                             rje_tm.parseTMHMM(tmdat[0]))
                         for tmdom in domlist:
                             ft.append(tmdom)
                             ft[-1]['Desc'] = 'TMHMM topology prediction'
                             ft[-1]['Start'] = int(ft[-1]['Start'])
                             ft[-1]['End'] = int(ft[-1]['End'])
                         if len(domlist) > 1:
                             udata['CC'].append(
                                 'TMHMM: %d TM domains; N-Term %s' %
                                 ((len(domlist) - 1) // 2,
                                  domlist[0]['Type']))
                         else:
                             udata['CC'].append('TMHMM: 0 TM domains')
                     except:
                         self.log.errorLog('UniFake TMHMM problem for %s.' %
                                           name)
                 ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'signalp' in unifake:
                     try:
                         os.system(
                             '%s -f short -t euk %s.fas > %s.signalp' %
                             (self.info['SignalP'], utmp, utmp))
                         tm.signalp = {}
                         tm.parseSignalP('%s.signalp' % utmp)
                         sigp = tm.signalp.pop(seq.shortName())
                         # cpos = predicted cleavage position: the smaller of the NN and HMM positions
                         # when both predict a signal peptide, 0 when neither does.
                         cpos = 0
                         if sigp['nn_ymax?'] == 'Y':
                             cpos = int(sigp['nn_ymaxpos'])
                             desc = 'SignalP NN prediction'
                         if sigp['hmm_cmax?'] == 'Y':
                             hmm_c = int(sigp['hmm_cmaxpos'])
                             if cpos == 0:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction'
                             elif hmm_c < cpos:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction (NN also Y)'
                             else:
                                 desc += ' (HMM also Y)'
                         if cpos > 0:
                             ft.append({
                                 'Type': 'SIGNALP',
                                 'Desc': desc,
                                 'Start': 1,
                                 'End': cpos
                             })
                     except:
                         self.log.errorLog(
                             'UniFake SignalP problem for %s.' % name)
                 ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.addRealUniProt(seq, udata, ft)
                 self.deBug(ft)
                 if not store: uniprot.list['Entry'] = []
                 if uniprot.addFromSeq(seq, data=udata, ft=ft):  ### Converts into UniProtEntry object
                     if not store: uniprot.saveUniProt(datfile, append=True)
                     #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
             except:
                 self.log.errorLog('Problem during UniFake(%s)' % name)
             ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if utmp:
                 for tmp in glob.glob('%s*' % utmp):
                     os.unlink(tmp)
             self.printLog(
                 '#UNIFAKE',
                 '|---------- %s run <<<|>>> %s to go -----------|' %
                 (rje.integerString(sx), rje.integerString(seqnum - sx)),
                 log=False)
         if store: uniprot.saveUniProt(datfile, append=False)
         if self.opt['CleanUp']:
             for tmp in glob.glob('TMHMM*'):
                 if os.path.isdir(tmp): os.rmdir(tmp)
     except:
         self.errorLog(
             'Oh, the shame of it! Trouble during UniFake.uniFake()')
Пример #5
0
 def setup(self):  ### Loads data into attributes.
     '''
     Loads data into attributes.

     - self.obj['UniProt'] = UniProt object read from the command-line settings.
     - self.dict['Entry'] / self.dict['Seq'] = {sequence short name: UniProt entry (or None) / sequence object}.
     - self.dict['Domain'] = {domain: [short names]}, taken from the first word of the description of features
       whose type is in self.list['DomFT'] (UniProt data only).
     - self.dict['PPI'] = {EnsLoci: [EnsLoci of interactors]} and self.dict['Gene'] = {EnsLoci: hub}, read from
       the file in self.info['PPI'].
     - self.dict['DDI'] = {domain: [interacting domains]} (symmetrical), if self.info['DDI'] is given.
     - self.dict['Fam'] = {short name: [family members, always including itself]}, if self.info['Fam'] is given.

     Errors are passed to self.errorLog() with the option to quit.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ UniProt Object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = self.obj['UniProt'] = rje_uniprot.UniProt(
             self.log, self.cmd_list)
         uniprot.readUniProt()
         if uniprot.entryNum() > 0:  ### UniProt data loaded. Populate seqlist and domain dictionary.
             # NOTE(review): seqlist is filled here but never stored on self - confirm whether it is needed
             seqlist = rje_seq.SeqList(self.log,
                                       self.cmd_list + ['autoload=F'])
             for entry in uniprot.list['Entry']:
                 seq = entry.obj['Sequence']
                 seqlist.seq.append(seq)
                 name = seq.shortName()
                 self.dict['Entry'][name] = entry
                 self.dict['Seq'][name] = seq
                 for ft in entry.list['Feature']:
                     if ft['Type'] not in self.list['DomFT']: continue
                     try:
                         dom = ft['Desc'].split()[0]  # Domain name = first word of the description
                         if dom not in self.dict['Domain']:
                             self.dict['Domain'][dom] = []
                         if name not in self.dict['Domain'][dom]:
                             self.dict['Domain'][dom].append(name)
                     except:
                         self.errorLog('Trouble with %s feature %s' %
                                       (name, ft))
         ## ~ [1b] ~ SeqList only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             seqlist = rje_seq.SeqList(self.log, self.cmd_list)
             for seq in seqlist.seq:
                 name = seq.shortName()
                 self.dict['Entry'][name] = None
                 self.dict['Seq'][name] = seq
                 #!# Consider adding loading domains from a table #!#
         ## ~ [1c] ~ Add PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # Dictionary of ShortName-centred PPI. Created if missing; any existing contents are kept.
         self.dict.setdefault('PPI', {})
         ppi = rje.dataDict(self, self.info['PPI'])
         for hub in ppi:
             ens = ppi[hub]['EnsLoci']
             if ens == '-': continue
             if ens not in self.dict['PPI']: self.dict['PPI'][ens] = []
             self.dict['Gene'][ens] = hub
             for gene in ppi[hub]['PPI'].split(','):
                 # A partner with no row of its own (including the empty name that a blank PPI field
                 # splits into) has no EnsLoci to add: skip it rather than abort the whole setup.
                 if gene not in ppi: continue
                 partner = ppi[gene]['EnsLoci']
                 if partner == '-': continue
                 if partner not in self.dict['PPI'][ens]:
                     self.dict['PPI'][ens].append(partner)
         ## ~ [1d] ~ Add DDI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['DDI'] = {}
         if self.info['DDI'].lower() not in ['', 'none']:
             data = rje.dataDict(self,
                                 self.info['DDI'],
                                 mainkeys=['Name1'],
                                 datakeys=['Name2'],
                                 headers=[
                                     'Pfam1', 'Pfam2', 'Name1', 'Name2',
                                     'Acc1', 'Acc2', 'Code1', 'Code2'
                                 ],
                                 lists=True)
             ## ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
             # dx advances by 100 per domain so that dx / dtot is already a percentage
             (dx, dtot) = (0.0, len(data))
             self.deBug(data)
             for p1 in rje.sortKeys(data):
                 self.progLog(
                     '\r#DDI',
                     'Parsing DDI from iPFam: %.1f%%' % (dx / dtot))
                 dx += 100.0
                 if p1 not in self.dict['DDI']: self.dict['DDI'][p1] = []
                 for p2 in data[p1]['Name2']:
                     # Interactions are stored in both directions
                     if p2 not in self.dict['DDI']:
                         self.dict['DDI'][p2] = []
                     if p2 not in self.dict['DDI'][p1]:
                         self.dict['DDI'][p1].append(p2)
                     if p1 not in self.dict['DDI'][p2]:
                         self.dict['DDI'][p2].append(p1)
             self.printLog(
                 '\r#DDI', 'Parsing DDI from iPFam: %s domains' %
                 (rje.integerString(dtot)))
         ## ~ [1e] ~ Family data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['Fam'] = {}
         if self.info['Fam'].lower() not in ['', 'none']:
             data = rje.dataDict(self,
                                 self.info['Fam'],
                                 mainkeys=['Qry'],
                                 datakeys=['Hit'],
                                 lists=True)
             for qry in self.dict['Seq']:
                 # Look the sequence up by short name first, then by accession number
                 self.dict['Fam'][qry] = []
                 if qry in data: self.dict['Fam'][qry] = data[qry]['Hit']
                 elif self.dict['Seq'][qry].info['AccNum'] in data:
                     self.dict['Fam'][qry] = data[
                         self.dict['Seq'][qry].info['AccNum']]['Hit']
                 if qry not in self.dict['Fam'][qry]:
                     self.dict['Fam'][qry].append(qry)
     except:
         self.errorLog('Problem with SLiMPID.setup()', quitchoice=True)
Пример #6
0
    def readPELM(
        self
    ):  ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
        '''
        Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.

        Steps performed:
        1. Reads the table named by self.info['PELM'] with rje.dataDict(), keyed on 'acc' and 'position'.
        2. Fills self.dict['PhosphoSites'] as {acc:{pos:{'aa':code}}}, logging '#ERR' warnings when the
           sequence, the stored residue or the residue found at that position in the sequence disagree.
        3. Makes one sequence object per accession (named from UniProt where an entry is found), optionally
           filters them (self.opt['FilterSeq']) and saves them to self.info['PELMFas'].
        4. Stores the sequence list in self.obj['SeqList'] and the UniProt object in self.obj['UniProt'].

        Returns None. Any exception is handed to self.log.errorLog() by the outer handler.
        '''
        try:  ### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            data = rje.dataDict(self,
                                self.info['PELM'],
                                mainkeys=['acc', 'position'])
            seqdict = {}  # Dictionary of Acc:Sequence (values become sequence objects in [3b])

            ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pdict = self.dict['PhosphoSites']
            for dkey in data:
                entry = data[dkey]
                ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # NOTE(review): assumes the combined key is whitespace-separated 'acc position' - confirm
                # against rje.dataDict(). str.split()/int() replace the deprecated string.split()/atoi().
                (acc, pos) = dkey.split()
                pos = int(pos)
                site = pdict.setdefault(acc, {}).setdefault(pos, {})
                ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                sequence = entry['sequence']
                if acc not in seqdict: seqdict[acc] = sequence  # First sequence seen for acc is kept
                elif seqdict[acc] != sequence:
                    self.log.printLog(
                        '#ERR', 'Warning. Sequence mismatch for %s' % acc)
                code = entry['code']
                if 'aa' not in site: site['aa'] = code  # First residue code seen for the site is kept
                elif site['aa'] != code:
                    self.log.printLog(
                        '#ERR',
                        'Warning. PhosphoSite mismatch for %s at pos %d: %s not %s'
                        % (acc, pos, code, site['aa']))
                seqaa = seqdict[acc][pos - 1:pos]  # pos is treated as 1-based: residue at index pos-1
                if code != seqaa:
                    self.log.printLog(
                        '#ERR',
                        'Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s'
                        % (acc, pos, code, seqaa))

            ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acclist = rje.sortKeys(seqdict)
            pelmuni = rje_uniprot.UniProt(self.log,
                                          self.cmd_list)  # UniProt entry
            unidict = pelmuni.accDict(
                acclist)  # Dictionary of {acc:UniProtEntry}
            pelmseq = rje_seq.SeqList(self.log, self.cmd_list +
                                      ['seqin=None'])  # SeqList object
            ## ~ [3b] Add one sequence for each AccNum and update seqdict  ~~~~~~~~~~~~~~~~~~~~~~~~ ##
            #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
            for acc in acclist:  #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
                sequence = seqdict[acc]
                try:
                    # Anything after '-' in the accession is dropped for the UniProt lookup
                    uniseq = unidict[acc.split('-')[0]].obj['Sequence']
                    name = '%s__%s %s' % (uniseq.info['ID'], acc,
                                          uniseq.info['Description'])
                    if sequence != uniseq.info['Sequence']:
                        self.log.printLog(
                            '#WARNING',
                            'Sequence mismatch for UniProt entry %s' % acc)
                except:
                    # Bare except kept on purpose: any failure in the lookup falls back to a
                    # placeholder name, so the phosphoELM sequence is still added.
                    self.log.errorLog('Problem with %s' % acc)
                    name = '%s_UNK__%s' % (
                        acc, acc)  #!# Add sequences where UniProt missing #!#
                seqdict[acc] = pelmseq._addSeq(name, sequence)  # Raw sequence replaced by _addSeq() result
            ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if self.opt['FilterSeq']:
                pelmseq.autoFilter()
                # Iterating the acclist copy of the keys makes popping from seqdict safe
                for acc in acclist:
                    if seqdict[acc] not in pelmseq.seq: seqdict.pop(acc)
                acclist = rje.sortKeys(seqdict)
            ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            fasfile = self.info['PELMFas']
            # Existing file is only overwritten if Interactive < 0 or the user agrees
            if (not os.path.exists(fasfile) or self.stat['Interactive'] < 0
                    or rje.yesNo('%s exists: overwrite?' % fasfile)):
                pelmseq.saveFasta(seqfile=fasfile)
            self.obj['SeqList'] = pelmseq
            self.obj['UniProt'] = pelmuni
        except:
            self.log.errorLog('Problem during PhosphoSeq.readPELM')