Exemplo n.º 1
0
 def run(self,imenu=False,outputmap=True,returndict=False):      ### Main controlling run Method
     '''
     Main controlling run Method. BLASTs the SeqIn sequences against MapDB and maps each query in turn
     with self.mapSeq().
     >> imenu:boolean = Whether to initiate interactive menu if appropriate [False].
     >> outputmap:boolean = Whether to output mapping into a file [True]
     >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False]
     << returns {} if no sequences were loaded; the {searchname:mappedname} dictionary if returndict=True;
        otherwise None.
     Raises ValueError if self.setup() fails. Any exception is logged with self.errorLog() and re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.setup(imenu): raise ValueError
         seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file'])
         if not seqlist.seqNum(): self.warnLog('No sequences loaded for mapping.'); return {}
         ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1'])
         blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'),
                      'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))})
         blast.setStat({'HitAln':blast.getStat('OneLine')})
         blast.list['ResTab'] = ['Search','Hit','GABLAM']
         if seqlist.nt(): blast.str['Type'] = 'blastx'      # Nucleotide queries are searched with blastx instead
         ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if outputmap: self._setupOutput()                           ## Output Files ##
         if returndict: mapdict = {}
         else: self._setupMapped()                                   ## Previously Mapped Sequences ##
         seqx = seqlist.seqNum()             ## Number of sequences ##
         ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False)
         ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast.run(format=True)
         self.obj['DB'] = blast.obj['DB']
         ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.debug(self.getStr('MapDB'))
         self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']])
         self.obj['MapDB'].loadSeq(self.getStr('MapDB'))
         self.debug('%s' % self.obj['MapDB'].list['Seq'])
         sx = 0
         while seqlist.nextSeq() is not None:
             search = seqlist.getSeq(format='short')
             sx += 1
             ## Check StartFrom ##
             if self.str['StartFrom']:
                 if self.str['StartFrom'] != search:
                     self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx))
                     continue
                 # Log BEFORE clearing StartFrom, otherwise the message reports an empty name.
                 # sx includes the matching sequence itself, so only sx-1 sequences were skipped.
                 self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (self.str['StartFrom'],sx-1))
                 self.str['StartFrom'] = ''
             ## Check if in Mapped ##
             if search in self.list['Mapped']:
                 resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'}
                 self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas']))
                 rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
                 continue
             ### Map Sequence ###
             self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx)))
             mapname = self.mapSeq(seqlist,blast,search)
             if returndict: mapdict[search] = mapname
         ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#MAP','Mapping of %s (%s seqs) complete.' % (self.str['SeqIn'],rje.integerString(seqx)))
         # BLAST results file is deleted unless running in debug or test mode
         if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()): os.unlink(blast.str['Name'])     #!# Add option to keep BLAST! #!#
         if returndict: return mapdict
     except: self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True); raise
Exemplo n.º 2
0
 def seqinObj(self,
              summarise=True,
              gapstats=True
              ):  ### Returns the SeqList object for the SeqIn file
     '''
     Returns the SeqList object for the SeqIn file, creating and storing it in self.obj['SeqIn'] if not
     already set.
     >> summarise:bool [True] = Whether to add 'summarise=T', 'dna=T' and 'raw=F' to the SeqList commands.
     >> gapstats:bool [True] = Whether to add 'gapstats' to the SeqList commands. Only applied if self.force()
        is True or the '<seqin basefile>.gaps.tdt' file does not exist.
     :return: self.obj['SeqIn']
     A ValueError is logged and re-raised. Any other error is logged and whatever is currently stored in
     self.obj['SeqIn'] is returned.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqbase = rje.baseFile(self.getStr('SeqIn'), strip_path=True)
         if not self.obj['SeqIn']:
             # Work on a COPY of the command list: the in-place += below would otherwise append these
             # options to self.cmd_list itself and leak them into every object built from it later.
             seqcmd = self.cmd_list[:]
             if summarise: seqcmd += ['summarise=T', 'dna=T', 'raw=F']
             gapstats = gapstats and (
                 self.force() or not rje.exists('%s.gaps.tdt' % seqbase))
             if gapstats: seqcmd += ['gapstats']
             self.obj['SeqIn'] = rje_seqlist.SeqList(
                 self.log,
                 seqcmd + ['autoload=T', 'seqmode=file', 'autofilter=F'])
             # NOTE: A check for pipe "|" characters in sequence names used to run here and is currently
             # disabled:
             # sx = 0.0; stot = self.obj['SeqIn'].seqNum()
             # for seq in self.obj['SeqIn'].seqs():
             #     self.progLog('\r#CHECK','Checking sequences names: %.1f%%' % (sx/stot)); sx += 100.0
             #     if '|' in self.obj['SeqIn'].shortName(seq):
             #         raise ValueError('Pipe "|" characters found in seqin=FILE names: will break program. Please rename and try again.')
             # self.printLog('\r#CHECK','Checking sequences names complete.')
     except ValueError:
         self.printLog('\r#CHECK', 'Checking sequences names aborted.')
         self.errorLog('DepthCharge input sequence error')
         raise
     except:
         self.errorLog('DepthCharge.seqinObj() error')
     return self.obj['SeqIn']
Exemplo n.º 3
0
 def batchSummarise(
     self
 ):  ### Batch run seqlist summarise on batchrun=LIST files and output table of results
     '''
     Batch run seqlist summarise on batchrun=LIST files and output table of results.
     Each file in self.list['BatchRun'] is loaded into a SeqList and summarised. The returned summary
     dictionary is added as an entry to a 'summarise' database table keyed on 'File', which is then saved.
     Unless self.force() is True, an existing 'summarise' table is loaded first (if available).
     << returns True if the table was saved; False if an error was logged (including an empty batchrun list).
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.list['BatchRun']:
             raise ValueError(
                 'Need to provide batchrun=LIST files for summarise mode.')
         db = rje_db.Database(self.log, self.cmd_list)
         self.printLog('#BASE', db.baseFile())
         sdb = None
         if not self.force():
             sdb = db.addTable(mainkeys=['File'],
                               name='summarise',
                               expect=False)
         if not sdb: sdb = db.addEmptyTable('summarise', ['File'], ['File'])
         # Preferred output order for known summary fields; any other fields follow in dictionary order
         fieldorder = 'SeqNum, TotLength, MinLength, MaxLength, MeanLength, MedLength, N50Length, L50Count, GapLength, GapPC, GCPC'.split(', ')
         ### ~ [2] Run Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog(
             '#BATCH', 'Batch summarising %s input files' %
             rje.iLen(self.list['BatchRun']))
         for seqfile in self.list['BatchRun']:
             seqdata = rje_seqlist.SeqList(
                 self.log, self.cmd_list +
                 ['seqin=%s' % seqfile, 'autoload=T', 'summarise=F'
                  ]).summarise()
             if not seqdata:
                 self.errorLog('Summarise failed for %s' % seqfile,
                               printerror=False)
                 continue
             if 'GC' in seqdata:
                 seqdata.pop('GC')
                 seqdata['GCPC'] = '%.2f' % seqdata['GCPC']
             if 'GapLength' in seqdata:
                 # Guard against zero total length so one empty file cannot abort the whole batch
                 gappc = 0.0
                 if seqdata['TotLength']:
                     gappc = 100.0 * seqdata['GapLength'] / seqdata['TotLength']
                 seqdata['GapPC'] = '%.2f' % gappc
             seqdata['MeanLength'] = '%.1f' % seqdata['MeanLength']
             for field in fieldorder + list(seqdata.keys()):
                 if field in seqdata and field not in sdb.fields():
                     sdb.addField(field)
             sdb.addEntry(seqdata)
         ### ~ [3] Output Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb.saveToFile()
         return True
     except:
         # NOTE(review): this formats the object itself into the message; sibling methods use self.prog() -
         # confirm prog() is available on this class before switching.
         self.errorLog('%s.batchSummarise error' % self)
         return False
Exemplo n.º 4
0
    def inSilicoHybrid(
        self
    ):  ### Filter and combine subreads from parent and output to fasta file.
        '''
        Filter and combine subreads from parent and output to fasta file.

        This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each
        parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing. Each
        parent needs a `*.fofn` file of subread file names, `*.unique.tdt` unique subreads table and `*.smrt.tdt` SMRT cell
        identifier table.)

        A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking the
        unique subreads from each parent by length. First, the longest subread from each parent are compared and the shortest
        selected to be the first subread of the diploid. (The shortest is taken to minimise length differences between the
        two parents.) Next, the longest subread from the next parent that is no longer than the previous subread is added.
        This cycles, picking a read from the parent with fewest cumulative bases each cycle. The longest subread that is
        no longer than the previous subread is selected. This continues until one parent runs out of subreads. Additional
        subreads will be added from the other parent if they reduce the difference in cumulative output for each parent.

        Final output will be a `*.subreads.fasta` file in which each parent has a similar total sequence content and for
        which the subread length distributions should also be similar. This is to overcome biases in resulting diploid
        assemblies, where one parent has higher quality data than the other.

        NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has much
        higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to reads with a
        minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may need to be
        relaxed.

        << returns True if the filtered fasta file was written and summarised; False if an error was logged.
        '''
        from collections import deque   # Local import: O(1) removal from the front of the sorted subread lists
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent1: %s' % self.getStr('Parent1'))
            base1 = rje.baseFile(self.getStr('Parent1'))
            parent1 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent1'),
                 'basefile=%s' % base1])
            parent1.setup()
            udb1 = parent1.udb()
            cdb = parent1.db('smrt', add=True, mainkeys=['Name'])
            cdb.dataFormat({'SMRT': 'int'})
            cx = cdb.entryNum()
            ## ~ [0b] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #')
            self.printLog('#FOFN', 'Parent2: %s' % self.getStr('Parent2'))
            base2 = rje.baseFile(self.getStr('Parent2'))
            parent2 = smrtscape.SMRTSCAPE(
                self.log, ['genomesize=13.1e6'] + self.cmd_list +
                ['batch=%s' % self.getStr('Parent2'),
                 'basefile=%s' % base2])
            parent2.setup()
            udb2 = parent2.udb()
            cdb2 = parent2.db('smrt', add=True, mainkeys=['Name'])
            cdb2.dataFormat({'SMRT': 'int'})
            # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1
            for entry in cdb2.entries() + udb2.entries():
                entry['SMRT'] = entry['SMRT'] + cx
            cdb = parent1.db().mergeTables(cdb, cdb2)
            ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog(
                '#~~#',
                '# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #'
            )
            minlen = self.getInt('LenFilter')
            minrq = self.getNum('RQFilter')
            rqstr = '%s' % minrq
            # rqstr[2:] drops the first two characters of the RQ string (e.g. '0.84' -> '84') for the file name
            filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(), minlen,
                                             rqstr[2:])
            ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqbatch = []  # List of SeqList objects
            self.printLog(
                '#BATCH', '%s sequence files to process.' %
                rje.iLen(parent1.list['Batch'] + parent2.list['Batch']))
            for seqfile in parent1.list['Batch'] + parent2.list['Batch']:
                seqcmd = self.cmd_list + [
                    'seqmode=file', 'autoload=T', 'summarise=F',
                    'seqin=%s' % seqfile, 'autofilter=F'
                ]
                seqbatch.append(rje_seqlist.SeqList(self.log, seqcmd))
            self.printLog(
                '#BATCH',
                '%s sequence files to summarise.' % rje.iLen(seqbatch))
            if not seqbatch:
                raise IOError(
                    'No batch input fasta files found! Make sure parentN=FILE settings given *.fofn.'
                )
            ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Subreads for each parent, longest first. Reads are consumed from the front.
            elists = [
                deque(udb1.sortedEntries('Len', reverse=True)),
                deque(udb2.sortedEntries('Len', reverse=True))
            ]
            plen = [0, 0]  # Summed lengths for each parent
            pseq = [0, 0]  # Total sequence number for each parent
            prq = [0, 0]  # Total sequence RQ for each parent (convert to mean)
            if not elists[0] or not elists[1]:
                raise ValueError(
                    'No Unique ZMW subreads for one or both parents!')
            lastlen = max(elists[0][0]['Len'],
                          elists[1][0]['Len'])  # Length of last selected read
            # Remove leading reads failing the RQ filter so that the longest usable reads can be compared
            for elist in elists:
                while elist and elist[0]['RQ'] < minrq:
                    elist.popleft()
            if not elists[0] or not elists[1]:
                raise ValueError(
                    'No Unique ZMW subreads for one or both parents!')
            nextp = 0  # Index of next parent to use
            if elists[0][0]['Len'] < elists[1][0]['Len']: nextp = 1

            ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            zmwlist = []  # List of (smrt,zmw,pos) meeting filtering criteria
            ux = 0.0
            utot = len(elists[0]) + len(elists[1])
            while lastlen:
                self.progLog('\r#DIP',
                             'Diploidising subreads: %.2f%%' % (ux / utot))
                elist = elists[nextp]
                while elist and elist[0]['RQ'] < minrq:
                    elist.popleft()
                    ux += 100.0
                if elist and elist[0]['Len'] < minlen:
                    # Sorted by length, so no remaining read for this parent can pass the length filter
                    ux += 100.0 * len(elist)
                    elist = []
                if not elist:
                    nextp = 1 - nextp   # This parent is exhausted: finish off with the other parent
                    break  # Finish
                entry = elist.popleft()
                ux += 100.0
                zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
                # Next read comes from whichever parent has the fewest cumulative bases
                if plen[1 - nextp] <= plen[nextp]: nextp = 1 - nextp
                lastlen = entry['Len']
            ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Keep adding reads from the remaining parent while they reduce the difference in summed length.
            # Every read is consumed or ends the loop, so this always terminates. (The previous nested loops
            # cycled forever once the next read passed the RQ filter but was shorter than minlen, and did not
            # apply the RQ filter to reads after the first.)
            elist = elists[nextp]
            while elist:
                self.progLog('\r#DIP',
                             'Diploidising subreads: %.2f%%' % (ux / utot))
                entry = elist.popleft()
                ux += 100.0
                if entry['RQ'] < minrq: continue    # Fails RQ filter: try the next read
                if entry['Len'] < minlen: break     # Sorted by length: no later read can pass the length filter
                pdiff = rje.modulus(plen[0] - plen[1])
                ediff = rje.modulus(plen[nextp] + entry['Len'] -
                                    plen[1 - nextp])
                if ediff >= pdiff: break  #Finish!
                zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
                plen[nextp] += entry['Len']
                prq[nextp] += entry['RQ']
                pseq[nextp] += 1
            self.printLog(
                '\r#DIP',
                'Diploidising subreads complete: %s subreads to output.' %
                rje.iLen(zmwlist))
            for (px, pname) in enumerate([self.getStr('Parent1'), self.getStr('Parent2')]):
                # A parent can contribute no reads at all: avoid dividing by zero for its mean RQ
                meanrq = 0.0
                if pseq[px]: meanrq = prq[px] / pseq[px]
                self.printLog(
                    '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' %
                    (pname, rje.iStr(pseq[px]), rje.iStr(plen[px]),
                     1.0 * plen[px] / self.getInt('GenomeSize'), meanrq))
            ## ~ [1c] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            zmwset = set(zmwlist)   # Set membership: one lookup per input sequence instead of a list scan
            rje.backup(self, filtfile)
            sx = 0.0
            sn = len(seqbatch)
            fx = 0
            # Context manager guarantees the output is flushed and closed before it is re-read in [2]
            with open(filtfile, 'w') as SEQOUT:
                for seqlist in seqbatch:
                    #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784
                    seqnum = seqlist.seqNum()
                    if not seqnum: continue     # Nothing to extract (and avoids division by zero below)
                    si = 100.0 / seqnum
                    for seq in seqlist.seqs():
                        self.progLog('\r#OUT',
                                     'Extracting subreads: %.2f%%' % (sx / sn))
                        sx += si
                        (name, sequence) = seqlist.getSeq(seq)
                        # Name is smrt/zmw/pos with an optional trailing RQ=X field
                        namedata = name.replace('/', ' ').split()
                        if len(namedata) not in (3, 4):
                            raise ValueError('Cannot parse SMRT/ZMW/Pos from subread name: %s' % name)
                        (smrt, zmw, pos) = namedata[:3]
                        if (cdb.data(smrt)['SMRT'], int(zmw), pos) not in zmwset:
                            continue
                        SEQOUT.write('>%s\n%s\n' % (name, sequence))
                        fx += 1
            self.printLog(
                '\r#OUT',
                'Saved %s filtered subreads to %s.' % (rje.iStr(fx), filtfile))

            ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seqcmd = self.cmd_list + [
                'seqmode=file', 'autoload=T', 'summarise=T',
                'seqin=%s' % filtfile, 'autofilter=F'
            ]
            rje_seqlist.SeqList(self.log, seqcmd)

            return True
        except:
            self.errorLog('%s.inSilicoHybrid error' % self.prog())
            return False
Exemplo n.º 5
0
    def alignmentToLocal(self,alignment=[],protqry=False):    ### Converts alignment into local hits table
        '''
        Converts alignment into local hits table.
        Each Query/Target alignment is parsed into one entry, which is then split on target introns into one
        entry per exon (alignment block). Combined hit sequences are also stored as SeqList objects in
        self.obj['DNAHits'] and self.obj['ProtHits'], and self.reduceLocal(byqry=True) is called with its
        returned table renamed 'unique'.
        >> alignment:list of alignment text strings parsed from exonerate output. (Copied, not modified.)
        >> protqry:bool[False] = Whether query is protein
        << returns local database table.
        Raises ValueError for truncated alignment blocks or Qry/Aln/Hit sequence length mismatches. All errors
        are logged with self.errorLog() and re-raised.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            vfields = ['Qry','Hit','AlnID','Score','Expect','Length','Identity','Positives','QryStart','QryEnd','HitStart','HitEnd','QrySeq','HitSeq','AlnSeq','Rank','Phase','HitStrand']
            vdb = self.db().addEmptyTable('local',vfields,['Qry','Hit','AlnID'])
            alnre = r'^\s+(\d+) : (.+) :\s+(\d+)'                       # Numbered alignment line: start : sequence : end
            intronre = r'(\s+>>>> Target Intron \d+ >>>>\s+)'           # Intron marker within the query line

            ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #i# Example exonerate output:
            #          Query: FAXD1_NOTSC (P82807) Venom prothrombin activator notecarin-D1 [Notechis scutatus scutatus]
            #         Target: ahap_PSETE__EBS10XV2AHAP187 haploidB edges=694320..157489 left=833615 right=281503 ver=1.9 style=4:[revcomp]
            #          Model: protein2genome:local
            #      Raw score: 1170
            #    Query range: 19 -> 295
            #   Target range: 12312786 -> 12307250
            #
            #        20 : AlaGluSerAsnValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg :       37
            #             ..!...|||   ||||||||||||||||||||||||||||||||||||||||||
            #             CysSerSerLeuValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg
            #  12312786 : TGTTCTTCTTTAGTATTCTTAAAAAGCAAAGTGGCAAATAGATTTTTGCAAAGA : 12312735
            #
            #       264 : {G}  >>>> Target Intron 7 >>>>  {ly}GluIleAspIleSerArg :      270
            #             {|}           1304 bp           {||}|||||||||||||||!!!
            #             {G}++                         ++{ly}GluIleAspIleSerSer
            #  12308652 : {G}gt.........................ag{GG}GAAATAGACATATCAAGC : 12307328
            #
            #       289 : ValProProAsnTyrTyrTyr :      295
            #             |||||| !!!..||| !!|||
            #             ValProAlaThrTyrAspTyr
            #  12307273 : GTTCCTGCCACGTATGACTAT : 12307251
            qry = None
            hit = None
            qhcount = {}    # Number of alignments parsed for each (Qry,Hit) pair: used for AlnID
            ventry = {}
            parsing = alignment[0:]     # Copy, so that the caller's list is not consumed
            rank = 1

            while parsing:
                line = parsing.pop(0)
                #self.bugPrint(line)
                # Query
                qrymatch = rje.matchExp(r'Query: (\S+)',line)
                if qrymatch:
                    if ventry: vdb.addEntry(ventry)
                    ventry = {'Qry':qrymatch[0],'QrySeq':'','HitSeq':'','AlnSeq':'','Rank':rank}
                    rank += 1
                # Hit
                hitmatch = rje.matchExp(r'Target: (\S+)',line)
                if hitmatch:
                    ventry['Hit'] = hitmatch[0]
                    qh = (ventry['Qry'],ventry['Hit'])
                    if qh in qhcount: qhcount[qh] += 1
                    else: qhcount[qh] = 1
                    ventry['AlnID'] = qhcount[qh]
                # Score
                scorematch = rje.matchExp(r'core: (\S+)',line)
                if scorematch:
                    ventry['Score'] = int(scorematch[0])
                # Alignment
                adata = rje.matchExp(alnre,line)
                if adata:
                    #self.bugPrint('= new aln: %s ->  %s' % (adata[0],adata[2]))
                    start = int(adata[0])
                    end = int(adata[2])
                    aln = adata[1]
                    x = line.find(aln)      # Column at which the alignment text starts on each line of the block
                    if 'QryStart' not in ventry: ventry['QryStart'] = start
                    ventry['QryEnd'] = end
                    ventry['QrySeq'] += aln
                    #self.bugPrint('^%s$' % ventry['QrySeq'])

                    # Next line is the alignment match line, cut to the same columns as the query text
                    if not parsing: raise ValueError('Partial alignment! Truncated output?')
                    line = parsing.pop(0)
                    #self.bugPrint(line)
                    ventry['AlnSeq'] += line[x:x+len(aln)]
                    #self.debug('^%s$' % ventry['AlnSeq'])

                    # The numbered hit line is either the next line or the one after it (an un-numbered line,
                    # such as the translated hit in the example above, may come first).
                    adata = None
                    for attempt in range(2):
                        if not parsing: break
                        adata = rje.matchExp(alnre,parsing.pop(0))
                        if adata: break
                    if not adata: raise ValueError('Partial alignment! Truncated output?')
                    #self.bugPrint('+ hit aln: %s ->  %s' % (adata[0],adata[2]))
                    start = int(adata[0])
                    end = int(adata[2])
                    aln = adata[1]
                    if 'HitStart' not in ventry: ventry['HitStart'] = start
                    ventry['HitEnd'] = end
                    ventry['HitSeq'] += aln
            if ventry: vdb.addEntry(ventry)
            ## Seq Check
            for ventry in vdb.entries():
                #self.bugPrint('^%s$' % ventry['QrySeq'])
                #self.bugPrint('^%s$' % ventry['AlnSeq'])
                #self.bugPrint('^%s$' % ventry['HitSeq'])
                if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                    self.debug(ventry)
                    raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))

            ### ~ [3] Split on introns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.obj['DNAHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F','dna=T'])
            self.obj['ProtHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F'])

            #i# Protein Position Conversion
            #i# Protein query positions are converted to nucleotide-scale positions (3 per residue) for the
            #i# exon splitting and converted back at the end of the method.
            if protqry:
                for ventry in vdb.entries():
                    # 1->1, 2->4, 3->7 = 1+3*(n-1)
                    ventry['QryStart'] = 1+3*(ventry['QryStart']-1)
                    if ventry['QrySeq'].startswith('{'):
                        codend = ventry['QrySeq'].find('}')
                        # {X} = phase 2, find = 2
                        if codend == 2: ventry['QryStart'] += 2
                        # {XX} = phase 1, find = 3
                        elif codend == 3: ventry['QryStart'] += 1
                        else: raise ValueError('QrySeq {} bracket mismatch!: %s' % ventry)
                    ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - ventry['QrySeq'].count('-') - 1

            vdb.newKey(['Qry','Rank','Hit','AlnID'])
            for vkey in vdb.dataKeys():
                ventry = vdb.data(vkey)
                #i# Make a combined hitseq to output to fasta
                #># phap_PSETE__EBS10XV2PHAP187.FAXD1_NOTSC.XXX
                hitname = '%s.ex%s %s %s-%s' % (ventry['Qry'],ventry['Rank'],ventry['Hit'],rje.iStr(ventry['HitStart']),rje.iStr(ventry['HitEnd']))
                hitseq = ''
                phase = (ventry['QryStart'] + 2) % 3
                alnx = 1    # Counter of alignment blocks (exons) for this hit
                vkeyentries = [ventry]
                dirn = 1
                if ventry['HitEnd'] < ventry['HitStart']:
                    dirn = -1
                    ventry['HitStrand'] = '-'
                else: ventry['HitStrand'] = '+'
                for seq in ['HitSeq','QrySeq','AlnSeq']:
                    ventry[seq] = ventry[seq].replace('}','')
                    ventry[seq] = ventry[seq].replace('{','')
                intronmatch = rje.matchExp(intronre,ventry['QrySeq'])
                while intronmatch:
                    intron = intronmatch[0]
                    x = ventry['QrySeq'].find(intron)
                    y = x + len(intron)
                    intronlen = int(rje.matchExp(r'(\d+) bp',ventry['AlnSeq'][x:y])[0])
                    #i# Create a new entry of the first exon
                    newentry = rje.combineDict({},ventry)
                    for seq in ['HitSeq','QrySeq','AlnSeq']:
                        newentry[seq] = newentry[seq][:x]
                    newentry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx); alnx += 1
                    newentry['QryEnd'] = newentry['QryStart'] + len(newentry['QrySeq']) - newentry['QrySeq'].count('-') - 1
                    newentry['HitEnd'] = newentry['HitStart'] + (len(newentry['HitSeq']) - newentry['HitSeq'].count('-') - 1) * dirn
                    newentry['Length'] = x
                    newentry['Identity'] = newentry['AlnSeq'].count('|')
                    vkeyentries.append(vdb.addEntry(newentry))
                    hitseq += newentry['HitSeq']
                    #i# Update ventry to be the rest of the hit
                    for seq in ['HitSeq','QrySeq','AlnSeq']:
                        ventry[seq] = ventry[seq][y:]
                    ventry['QryStart'] = newentry['QryEnd'] + 1
                    if protqry: ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - ventry['QrySeq'].count('-') - 1
                    #i# HitEnd is the last base of the exon and the intron covers the next intronlen bases, so the
                    #i# next exon starts intronlen+1 positions on. In the example above: 1 exon base at 12308652,
                    #i# a 1304 bp intron, then 20 bases ending at 12307328, i.e. starting at 12307347.
                    ventry['HitStart'] = newentry['HitEnd'] + (intronlen + 1) * dirn
                    intronmatch = rje.matchExp(intronre,ventry['QrySeq'])
                #i# Calculate length and identity of final exon
                ventry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx)
                ventry['Length'] = len(ventry['AlnSeq'])
                ventry['Identity'] = ventry['AlnSeq'].count('|')
                #i# Add sequence hits
                hitname += ' (%d alignment blocks)' % alnx
                hitseq += ventry['HitSeq']
                hitseq = hitseq.replace('-','')
                protseq = rje_sequence.dna2prot('%s%s' % ('N' * phase,hitseq))
                self.obj['ProtHits']._addSeq(hitname,protseq)
                if ventry['HitStart'] > ventry['HitEnd']: hitseq = rje_sequence.reverseComplement(hitseq)
                self.obj['DNAHits']._addSeq(hitname,hitseq)

                #i# Update AlnID for proper float sorting
                for ventry in vkeyentries:
                    (vcore,vx) = ventry['AlnID'].split('.')
                    ventry['AlnID'] = '%s.%s' % (vcore,rje.preZero(int(vx),alnx))
                    #self.debug(ventry)
            vdb.dataFormat({'AlnID':'string'})
            vdb.remakeKeys()
            self.debug(vdb.dataKeys())

            ## Seq Check
            for ventry in vdb.entries():
                #self.bugPrint('^%s$' % ventry['QrySeq'])
                #self.bugPrint('^%s$' % ventry['AlnSeq'])
                #self.bugPrint('^%s$\n' % ventry['HitSeq'])
                if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                    self.debug(ventry)
                    raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))

            udb = self.reduceLocal(byqry=True)
            udb.rename('unique')
            udb.newKey(['Qry','Rank','Hit','AlnID'])
            self.debug(vdb.dataKeys())

            #i# Calculate exon phase
            for ventry in vdb.entries() + udb.entries(): ventry['Phase'] = (ventry['QryStart'] - 1) % 3

            #i# Protein Position Conversion
            #i# Floor division keeps positions as integers under both Python 2 and Python 3
            if protqry:
                for ventry in vdb.entries():
                    ventry['QryStart'] = (ventry['QryStart']+2)//3
                    ventry['QryEnd'] = (ventry['QryEnd']+2)//3
                for ventry in udb.entries():
                    ventry['QryStart'] = (ventry['QryStart']+2)//3
                    ventry['QryEnd'] = (ventry['QryEnd']+2)//3

            #vdb.remakeKeys()
            return vdb

        except: self.errorLog('%s.alignmentToLocal error' % self.prog()); raise
Exemplo n.º 6
0
    def setup(self):  ### Main class setup method.
        '''
        Main class setup method.
        Looks up the lower-case 'Name' setting, prints the intro and full command list for a recognised program and
        builds its main object in self.obj['Prog']. If nothing was built, the user may be prompted (interactive runs
        only) for help and/or a new program name, in which case setup() calls itself again.
        << Returns self.obj['Prog'] on success, else False (unrecognised program with self.i() < 0, blank name given
           at the prompt, CTRL+C, or any other error, which is reported via self.errorLog()).
        '''
        try:  ### ~ [1] ~ Setup Program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.obj['Prog'] = None
            prog = self.getStrLC('Name')
            if prog in mod:
                proginfo = mod[prog].makeInfo()
                self.obj['ProgInfo'] = proginfo
                self.printLog('#PROG', '%s V%s: %s' % (proginfo.program, proginfo.version, proginfo.description))
                # Program defaults come first, then this object's commands; the sub-program must not start a new log.
                progcmd = rje.getCmdList([], info=proginfo) + self.cmd_list + ['newlog=F']
                out = rje.Out(cmd_list=progcmd)
                out.printIntro(proginfo)
                if self.getBool('Help'):
                    progcmd = mod[prog].cmdHelp(proginfo, out, ['help'] + progcmd)
                nopath = self.getStrLC('Rest') and not self.dev()
                tidycmd = rje.tidyArgs(progcmd, nopath=nopath, purgelist=purgelist)
                self.printLog('#CMD', 'Full %s CmdList: %s' % (proginfo.program, rje.argString(tidycmd)),
                              screen=False)
                ## ~ [1a] ~ Make self.obj['Prog'] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Ordered (aliases, factory) pairs. Factories are lambdas so that a module/class is only looked up
                # for the program actually requested.
                launchers = [
                    (['seqlist', 'rje_seqlist'], lambda log, cmd: rje_seqlist.SeqList(log, cmd)),
                    (['uniprot', 'rje_uniprot'], lambda log, cmd: rje_uniprot.UniProt(log, cmd)),
                    (['taxonomy', 'rje_taxonomy'], lambda log, cmd: rje_taxonomy.Taxonomy(log, cmd)),
                    (['tree', 'rje_tree'], lambda log, cmd: rje_tree.Tree(log, cmd)),
                    (['xref', 'rje_xref'], lambda log, cmd: rje_xref.XRef(log, cmd)),
                    (['seq', 'rje_seq'], lambda log, cmd: rje_seq.SeqList(log, cmd)),
                    (['mitab', 'rje_mitab'], lambda log, cmd: rje_mitab.MITAB(log, cmd)),
                    (['dbase', 'database'], lambda log, cmd: rje_dbase.DatabaseController(log, cmd)),
                    (['pydocs'], lambda log, cmd: rje_pydocs.PyDoc(log, cmd)),
                    (['ensembl', 'rje_ensembl'], lambda log, cmd: rje_ensembl.EnsEMBL(log, cmd)),
                    (['genbank', 'rje_genbank'], lambda log, cmd: rje_genbank.GenBank(log, cmd)),
                    (['extatic'], lambda log, cmd: extatic.ExTATIC(log, cmd)),
                    (['revert'], lambda log, cmd: revert.REVERT(log, cmd)),
                    (['fiesta'], lambda log, cmd: fiesta.FIESTA(log, cmd)),
                    (['gablam'], lambda log, cmd: gablam.GABLAM(log, cmd)),
                    (['gopher'], lambda log, cmd: gopher.Gopher(log, cmd)),
                    (['haqesac'], lambda log, cmd: haqesac.HAQESAC(log, cmd)),
                    (['multihaq'], lambda log, cmd: multihaq.MultiHAQ(log, cmd)),
                    (['pingu'], lambda log, cmd: pingu.PINGU(log, cmd)),
                    (['pacbio'], lambda log, cmd: rje_pacbio.PacBio(log, cmd)),
                    (['rje_zen', 'zen'], lambda log, cmd: rje_zen.Zen(log, cmd)),
                ]
                for aliases, build in launchers:
                    if prog in aliases:
                        self.obj['Prog'] = build(self.log, progcmd)
                        break

            ### ~ [2] ~ Success, or failure to recognise program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.obj['Prog']:
                return self.obj['Prog']  # Setup successful
            self.printLog('#ERR', 'Program "%s" not recognised.' % self.getStr('Name'))
            if self.i() < 0:
                return False
            if rje.yesNo('Show SeqSuite help with program options?'):
                extracmd = cmdHelp(cmd_list=['help'])[1:]
                if extracmd:
                    self.cmd_list += extracmd
                    self._cmdList()
                    # Only retry straight away if the extra commands changed the program name.
                    if prog != self.getStrLC('Name'):
                        return self.setup()
            newname = rje.choice('Give program name (Blank or CTRL+C to quit)')
            self.setStr({'Name': newname})
            if not self.getStrLC('Name'):
                return False
            return self.setup()
        except KeyboardInterrupt:
            return False
        except SystemExit:
            raise
        except:
            self.errorLog('Problem during %s setup.' % self.prog())
            return False  # Setup failed
Exemplo n.º 7
0
    def makeHTML(self):  ### Generates HTML pages for interactive navigation.
        '''
        Generates HTML pages for interactive navigation.

        Writes a front page, <basefile>.html, holding up to three tabs ("Hits", "GABLAM" when the GABLAM table was
        loaded, and "Candidates" when candidate sequences were loaded). It then writes one page per hit inside the
        ./<basefile>.HAQESAC/ directory, embedding whichever of that hit's png, tree.txt, fas, nwk and log files exist.
        Any exception is reported through self.errorLog() and is not re-raised, so the method returns None either way.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            basefile = self.basefile()
            scmd = self.cmd_list + [
                'seqin=%s' % self.getStr('Candidates'), 'autoload=T',
                'autofilter=F', 'seqmode=file'
            ]
            candseq = rje_seqlist.SeqList(self.log, scmd)
            # All files and directories are named after basefile:
            # *.fas = original target PROTEIN sequences (with original descriptions)
            scmd = self.cmd_list + [
                'seqin=%s' % self.getStr('SeqIn'), 'autoload=T',
                'autofilter=F', 'seqmode=file'
            ]
            seqlist = rje_seqlist.SeqList(self.log, scmd)
            # *.gablam.tdt = GABLAM results with match details. (Might have *.hmmer.tdt instead.)
            # - Contains candidate proteins as Queries and Target proteins as hits
            gdb = self.db().addTable('%s.gablam.tdt' % basefile,
                                     mainkeys=['Qry', 'Hit'],
                                     name='gablam',
                                     expect=False)
            # *.HAQESAC/ = directory containing individual HAQESAC runs, named after Hit accnum
            haqdir = rje.makePath('./%s.HAQESAC/' % basefile)

            ### ~ [2] Generate front page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            hfile = '%s.html' % basefile
            hobj = self.obj['HTML']
            hobj.list['StyleSheets'] = [
                'http://www.slimsuite.unsw.edu.au/stylesheets/rje_tabber.css',
                'http://www.slimsuite.unsw.edu.au/stylesheets/slimhtml.css'
            ]
            html = hobj.htmlHead(basefile)
            html += '<h1>%s</h1>\n\n' % basefile
            htabs = []  # (tab_id, tab_html_text[, tab_title])
            ## ~ [2a] Target protein list (with links to HAQ HTML) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # NOTE(review): 'Descripton' is emitted verbatim as a column header; left unchanged in case it is parsed.
            ctext = '%s\n' % string.join(['Name', 'Descripton', 'Length'],
                                         '\t')
            seqdict = seqlist.makeSeqNameDic('short')
            # Without a GABLAM table, every loaded target sequence is listed.
            if gdb: hitlist = gdb.indexKeys('Hit')
            else: hitlist = rje.sortKeys(seqdict)
            for name in hitlist:
                seq = seqdict[name]
                cseq = [
                    name,
                    seqlist.seqDesc(seq),
                    '%s aa' % seqlist.seqLen(seq)
                ]
                acc = seqlist.seqAcc(seq)
                # Only link to the per-hit page when a HAQESAC log exists for that accession.
                if os.path.exists('%s%s.log' % (haqdir, acc)):
                    cseq[0] = '<a href="%s%s.html">%s</a>' % (haqdir, acc,
                                                              cseq[0])
                ctext += '%s\n' % string.join(cseq, '\t')
            htabs.append(
                ('Hits', rje_html.tableToHTML(ctext, '\t', tabid='parse'),
                 'Target sequences hit by candidates.'))
            ## ~ [2b] GABLAM/HMM table (with above links) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if gdb:
                ctext = '%s\n' % string.join(gdb.fields(), '\t')
                # Read the raw table (minus header row) and close the handle straight away.
                with open('%s.gablam.tdt' % basefile, 'r') as gfile:
                    glines = gfile.readlines()[1:]
                # NOTE(review): each gline keeps its line ending, so the last field carries a newline into ctext;
                # confirm that tableToHTML tolerates the resulting blank lines.
                for gline in glines:
                    gdata = string.split(gline, '\t')
                    # First column = query: link out to UniProt using the text after the last '__'.
                    acc = string.split(gdata[0], '__')[-1]
                    gdata[
                        0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (
                            acc, gdata[0])
                    # Second column = hit: link to the per-hit HAQESAC page.
                    acc = string.split(gdata[1], '__')[-1]
                    gdata[1] = '<a href="%s%s.html">%s</a>' % (haqdir, acc,
                                                               gdata[1])
                    ctext += '%s\n' % string.join(gdata, '\t')
                htabs.append(
                    ('GABLAM', rje_html.tableToHTML(ctext, '\t',
                                                    tabid='parse'),
                     'GABLAM hit table.'))
            ## ~ [2c] Candidate list (with DB links) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if candseq.seqNum():
                ctext = '%s\n' % string.join(
                    ['AccNum', 'ID', 'Descripton', 'Length'], '\t')
                accdict = candseq.makeSeqNameDic('accnum')
                for acc in rje.sortKeys(accdict):
                    seq = accdict[acc]
                    cseq = [
                        '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc, acc),
                        candseq.seqID(seq),
                        candseq.seqDesc(seq),
                        '%s aa' % candseq.seqLen(seq)
                    ]
                    ctext += '%s\n' % string.join(cseq, '\t')
                htabs.append(('Candidates',
                              rje_html.tableToHTML(ctext, '\t', tabid='parse'),
                              'Candidate sequences to search.'))
            html += hobj.tabberHTML('GABLAM', htabs)
            html += hobj.htmlTail()
            with open(hfile, 'w') as hout:
                hout.write(html)

            ### ~ [3] Generate sequence-specific pages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #?# Move this to HAQESAC or MultiHAQ
            # Matches ": <id>_<species>__<accession> " in tree text; groups = (whole match, id, species, accession).
            treename_re = r'(: (\S+)_(\S+)__(\S+) )'
            lastx = len(hitlist) - 1
            for i, hitname in enumerate(hitlist):
                hit = string.split(hitname, '__')[-1]
                logfile = '%s%s.log' % (haqdir, hit)
                seqbase = logfile[:-4]  # Strip '.log' to get the shared prefix of the hit's output files
                hfile = '%s.html' % seqbase
                html = hobj.htmlHead(seqbase)
                html += '<h1>%s</h1>\n\n' % seqbase
                html += '<p>Click <a href="../%s.html">here</a> to return to results summary. \n' % basefile
                if i:
                    html += 'Previous: <a href="./%s.html"><code>%s</code></a>. \n' % (
                        string.split(hitlist[i - 1], '__')[-1], hitlist[i - 1])
                if i < lastx:
                    html += 'Next: <a href="./%s.html"><code>%s</code></a>. \n' % (
                        string.split(hitlist[i + 1], '__')[-1], hitlist[i + 1])
                html += '</p>\n'
                htabs = []  # (tab_id, tab_html_text[, tab_title])
                for ftype in ['png', 'tree.txt', 'fas', 'nwk', 'log']:
                    seqfile = '%s.%s' % (seqbase, ftype)
                    if not os.path.exists(seqfile): continue
                    seqname = os.path.basename(seqfile)
                    tabtext = '<p><a href="./%s">./%s</a></p>\n' % (seqname, seqname)
                    if ftype == 'png':
                        tabtext += '<a href="./%s"><img src="%s" width="100%%"></a>\n' % (seqname, seqname)
                        tabdesc = 'PNG of %s tree.' % seqbase
                    else:
                        with open(seqfile, 'r') as sfile:
                            tabtext += '<pre>%s</pre>\n' % sfile.read()
                        if ftype == 'tree.txt':
                            # First turn names of other hits into links to their own pages...
                            for xref in hitlist:
                                reptext = '<a href="./%s.html">%s</a>' % (
                                    string.split(xref, '__')[-1], xref)
                                tabtext = string.replace(
                                    tabtext, ': %s ' % xref, ': %s ' % reptext)
                            # ...then link every remaining sequence name out to UniProt. Each replacement inserts
                            # markup containing spaces, so the rewritten name is not matched again at that position.
                            found = rje.matchExp(treename_re, tabtext)
                            while found:
                                (oldtext, sid, spec, spacc) = found
                                newtext = ': %s_<a href="http://www.uniprot.org/taxonomy/?query=%s&sort=score" target="_blank">%s</a>__<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a> ' % (
                                    sid, spec, spec, spacc, spacc)
                                tabtext = string.replace(
                                    tabtext, oldtext, newtext)
                                found = rje.matchExp(treename_re, tabtext)
                        tabdesc = '%s output' % seqfile
                    htabs.append((ftype, tabtext, tabdesc))
                if htabs:
                    html += hobj.tabberHTML(os.path.basename(seqbase), htabs)
                else:
                    html += '<p><i>No output found for <code>%s</code>!</i></p>\n' % hit
                html += hobj.htmlTail()
                with open(hfile, 'w') as hout:
                    hout.write(html)
        except:
            self.errorLog('Problem with %s.makeHTML()' % self.prog())
Exemplo n.º 8
0
 def seqSubset2(self):  ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
     '''
     Extracts sequence subset from MOUSE cDNA and Peptide libraries.

     [1]/[2] Loads <basefile>.map.tdt if it exists. Otherwise builds the 'map' table (Ingolia gene -> MGI gene ->
     EnsEMBL gene) from the MGI mapping files, writes unmappable genes to ingolia.bad.txt and makes the
     Ingolia.cdna/pep.all.fa subsets of the mouse libraries.
     [2b] Adds an 'ENST' field of comma-separated transcript IDs to the map table if it is missing, then saves it.
     [3] For every entry of the 'starts' table, looks for a mapped transcript whose 7 nt around the initiation codon
     equal the reported context and keeps the one giving the longest peptide. Matches of at least 20 aa are written
     to IngExact.cdna.all.fa / IngExact.pep.all.fa and the annotated table to <basefile>.mapped_exact.tdt.

     Any exception is reported through self.errorLog() and is not re-raised. Returns None.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if os.path.exists('%s.map.tdt' % self.baseFile()):
             mdb = self.db().addTable('%s.map.tdt' % self.baseFile(),
                                      mainkeys=['Ingolia'],
                                      name='map')
         else:
             ### ~ [2] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
             # FIX: was a bare `db.addTable(...)`; `db` is never bound in this method. The table is fetched again
             # below with self.db('xref'), so it is loaded through the same self.db() object.
             self.db().addTable(xfile, mainkeys=['Gene'], name='xref')
             afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
             self.obj['Map'] = rje_genemap.GeneMap(self.log, self.cmd_list)
             #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
             self.obj['Map'].loadData(
                 ['sourcedata=%s' % xfile,
                  'aliases=%s' % afile])
             # Upper-case gene names from the 'starts' table (join + split also breaks up names containing spaces).
             ing_genes = string.split(
                 string.join(
                     self.db('starts').index('Gene').keys()).upper())
             genemap = self.obj['Map']
             ing_map = {}
             for gene in ing_genes:
                 ing_map[gene] = genemap.bestMap(gene)
             ing_mgi = rje.sortUnique(ing_map.values())
             self.printLog(
                 '#MUSG', '%s Ingolia genes mapped onto %s MGI genes' %
                 (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
             xdb = self.db('xref')
             bad_genes = []
             for gene in ing_mgi[0:]:  # Iterate over a copy: ing_mgi itself is pruned inside the loop
                 if gene not in xdb.data():
                     self.printLog(
                         '#MAP',
                         'Cannot map gene "%s" from Ingolia data!' % gene)
                     bad_genes.append(gene)
                     ing_mgi.remove(gene)
             self.printLog(
                 '#BAD', 'Failed to map %s genes from Ignolia' %
                 rje.iLen(bad_genes))
             with open('ingolia.bad.txt', 'w') as badfile:
                 badfile.write(string.join(bad_genes))
             ### ~ [2a] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             ing_musg = xdb.dataList(xdb.entryList(ing_mgi),
                                     'EnsEMBL',
                                     sortunique=True)
             if '' in ing_musg: ing_musg.remove('')
             self.printLog(
                 '#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes' %
                 (rje.iLen(ing_genes), rje.iLen(ing_musg)))
             if not ing_musg: raise ValueError
             self.deBug(ing_musg[:10])
             for stype in ['cdna', 'pep']:
                 seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                 seqout = 'Ingolia.%s.all.fa' % stype
                 # FIX: the guard used to test for a missing *input* library (seqfile), so the subset was only
                 # made when there was nothing to read. It is (re)made when forced or when the output is missing.
                 if self.getBool('Force') or not os.path.exists(seqout):
                     seqcmd = self.cmd_list + [
                         'seqin=%s' % seqfile,
                         'seqout=%s' % seqout, 'autofilter=T', 'autoload=T',  # FIX: was misspelt 'autload=T'
                         'seqmode=file',
                         'gooddesc=%s' % string.join(ing_musg, ',')
                     ]
                     rje_seqlist.SeqList(self.log, seqcmd)
             # FIX: key was ['Ignolia'], which is not one of the table's fields.
             mdb = self.db().addEmptyTable('map',
                                           ['Ingolia', 'Gene', 'EnsEMBL'],
                                           ['Ingolia'])
             for gene in ing_map:
                 entry = {'Ingolia': gene, 'Gene': ing_map[gene]}
                 if entry['Gene'] in bad_genes: entry['EnsEMBL'] = ''
                 else:
                     entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                 mdb.addEntry(entry)
         ### ~ [2b] Add transcript IDs to the map table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqfile = 'Ingolia.cdna.all.fa'
         seqcmd = self.cmd_list + [
             'seqin=%s' % seqfile, 'autofilter=F', 'autoload=T',  # FIX: was misspelt 'autload=T'
             'seqmode=file'
         ]
         iseq = rje_seqlist.SeqList(self.log, seqcmd)
         if 'ENST' not in mdb.fields():
             mdb.addField('ENST', evalue='')
             while iseq.nextSeq():
                 (iname, icdna) = iseq.getSeq()
                 musg = rje.matchExp(r'gene:(\S+)', iname)[0]
                 # The first word of the sequence name is taken as the transcript identifier.
                 for entry in mdb.indexEntries('EnsEMBL', musg):
                     if entry['ENST']:
                         entry['ENST'] += ',%s' % string.split(iname)[0]
                     else:
                         entry['ENST'] = string.split(iname)[0]
             mdb.saveToFile()
         ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sdb = self.db('starts')
         sdb.dataFormat({'Init Codon [nt]': 'int'})
         icod = 'Init Codon [nt]'
         icon = 'Init Context [-3 to +4]'
         sdb.info['Name'] = 'mapped_start'
         sdb.addField('ENST')
         sdb.addField('ENSP')
         sdb.addField('ENSI')
         ex = 0.0  # Grows by 100 per entry, so ex/etot is a percentage and ex/100 the 1-based entry number
         etot = sdb.entryNum()
         sx = 0  # Entries written to the output files
         fx = 0  # Entries that could not be mapped
         minpep = 20  # Minimum peptide length to output
         # Context managers close both output files even if the mapping loop raises.
         with open('IngExact.cdna.all.fa', 'w') as enst_out:
             with open('IngExact.pep.all.fa', 'w') as ensp_out:
                 for entry in sdb.entries():
                     self.progLog(
                         '\r#ING',
                         'Mapping Ignolia Harrington Starts: %.2f%%' % (ex / etot))
                     ex += 100.0
                     entry[icon] = entry[icon].upper()
                     gene = entry['Gene'].upper()
                     mentry = mdb.data(gene)
                     # FIX: ENSP is now blanked with its sibling fields, so failed entries are consistent.
                     entry['ENST'] = entry['ENSP'] = entry['ENSI'] = ''
                     cdnaseq = peptseq = ''
                     if not mentry or not mentry['ENST']:
                         fx += 1
                         continue
                     mtype = 'fail'
                     for trans in string.split(mentry['ENST'], ','):
                         (tname, tseq) = iseq.getDictSeq(trans, format='tuple')
                         # 7 nt window starting 3 nt before the initiation codon index.
                         context = tseq[entry[icod] - 3:][:7]
                         self.deBug('%s vs %s' % (context, entry[icon]))
                         if context == entry[icon]:
                             # Translate from the start codon up to the first stop.
                             ipept = string.split(
                                 rje_sequence.dna2prot(tseq[entry[icod]:]), '*')[0]
                             self.deBug(ipept)
                             if len(ipept) > len(peptseq):  # Keep the transcript giving the longest peptide
                                 entry['ENST'] = trans
                                 cdnaseq = tseq
                                 peptseq = ipept
                                 mtype = 'exact'
                     if not entry['ENST']:
                         self.printLog(
                             '\r#ING',
                             'Unable to find Harrington start for %s %s (%s)' %
                             (gene, entry[icod], entry[icon]),
                             screen=False)
                         fx += 1
                         continue
                     elif len(peptseq) < minpep:
                         self.printLog(
                             '\r#ING',
                             'Peptide from mapped Harrington start for %s %s (%s) too short!'
                             % (gene, entry[icod], entry[icon]),
                             screen=False)
                         fx += 1
                         continue
                     ing_id = rje.preZero(int(ex / 100), etot)
                     entry['ENSI'] = 'ENSINGT%s' % ing_id
                     entry['ENSP'] = 'ENSINGP%s' % ing_id
                     enst_out.write(
                         '>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n'
                         % (ing_id, mtype, entry['ENST'], mentry['EnsEMBL'],
                            entry['Gene'], mentry['Gene'], cdnaseq))
                     ensp_out.write(
                         '>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n'
                         % (ing_id, mtype, entry['ENST'], mentry['EnsEMBL'], ing_id,
                            entry['Gene'], mentry['Gene'], peptseq))
                     sx += 1
                 sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
         self.printLog(
             '\r#ING',
             'Output %s Ingolia peptides and transcripts. %s failed.' %
             (rje.iStr(sx), rje.iStr(fx)))
         return
     except:
         self.errorLog('%s.method error' % self)