def run(self,imenu=False,outputmap=True,returndict=False):      ### Main controlling run Method
    '''
    Main controlling run Method.

    Loads the input sequences into a SeqList, runs a BLAST search of them against the MapDB database and then
    calls self.mapSeq() on each input sequence in turn.

    >> imenu:boolean = Whether to initiate interactive menu if appropriate [False].
    >> outputmap:boolean = Whether to output mapping into a file [True]
    >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False]
    << returns {searchname:mappedname} if returndict=True; {} if no sequences were loaded; otherwise None.
       Any exception is logged with self.errorLog() and then re-raised.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.setup(imenu): raise ValueError
        seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file'])
        if not seqlist.seqNum():
            self.warnLog('No sequences loaded for mapping.')
            return {}
        ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1'])
        blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'),
                      'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))})
        blast.setStat({'HitAln':blast.getStat('OneLine')})
        blast.list['ResTab'] = ['Search','Hit','GABLAM']
        if seqlist.nt(): blast.str['Type'] = 'blastx'     # Search type is switched when seqlist.nt() is true
        ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if outputmap: self._setupOutput()                   ## Output Files ##
        if returndict: mapdict = {}
        else: self._setupMapped()                           ## Previously Mapped Sequences ##
        seqx = seqlist.seqNum()                             ## Number of sequences ##

        ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False)
        ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        blast.run(format=True)
        self.obj['DB'] = blast.obj['DB']
        ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.debug(self.getStr('MapDB'))
        self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']])
        self.obj['MapDB'].loadSeq(self.getStr('MapDB'))
        self.debug('%s' % self.obj['MapDB'].list['Seq'])
        sx = 0
        while seqlist.nextSeq() is not None:
            search = seqlist.getSeq(format='short')
            sx += 1
            ## Check StartFrom ##
            if self.str['StartFrom']:
                if self.str['StartFrom'] != search:
                    self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx))
                    continue
                # Report BEFORE clearing StartFrom, otherwise the logged name is always blank.
                # The matching sequence itself is mapped, so sx-1 sequences were skipped.
                self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (self.str['StartFrom'],sx - 1))
                self.str['StartFrom'] = ''
            ## Check if in Mapped ##
            if search in self.list['Mapped']:
                resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'}
                self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas']))
                rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
                continue
            ### Map Sequence ###
            self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx)))
            mapname = self.mapSeq(seqlist,blast,search)
            if returndict: mapdict[search] = mapname

        ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#MAP','Mapping of %s (%s seqs) complete.' % (self.str['SeqIn'],rje.integerString(seqx)))
        # The BLAST results file is deleted unless running in debug or test mode.
        if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()):
            os.unlink(blast.str['Name'])     #!# Add option to keep BLAST! #!#
        if returndict: return mapdict
    except:
        self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True)
        raise
def seqinObj(self, summarise=True, gapstats=True):  ### Returns the SeqList object for the SeqIn file
    '''
    Returns the SeqList object for the SeqIn file, creating and storing it if self.obj['SeqIn'] is not yet set.
    >> summarise:bool [True] = Whether to add 'summarise=T', 'dna=T' and 'raw=F' to the SeqList commands.
    >> gapstats:bool [True] = Whether to add the 'gapstats' command. This is only applied if self.force() is
        true or no '<seqin basename>.gaps.tdt' file exists.
    :return: self.obj['SeqIn']
    :raise ValueError: logged and re-raised. Any other exception is logged and swallowed, in which case the
        current value of self.obj['SeqIn'] is still returned.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqbase = rje.baseFile(self.getStr('SeqIn'), strip_path=True)
        if not self.obj['SeqIn']:
            # Work on a COPY: "seqcmd = self.cmd_list" followed by "+=" would extend self.cmd_list in place
            # and leak the summarise/gapstats options into every object later built from self.cmd_list.
            seqcmd = self.cmd_list[:]
            if summarise:
                seqcmd += ['summarise=T', 'dna=T', 'raw=F']
            gapstats = gapstats and (self.force() or not rje.exists('%s.gaps.tdt' % seqbase))
            if gapstats:
                seqcmd += ['gapstats']
            self.obj['SeqIn'] = rje_seqlist.SeqList(self.log, seqcmd + ['autoload=T', 'seqmode=file', 'autofilter=F'])
            # NOTE: a check that rejected sequence names containing pipe "|" characters (raising ValueError)
            # used to run here and is currently disabled; the ValueError handler below is kept for it.
    except ValueError:
        self.printLog('\r#CHECK', 'Checking sequences names aborted.')
        self.errorLog('DepthCharge input sequence error')
        raise
    except:
        self.errorLog('DepthCharge.seqinObj() error')
    return self.obj['SeqIn']
def batchSummarise(self):   ### Batch run seqlist summarise on batchrun=LIST files and output table of results
    '''
    Runs SeqList.summarise() on every file in self.list['BatchRun'] and saves the combined 'summarise' table.
    Unless self.force() is set, an existing 'summarise' table is loaded first and added to.
    << returns True when the table has been saved; False if any error was caught (and logged).
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        batchfiles = self.list['BatchRun']
        if not batchfiles:
            raise ValueError('Need to provide batchrun=LIST files for summarise mode.')
        db = rje_db.Database(self.log, self.cmd_list)
        self.printLog('#BASE', db.baseFile())
        sdb = None
        if not self.force():
            sdb = db.addTable(mainkeys=['File'], name='summarise', expect=False)
        if not sdb:
            sdb = db.addEmptyTable('summarise', ['File'], ['File'])
        # Preferred column order; any other summary fields are appended after these.
        fieldorder = 'SeqNum, TotLength, MinLength, MaxLength, MeanLength, MedLength, N50Length, L50Count, GapLength, GapPC, GCPC'.split(', ')

        ### ~ [2] Run Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#BATCH', 'Batch summarising %s input files' % rje.iLen(batchfiles))
        for seqfile in batchfiles:
            seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'autoload=T', 'summarise=F']
            seqdata = rje_seqlist.SeqList(self.log, seqcmd).summarise()
            if not seqdata:
                self.errorLog('Summarise failed for %s' % seqfile, printerror=False)
                continue
            ## ~ [2a] Tidy the numbers for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if 'GC' in seqdata:
                seqdata.pop('GC')
                seqdata['GCPC'] = '%.2f' % seqdata['GCPC']
            if 'GapLength' in seqdata:
                gappc = 100.0 * seqdata['GapLength'] / seqdata['TotLength']
                seqdata['GapPC'] = '%.2f' % gappc
            seqdata['MeanLength'] = '%.1f' % seqdata['MeanLength']
            ## ~ [2b] Make sure the table has every field, then add the entry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for field in fieldorder + list(seqdata.keys()):
                if field in seqdata and field not in sdb.fields():
                    sdb.addField(field)
            sdb.addEntry(seqdata)

        ### ~ [3] Output Summarise ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sdb.saveToFile()
        return True
    except:
        self.errorLog('%s.batchSummarise error' % self)
        return False
def inSilicoHybrid(self):   ### Filter and combine subreads from parent and output to fasta file.
    '''
    Filter and combine subreads from parent and output to fasta file.

    This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each
    parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing.
    Each parent needs a `*.fofn` file of subread file names, `*.unique.tdt` unique subreads table and `*.smrt.tdt`
    SMRT cell identifier table.)

    A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking
    the unique subreads from each parent by length. First, the longest subread from each parent are compared and the
    shortest selected to be the first subread of the diploid. (The shortest is taken to minimise length differences
    between the two parents.) Next, the longest subread from the next parent that is no longer than the previous
    subread is added. This cycles, picking a read from the parent with fewest cumulative bases each cycle. The
    longest subread that is no longer than the previous subread is selected. This continues until one parent runs
    out of subreads. Additional subreads will be added from the other parent if they reduce the difference in
    cumulative output for each parent.

    Final output will be a `*.subreads.fasta` file in which each parent has a similar total sequence content and for
    which the subread length distributions should also be similar. This is to overcome biases in resulting diploid
    assemblies, where one parent has higher quality data than the other.

    NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has
    much higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to reads
    with a minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may
    need to be relaxed.

    << returns True if the filtered fasta file was written and summarised; False if any error was caught (logged).
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#', '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #')
        self.printLog('#FOFN', 'Parent1: %s' % self.getStr('Parent1'))
        base1 = rje.baseFile(self.getStr('Parent1'))
        parent1 = smrtscape.SMRTSCAPE(self.log, ['genomesize=13.1e6'] + self.cmd_list + ['batch=%s' % self.getStr('Parent1'), 'basefile=%s' % base1])
        parent1.setup()
        udb1 = parent1.udb()
        cdb = parent1.db('smrt', add=True, mainkeys=['Name'])
        cdb.dataFormat({'SMRT': 'int'})
        cx = cdb.entryNum()
        ## ~ [0b] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#', '# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #')
        self.printLog('#FOFN', 'Parent2: %s' % self.getStr('Parent2'))
        base2 = rje.baseFile(self.getStr('Parent2'))
        parent2 = smrtscape.SMRTSCAPE(self.log, ['genomesize=13.1e6'] + self.cmd_list + ['batch=%s' % self.getStr('Parent2'), 'basefile=%s' % base2])
        parent2.setup()
        udb2 = parent2.udb()
        cdb2 = parent2.db('smrt', add=True, mainkeys=['Name'])
        cdb2.dataFormat({'SMRT': 'int'})
        # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1
        for entry in cdb2.entries() + udb2.entries():
            entry['SMRT'] = entry['SMRT'] + cx
        cdb = parent1.db().mergeTables(cdb, cdb2)
        ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.printLog('#~~#', '# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #')
        minlen = self.getInt('LenFilter')
        minrq = self.getNum('RQFilter')
        rqstr = '%s' % minrq
        filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(), minlen, rqstr[2:])    # rqstr[2:] drops the leading "0."
        ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqbatch = []   # List of SeqList objects
        batchfiles = parent1.list['Batch'] + parent2.list['Batch']
        self.printLog('#BATCH', '%s sequence files to process.' % rje.iLen(batchfiles))
        for seqfile in batchfiles:
            seqcmd = self.cmd_list + ['seqmode=file', 'autoload=T', 'summarise=F', 'seqin=%s' % seqfile, 'autofilter=F']
            seqbatch.append(rje_seqlist.SeqList(self.log, seqcmd))
        self.printLog('#BATCH', '%s sequence files to summarise.' % rje.iLen(seqbatch))
        if not seqbatch:
            raise IOError('No batch input fasta files found! Make sure parentN=FILE settings given *.fofn.')
        ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        elists = [udb1.sortedEntries('Len', reverse=True), udb2.sortedEntries('Len', reverse=True)]
        plen = [0, 0]   # Summed lengths for each parent
        pseq = [0, 0]   # Total sequence number for each parent
        prq = [0, 0]    # Total sequence RQ for each parent (convert to mean)
        if not elists[0] or not elists[1]:
            raise ValueError('No Unique ZMW subreads for one or both parents!')
        lastlen = max(elists[0][0]['Len'], elists[1][0]['Len'])     # Length of last selected read
        for elist in elists:
            while elist and elist[0]['RQ'] < minrq: elist.pop(0)
        if not elists[0] or not elists[1]:
            raise ValueError('No Unique ZMW subreads for one or both parents!')
        nextp = 0       # Index of next parent to use
        if elists[0][0]['Len'] < elists[1][0]['Len']: nextp = 1

        ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        zmwlist = []    # List of (smrt,zmw,pos) meeting filtering criteria
        ux = 0.0
        utot = len(elists[0]) + len(elists[1])
        while lastlen:
            self.progLog('\r#DIP', 'Diploidising subreads: %.2f%%' % (ux / utot))
            elist = elists[nextp]
            while elist and elist[0]['RQ'] < minrq:
                elist.pop(0)
                ux += 100.0
            # Lists are length-sorted (longest first): once the head is too short, so is everything after it.
            if elist and elist[0]['Len'] < minlen:
                ux += 100.0 * len(elist)
                elist = []
            if not elist:
                nextp = 1 - nextp
                break   # Finish
            entry = elist.pop(0)
            ux += 100.0
            zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
            plen[nextp] += entry['Len']
            prq[nextp] += entry['RQ']
            pseq[nextp] += 1
            if plen[1 - nextp] <= plen[nextp]: nextp = 1 - nextp   # Switch to the parent with fewer bases
            lastlen = entry['Len']
        ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Keep adding reads from the remaining parent while each one narrows the gap in cumulative bases.
        # Every read is popped exactly once, so this always terminates, and the RQ filter is applied to every
        # read rather than only to the head of the list.
        elist = elists[nextp]
        while elist:
            self.progLog('\r#DIP', 'Diploidising subreads: %.2f%%' % (ux / utot))
            entry = elist.pop(0)
            ux += 100.0
            if entry['RQ'] < minrq: continue
            if entry['Len'] < minlen: break     # Length-sorted: nothing left is long enough
            pdiff = rje.modulus(plen[0] - plen[1])
            ediff = rje.modulus(plen[nextp] + entry['Len'] - plen[1 - nextp])
            if ediff >= pdiff: break            # Adding this read would not improve the balance: finish!
            zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos']))
            plen[nextp] += entry['Len']
            prq[nextp] += entry['RQ']
            pseq[nextp] += 1
        elists[nextp] = []
        self.printLog('\r#DIP', 'Diploidising subreads complete: %s subreads to output.' % rje.iLen(zmwlist))
        if not pseq[0] or not pseq[1]:
            # Explicit error rather than a ZeroDivisionError from the mean RQ calculation below.
            raise ValueError('No subreads selected for one or both parents after length/RQ filtering!')
        self.printLog('\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' % (self.getStr('Parent1'), rje.iStr(pseq[0]), rje.iStr(plen[0]), 1.0 * plen[0] / self.getInt('GenomeSize'), prq[0] / pseq[0]))
        self.printLog('\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' % (self.getStr('Parent2'), rje.iStr(pseq[1]), rje.iStr(plen[1]), 1.0 * plen[1] / self.getInt('GenomeSize'), prq[1] / pseq[1]))
        ## ~ [1c] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        rje.backup(self, filtfile)
        zmwset = set(zmwlist)   # Constant-time membership test for each input subread
        sx = 0.0
        sn = len(seqbatch)
        fx = 0
        with open(filtfile, 'w') as SEQOUT:     # Guarantees the output is flushed and closed, even on error
            for seqlist in seqbatch:
                seqnum = seqlist.seqNum()
                if not seqnum: continue         # Empty input file: nothing to extract (and avoids 100.0/0)
                si = 100.0 / seqnum
                for seq in seqlist.seqs():
                    self.progLog('\r#OUT', 'Extracting subreads: %.2f%%' % (sx / sn))
                    sx += si
                    (name, sequence) = seqlist.getSeq(seq)
                    #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784
                    namedata = name.replace('/', ' ').split()
                    if len(namedata) == 4: namedata = namedata[:3]      # Drop the optional trailing RQ=X
                    (smrt, zmw, pos) = namedata                         # ValueError if the name is malformed
                    if (cdb.data(smrt)['SMRT'], int(zmw), pos) not in zmwset: continue
                    SEQOUT.write('>%s\n%s\n' % (name, sequence))
                    fx += 1
        self.printLog('\r#OUT', 'Saved %s filtered subreads to %s.' % (rje.iStr(fx), filtfile))

        ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqcmd = self.cmd_list + ['seqmode=file', 'autoload=T', 'summarise=T', 'seqin=%s' % filtfile, 'autofilter=F']
        rje_seqlist.SeqList(self.log, seqcmd)
        return True
    except:
        self.errorLog('%s.run error' % self.prog())
        return False
def alignmentToLocal(self,alignment=[],protqry=False):    ### Converts alignment into local hits table
    '''
    Converts alignment into local hits table.

    Each parsed alignment becomes one entry of a new 'local' table, which is then split at every
    ">>>> Target Intron N >>>>" marker into one entry per alignment block. Combined hit sequences are added to
    self.obj['DNAHits'] and their translations to self.obj['ProtHits']. A reduced table from
    self.reduceLocal(byqry=True) is renamed 'unique'.

    >> alignment:list of alignment text strings parsed from exonerate output. (Copied, never modified.)
    >> protqry:bool[False] = Whether query is protein
    << returns local database table.
    Raises ValueError for truncated or inconsistent alignments; all exceptions are logged and re-raised.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        vfields = ['Qry','Hit','AlnID','Score','Expect','Length','Identity','Positives','QryStart','QryEnd','HitStart','HitEnd','QrySeq','HitSeq','AlnSeq','Rank','Phase','HitStrand']
        vdb = self.db().addEmptyTable('local',vfields,['Qry','Hit','AlnID'])

        ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Example of the exonerate output being parsed (column layout restored by hand):
        #
        #         Query: FAXD1_NOTSC (P82807) Venom prothrombin activator notecarin-D1 [Notechis scutatus scutatus]
        #        Target: ahap_PSETE__EBS10XV2AHAP187 haploidB edges=694320..157489 left=833615 right=281503 ver=1.9 style=4:[revcomp]
        #         Model: protein2genome:local
        #     Raw score: 1170
        #   Query range: 19 -> 295
        #  Target range: 12312786 -> 12307250
        #
        #        20 : AlaGluSerAsnValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg :       37
        #             ..!...|||   ||||||||||||||||||||||||||||||||||||||||||
        #             CysSerSerLeuValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg
        #  12312786 : TGTTCTTCTTTAGTATTCTTAAAAAGCAAAGTGGCAAATAGATTTTTGCAAAGA : 12312735
        #
        #       264 : {G}  >>>> Target Intron 7 >>>>  {ly}GluIleAspIleSerArg :      270
        #             {|}           1304 bp           {||}|||||||||||||||!!!
        #             {G}++                         ++{ly}GluIleAspIleSerSer
        #  12308652 : {G}gt.........................ag{GG}GAAATAGACATATCAAGC : 12307328
        #
        #       289 : ValProProAsnTyrTyrTyr :      295
        #             |||||| !!!..||| !!|||
        #             ValProAlaThrTyrAspTyr
        #  12307273 : GTTCCTGCCACGTATGACTAT : 12307251
        alnpattern = r'^\s+(\d+) : (.+) :\s+(\d+)'     # "  <start> : <sequence> :   <end>" coordinate lines
        truncated = 'Partial alignment! Truncated output?'
        alncount = {}   # {(Qry,Hit):number of alignments parsed so far}
        ventry = {}
        parsing = alignment[0:]     # Work on a copy so the caller's list is not consumed
        rank = 1
        while parsing:
            line = parsing.pop(0)
            # Query
            qmatch = rje.matchExp(r'Query: (\S+)',line)
            if qmatch:
                if ventry: vdb.addEntry(ventry)
                ventry = {'Qry':qmatch[0],'QrySeq':'','HitSeq':'','AlnSeq':'','Rank':rank}
                rank += 1
            # Hit
            hmatch = rje.matchExp(r'Target: (\S+)',line)
            if hmatch:
                ventry['Hit'] = hmatch[0]
                qh = (ventry['Qry'],ventry['Hit'])
                if qh in alncount: alncount[qh] += 1
                else: alncount[qh] = 1
                ventry['AlnID'] = alncount[qh]
            # Score
            smatch = rje.matchExp(r'core: (\S+)',line)
            if smatch:
                ventry['Score'] = int(smatch[0])
            # Alignment
            adata = rje.matchExp(alnpattern,line)
            if adata:
                # Query line of an alignment block
                start = int(adata[0])
                end = int(adata[2])
                aln = adata[1]
                x = line.find(aln)      # Column at which the aligned text starts
                if 'QryStart' not in ventry: ventry['QryStart'] = start
                ventry['QryEnd'] = end
                ventry['QrySeq'] += aln
                # The match line and at least one more line must follow; otherwise the output was cut short.
                if len(parsing) < 2: raise ValueError(truncated)
                line = parsing.pop(0)
                ventry['AlnSeq'] += line[x:x+len(aln)]
                # The next line is the translation for protein2genome output; the coordinate line then follows.
                adata = rje.matchExp(alnpattern,parsing.pop(0))
                if not adata:
                    if not parsing: raise ValueError(truncated)
                    adata = rje.matchExp(alnpattern,parsing.pop(0))
                if not adata: raise ValueError(truncated)
                start = int(adata[0])
                end = int(adata[2])
                aln = adata[1]
                if 'HitStart' not in ventry: ventry['HitStart'] = start
                ventry['HitEnd'] = end
                ventry['HitSeq'] += aln
        if ventry: vdb.addEntry(ventry)
        ## Seq Check
        for ventry in vdb.entries():
            if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                self.debug(ventry)
                raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))

        ### ~ [3] Split on introns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DNAHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F','dna=T'])
        self.obj['ProtHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F'])
        #i# Protein Position Conversion
        if protqry:
            for ventry in vdb.entries():
                # 1->1, 2->4, 3->7 = 1+3*(n-1)
                ventry['QryStart'] = 1+3*(ventry['QryStart']-1)
                if ventry['QrySeq'].startswith('{'):
                    codend = ventry['QrySeq'].find('}')
                    # {X} = phase 2, find = 2
                    if codend == 2: ventry['QryStart'] += 2
                    # {XX} = phase 1, find = 3
                    elif codend == 3: ventry['QryStart'] += 1
                    else: raise ValueError('QrySeq {} bracket mismatch!: %s' % ventry)
                ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - ventry['QrySeq'].count('-') - 1
        vdb.newKey(['Qry','Rank','Hit','AlnID'])
        for vkey in vdb.dataKeys():
            ventry = vdb.data(vkey)
            #i# Make a combined hitseq to output to fasta
            #># phap_PSETE__EBS10XV2PHAP187.FAXD1_NOTSC.XXX
            hitname = '%s.ex%s %s %s-%s' % (ventry['Qry'],ventry['Rank'],ventry['Hit'],rje.iStr(ventry['HitStart']),rje.iStr(ventry['HitEnd']))
            hitseq = ''
            phase = (ventry['QryStart'] + 2) % 3
            blockx = 1              # Number of the current alignment block within this hit
            vkeyentries = [ventry]
            dirn = 1                # Direction of hit coordinates: -1 when the hit runs backwards
            if ventry['HitEnd'] < ventry['HitStart']:
                dirn = -1
                ventry['HitStrand'] = '-'
            else: ventry['HitStrand'] = '+'
            for seq in ['HitSeq','QrySeq','AlnSeq']:
                ventry[seq] = ventry[seq].replace('}','')
                ventry[seq] = ventry[seq].replace('{','')
            intronmatch = rje.matchExp(r'(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq'])
            while intronmatch:
                intron = intronmatch[0]
                x = ventry['QrySeq'].find(intron)
                y = x + len(intron)
                intronlen = int(rje.matchExp(r'(\d+) bp',ventry['AlnSeq'][x:y])[0])
                #i# Create a new entry of the first exon
                newentry = rje.combineDict({},ventry)
                for seq in ['HitSeq','QrySeq','AlnSeq']: newentry[seq] = newentry[seq][:x]
                newentry['AlnID'] = '%s.%d' % (ventry['AlnID'],blockx)
                blockx += 1
                newentry['QryEnd'] = newentry['QryStart'] + len(newentry['QrySeq']) - newentry['QrySeq'].count('-') - 1
                newentry['HitEnd'] = newentry['HitStart'] + (len(newentry['HitSeq']) - newentry['HitSeq'].count('-') - 1) * dirn
                newentry['Length'] = x
                newentry['Identity'] = newentry['AlnSeq'].count('|')
                vkeyentries.append(vdb.addEntry(newentry))
                hitseq += newentry['HitSeq']
                #i# Update ventry to be the rest of the hit
                for seq in ['HitSeq','QrySeq','AlnSeq']: ventry[seq] = ventry[seq][y:]
                ventry['QryStart'] = newentry['QryEnd'] + 1
                if protqry: ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - ventry['QrySeq'].count('-') - 1
                ventry['HitStart'] = newentry['HitEnd'] + intronlen * dirn
                intronmatch = rje.matchExp(r'(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq'])
            #i# Calculate length and identity of final exon
            ventry['AlnID'] = '%s.%d' % (ventry['AlnID'],blockx)
            ventry['Length'] = len(ventry['AlnSeq'])
            ventry['Identity'] = ventry['AlnSeq'].count('|')
            #i# Add sequence hits
            hitname += ' (%d alignment blocks)' % blockx
            hitseq += ventry['HitSeq']
            hitseq = hitseq.replace('-','')
            protseq = rje_sequence.dna2prot('%s%s' % ('N' * phase,hitseq))
            self.obj['ProtHits']._addSeq(hitname,protseq)
            if ventry['HitStart'] > ventry['HitEnd']: hitseq = rje_sequence.reverseComplement(hitseq)
            self.obj['DNAHits']._addSeq(hitname,hitseq)
            #i# Update AlnID for proper float sorting
            for ventry in vkeyentries:
                (vcore,vx) = ventry['AlnID'].split('.')
                ventry['AlnID'] = '%s.%s' % (vcore,rje.preZero(int(vx),blockx))
        vdb.dataFormat({'AlnID':'string'})
        vdb.remakeKeys()
        self.debug(vdb.dataKeys())
        ## Seq Check
        for ventry in vdb.entries():
            if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                self.debug(ventry)
                raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))
        udb = self.reduceLocal(byqry=True)
        udb.rename('unique')
        udb.newKey(['Qry','Rank','Hit','AlnID'])
        self.debug(vdb.dataKeys())
        #i# Calculate exon phase
        for ventry in vdb.entries() + udb.entries(): ventry['Phase'] = (ventry['QryStart'] - 1) % 3
        #i# Protein Position Conversion
        # Floor division (//) keeps positions as integers whether or not "/" is true division.
        if protqry:
            for ventry in vdb.entries():
                ventry['QryStart'] = (ventry['QryStart']+2)//3
                ventry['QryEnd'] = (ventry['QryEnd']+2)//3
            for ventry in udb.entries():
                ventry['QryStart'] = (ventry['QryStart']+2)//3
                ventry['QryEnd'] = (ventry['QryEnd']+2)//3
        return vdb
    except:
        self.errorLog('%s.alignmentToLocal error' % self.prog())
        raise
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.

    Looks up the program named by the 'Name' setting, builds its command list and creates the matching object in
    self.obj['Prog']. If the name is not recognised, the user may be shown help and/or asked for another name,
    in which case setup is re-run.
    << returns self.obj['Prog'] if a program object was made; False on failure, quit or CTRL+C.
    '''
    try:
        ### ~ [1] ~ Setup Program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['Prog'] = None
        prog = self.getStrLC('Name')
        if prog in mod:
            info = mod[prog].makeInfo()
            self.obj['ProgInfo'] = info
            self.printLog('#PROG', '%s V%s: %s' % (info.program, info.version, info.description))
            progcmd = rje.getCmdList([], info=info) + self.cmd_list + ['newlog=F']
            out = rje.Out(cmd_list=progcmd)
            out.printIntro(info)
            if self.getBool('Help'):
                progcmd = mod[prog].cmdHelp(info, out, ['help'] + progcmd)
            nopath = self.getStrLC('Rest') and not self.dev()
            tidycmd = rje.tidyArgs(progcmd, nopath=nopath, purgelist=purgelist)
            self.printLog('#CMD', 'Full %s CmdList: %s' % (info.program, rje.argString(tidycmd)), screen=False)
            ## ~ [1a] ~ Make self.obj['Prog'] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # (program names, class getter) pairs. The getters are lambdas so that a module attribute is only
            # looked up for the program actually selected.
            progclasses = (
                (('seqlist', 'rje_seqlist'), lambda: rje_seqlist.SeqList),
                (('uniprot', 'rje_uniprot'), lambda: rje_uniprot.UniProt),
                (('taxonomy', 'rje_taxonomy'), lambda: rje_taxonomy.Taxonomy),
                (('tree', 'rje_tree'), lambda: rje_tree.Tree),
                (('xref', 'rje_xref'), lambda: rje_xref.XRef),
                (('seq', 'rje_seq'), lambda: rje_seq.SeqList),
                (('mitab', 'rje_mitab'), lambda: rje_mitab.MITAB),
                (('dbase', 'database'), lambda: rje_dbase.DatabaseController),
                (('pydocs',), lambda: rje_pydocs.PyDoc),
                (('ensembl', 'rje_ensembl'), lambda: rje_ensembl.EnsEMBL),
                (('genbank', 'rje_genbank'), lambda: rje_genbank.GenBank),
                (('extatic',), lambda: extatic.ExTATIC),
                (('revert',), lambda: revert.REVERT),
                (('fiesta',), lambda: fiesta.FIESTA),
                (('gablam',), lambda: gablam.GABLAM),
                (('gopher',), lambda: gopher.Gopher),
                (('haqesac',), lambda: haqesac.HAQESAC),
                (('multihaq',), lambda: multihaq.MultiHAQ),
                (('pingu',), lambda: pingu.PINGU),
                (('pacbio',), lambda: rje_pacbio.PacBio),
                (('rje_zen', 'zen'), lambda: rje_zen.Zen),
            )
            for (prognames, getclass) in progclasses:
                if prog in prognames:
                    self.obj['Prog'] = getclass()(self.log, progcmd)
                    break
        if self.obj['Prog']:
            return self.obj['Prog']     # Setup successful

        ### ~ [2] ~ Failure to recognise program ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#ERR', 'Program "%s" not recognised.' % self.getStr('Name'))
        if self.i() < 0:
            return False
        if rje.yesNo('Show SeqSuite help with program options?'):
            extracmd = cmdHelp(cmd_list=['help'])[1:]
            if extracmd:
                self.cmd_list += extracmd
                self._cmdList()
                # Extra commands entered at the help prompt may have changed the program name
                if prog != self.getStrLC('Name'):
                    return self.setup()
        self.setStr({'Name': rje.choice('Give program name (Blank or CTRL+C to quit)')})
        if not self.getStrLC('Name'):
            return False
        return self.setup()
    except KeyboardInterrupt:
        return False
    except SystemExit:
        raise
    except:
        self.errorLog('Problem during %s setup.' % self.prog())
        return False    # Setup failed
def makeHTML(self):     ### Generates HTML pages for interactive navigation.
    '''
    Generates HTML pages for interactive navigation.

    Writes a front page, <basefile>.html, with tabs for the target sequences ('Hits'), the GABLAM table (if
    <basefile>.gablam.tdt was loaded) and the candidate sequences. Then writes one page per hit into the
    ./<basefile>.HAQESAC/ directory, with a tab for each png/tree.txt/fas/nwk/log output file found for that hit.
    Returns nothing; any error is logged with self.errorLog() and not raised.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        basefile = self.basefile()
        scmd = self.cmd_list + ['seqin=%s' % self.getStr('Candidates'), 'autoload=T', 'autofilter=F', 'seqmode=file']
        candseq = rje_seqlist.SeqList(self.log, scmd)
        # All files and directories are named after basefile:
        # *.fas = original target PROTEIN sequences (with original descriptions)
        scmd = self.cmd_list + ['seqin=%s' % self.getStr('SeqIn'), 'autoload=T', 'autofilter=F', 'seqmode=file']
        seqlist = rje_seqlist.SeqList(self.log, scmd)
        # *.gablam.tdt = GABLAM results with match details. (Might have *.hmmer.tdt instead.)
        # - Contains candidate proteins as Queries and Target proteins as hits
        gablamfile = '%s.gablam.tdt' % basefile
        gdb = self.db().addTable(gablamfile, mainkeys=['Qry', 'Hit'], name='gablam', expect=False)
        # *.HAQESAC/ = directory containing individual HAQESAC runs, named after Hit accnum
        haqdir = rje.makePath('./%s.HAQESAC/' % basefile)

        ### ~ [2] Generate front page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hfile = '%s.html' % basefile
        hobj = self.obj['HTML']
        hobj.list['StyleSheets'] = ['http://www.slimsuite.unsw.edu.au/stylesheets/rje_tabber.css',
                                    'http://www.slimsuite.unsw.edu.au/stylesheets/slimhtml.css']
        html = hobj.htmlHead(basefile)
        html += '<h1>%s</h1>\n\n' % basefile
        htabs = []  # (tab_id, tab_html_text[, tab_title])
        ## ~ [2a] Target protein list (with links to HAQ HTML) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        rows = ['\t'.join(['Name', 'Descripton', 'Length'])]
        seqdict = seqlist.makeSeqNameDic('short')
        if gdb: hitlist = gdb.indexKeys('Hit')
        else: hitlist = rje.sortKeys(seqdict)
        for name in hitlist:
            seq = seqdict[name]
            cseq = [name, seqlist.seqDesc(seq), '%s aa' % seqlist.seqLen(seq)]
            acc = seqlist.seqAcc(seq)
            # Only link hits that have a HAQESAC log file
            if os.path.exists('%s%s.log' % (haqdir, acc)):
                cseq[0] = '<a href="%s%s.html">%s</a>' % (haqdir, acc, cseq[0])
            rows.append('\t'.join(cseq))
        ctext = '\n'.join(rows) + '\n'
        htabs.append(('Hits', rje_html.tableToHTML(ctext, '\t', tabid='parse'), 'Target sequences hit by candidates.'))
        ## ~ [2b] GABLAM/HMM table (with above links) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if gdb:
            rows = ['\t'.join(gdb.fields())]
            with open(gablamfile, 'r') as GABLAM:
                glines = GABLAM.readlines()[1:]     # Skip the header line
            for gline in glines:
                # Strip the line ending first, so it neither stays in the last column nor doubles the newline
                gdata = gline.rstrip('\r\n').split('\t')
                acc = gdata[0].split('__')[-1]
                gdata[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc, gdata[0])
                acc = gdata[1].split('__')[-1]
                gdata[1] = '<a href="%s%s.html">%s</a>' % (haqdir, acc, gdata[1])
                rows.append('\t'.join(gdata))
            ctext = '\n'.join(rows) + '\n'
            htabs.append(('GABLAM', rje_html.tableToHTML(ctext, '\t', tabid='parse'), 'GABLAM hit table.'))
        ## ~ [2c] Candidate list (with DB links) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if candseq.seqNum():
            rows = ['\t'.join(['AccNum', 'ID', 'Descripton', 'Length'])]
            accdict = candseq.makeSeqNameDic('accnum')
            for acc in rje.sortKeys(accdict):
                seq = accdict[acc]
                cseq = [acc, candseq.seqID(seq), candseq.seqDesc(seq), '%s aa' % candseq.seqLen(seq)]
                cseq[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc, acc)
                rows.append('\t'.join(cseq))
            ctext = '\n'.join(rows) + '\n'
            htabs.append(('Candidates', rje_html.tableToHTML(ctext, '\t', tabid='parse'), 'Candidate sequences to search.'))
        html += hobj.tabberHTML('GABLAM', htabs)
        html += hobj.htmlTail()
        with open(hfile, 'w') as HTML:
            HTML.write(html)

        ### ~ [3] Generate sequence-specific pages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #?# Move this to HAQESAC or MultiHAQ
        for i in range(len(hitlist)):
            hit = hitlist[i].split('__')[-1]
            logfile = '%s%s.log' % (haqdir, hit)
            seqbase = logfile[:-4]
            hfile = '%s.html' % seqbase
            html = hobj.htmlHead(seqbase)
            html += '<h1>%s</h1>\n\n' % seqbase
            html += '<p>Click <a href="../%s.html">here</a> to return to results summary. \n' % basefile
            if i:
                html += 'Previous: <a href="./%s.html"><code>%s</code></a>. \n' % (hitlist[i - 1].split('__')[-1], hitlist[i - 1])
            if i < len(hitlist) - 1:
                html += 'Next: <a href="./%s.html"><code>%s</code></a>. \n' % (hitlist[i + 1].split('__')[-1], hitlist[i + 1])
            html += '</p>\n'
            htabs = []  # (tab_id, tab_html_text[, tab_title])
            for ftype in ['png', 'tree.txt', 'fas', 'nwk', 'log']:
                seqfile = '%s.%s' % (seqbase, ftype)
                if not os.path.exists(seqfile): continue
                filename = os.path.basename(seqfile)
                tabtext = '<p><a href="./%s">./%s</a></p>\n' % (filename, filename)
                if ftype == 'png':
                    tabtext += '<a href="./%s"><img src="%s" width="100%%"></a>\n' % (filename, filename)
                    tabdesc = 'PNG of %s tree.' % seqbase
                else:
                    with open(seqfile, 'r') as SEQFILE:
                        tabtext += '<pre>%s</pre>\n' % SEQFILE.read()
                    if ftype == 'tree.txt':
                        # Link the other hits to their own pages first...
                        for xref in hitlist:
                            reptext = '<a href="./%s.html">%s</a>' % (xref.split('__')[-1], xref)
                            tabtext = tabtext.replace(': %s ' % xref, ': %s ' % reptext)
                        # ...then link every remaining ID_SPEC__ACC style name to UniProt.
                        namedata = rje.matchExp(r'(: (\S+)_(\S+)__(\S+) )', tabtext)
                        while namedata:
                            (oldtext, sid, spec, spacc) = namedata
                            newtext = ': %s_<a href="http://www.uniprot.org/taxonomy/?query=%s&sort=score" target="_blank">%s</a>__<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a> ' % (sid, spec, spec, spacc, spacc)
                            tabtext = tabtext.replace(oldtext, newtext)
                            namedata = rje.matchExp(r'(: (\S+)_(\S+)__(\S+) )', tabtext)
                    tabdesc = '%s output' % seqfile
                htabs.append((ftype, tabtext, tabdesc))
            if htabs: html += hobj.tabberHTML(os.path.basename(seqbase), htabs)
            else: html += '<p><i>No output found for <code>%s</code>!</i></p>\n' % hit
            html += hobj.htmlTail()
            with open(hfile, 'w') as HTML:
                HTML.write(html)
    except:
        self.errorLog('Problem with %s.makeHTML()' % self.prog())
def seqSubset2(self):  ### Extracts sequence subset from MOUSE cDNA and Peptide libraries
    '''
    Extracts sequence subset from MOUSE cDNA and Peptide libraries.

    Three stages:
    1. Load the "<basefile>.map.tdt" gene map if it exists; otherwise build it by mapping the upper-cased
       'Gene' keys of the 'starts' table through rje_genemap.GeneMap onto the MGI 'xref' table and its
       'EnsEMBL' field, extracting matching sequences into Ingolia.cdna.all.fa / Ingolia.pep.all.fa.
    2. If the map table has no 'ENST' field, add one by scanning Ingolia.cdna.all.fa for "gene:<EnsEMBL>"
       tags in the sequence names, then save the map table.
    3. For each 'starts' entry, look for a transcript whose 7 nt window starting 3 nt before
       'Init Codon [nt]' equals 'Init Context [-3 to +4]'. The matching transcript with the longest
       translation (up to the first '*') is written to IngExact.cdna.all.fa / IngExact.pep.all.fa, provided
       the peptide has at least 20 residues. The annotated table is saved as "<basefile>.mapped_exact.tdt".

    Returns None. Any exception is reported through self.errorLog() and is not re-raised.
    '''
    try:
        ### ~ [1] Setup: load existing gene map or build a new one ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mapfile = '%s.map.tdt' % self.baseFile()
        if os.path.exists(mapfile):
            mdb = self.db().addTable(mapfile, mainkeys=['Ingolia'], name='map')
        else:
            ## ~ [1a] Load Mouse Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            xfile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.data.tdt'
            # Was `db.addTable(...)`: `db` is not defined in this method. The table is fetched again
            # below via self.db('xref'), so the return value is not needed here.
            self.db().addTable(xfile, mainkeys=['Gene'], name='xref')
            afile = '../../../../../Databases/DBase_120225/MGI/mousemap.120324.alias.tdt'
            genemap = rje_genemap.GeneMap(self.log, self.cmd_list)
            self.obj['Map'] = genemap
            #self.obj['Map'].loadPickle('../../../../../Databases/DBase_120225/MGI/mousemap.120324.pickle')
            genemap.loadData(['sourcedata=%s' % xfile, 'aliases=%s' % afile])
            # Join/upper/split: gene keys are upper-cased and split on any whitespace they contain.
            ing_genes = ' '.join(self.db('starts').index('Gene').keys()).upper().split()
            ing_map = {}
            for gene in ing_genes:
                ing_map[gene] = genemap.bestMap(gene)
            ing_mgi = rje.sortUnique(ing_map.values())
            self.printLog('#MUSG', '%s Ingolia genes mapped onto %s MGI genes' %
                          (rje.iLen(ing_genes), rje.iLen(ing_mgi)))
            xdb = self.db('xref')
            bad_genes = []
            for gene in ing_mgi[0:]:  # Iterate over a copy: ing_mgi is modified inside the loop
                if gene not in xdb.data():
                    self.printLog('#MAP', 'Cannot map gene "%s" from Ingolia data!' % gene)
                    bad_genes.append(gene)
                    ing_mgi.remove(gene)
            self.printLog('#BAD', 'Failed to map %s genes from Ignolia' % rje.iLen(bad_genes))
            badout = open('ingolia.bad.txt', 'w')
            try:
                badout.write(' '.join(bad_genes))
            finally:
                badout.close()
            ## ~ [1b] EnsEMBL subset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            ing_musg = xdb.dataList(xdb.entryList(ing_mgi), 'EnsEMBL', sortunique=True)
            if '' in ing_musg:
                ing_musg.remove('')
            self.printLog('#MUSG', '%s Ingolia genes mapped onto %s EnsEMBL genes' %
                          (rje.iLen(ing_genes), rje.iLen(ing_musg)))
            if not ing_musg:
                raise ValueError
            self.deBug(ing_musg[:10])
            for stype in ['cdna', 'pep']:
                seqfile = '../MOUSE/Mus_musculus.NCBIM37.66.%s.all.fa' % stype
                seqout = 'Ingolia.%s.all.fa' % stype
                # Was gated on the *input* library existing, which skipped extraction whenever the
                # library was present. Regenerate when forced or when the subset output is missing.
                if self.getBool('Force') or not os.path.exists(seqout):
                    seqcmd = self.cmd_list + [
                        'seqin=%s' % seqfile, 'seqout=%s' % seqout, 'autofilter=T',
                        'autoload=T',  # Was misspelt 'autload=T'
                        'seqmode=file', 'gooddesc=%s' % ','.join(ing_musg)
                    ]
                    rje_seqlist.SeqList(self.log, seqcmd)
            ## ~ [1c] Build map table ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Key was misspelt 'Ignolia', which is not one of the table's fields.
            mdb = self.db().addEmptyTable('map', ['Ingolia', 'Gene', 'EnsEMBL'], ['Ingolia'])
            for gene in ing_map:
                entry = {'Ingolia': gene, 'Gene': ing_map[gene]}
                if entry['Gene'] in bad_genes:
                    entry['EnsEMBL'] = ''
                else:
                    entry['EnsEMBL'] = xdb.data()[ing_map[gene]]['EnsEMBL']
                mdb.addEntry(entry)

        ### ~ [2] Add transcript IDs to gene map ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqfile = 'Ingolia.cdna.all.fa'
        seqcmd = self.cmd_list + ['seqin=%s' % seqfile, 'autofilter=F',
                                  'autoload=T',  # Was misspelt 'autload=T'
                                  'seqmode=file']
        iseq = rje_seqlist.SeqList(self.log, seqcmd)
        if 'ENST' not in mdb.fields():
            mdb.addField('ENST', evalue='')
            while iseq.nextSeq():
                iname = iseq.getSeq()[0]
                genematch = rje.matchExp(r'gene:(\S+)', iname)
                if not genematch:
                    continue  # No gene tag in the sequence name: nothing to attach this transcript to
                transid = iname.split()[0]
                for entry in mdb.indexEntries('EnsEMBL', genematch[0]):
                    if entry['ENST']:
                        entry['ENST'] += ',%s' % transid
                    else:
                        entry['ENST'] = transid
            mdb.saveToFile()

        ### ~ [3] Generate new start sites from Ignolia Harrington data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sdb = self.db('starts')
        sdb.dataFormat({'Init Codon [nt]': 'int'})
        icod = 'Init Codon [nt]'
        icon = 'Init Context [-3 to +4]'
        sdb.info['Name'] = 'mapped_start'
        sdb.addField('ENST')
        sdb.addField('ENSP')
        sdb.addField('ENSI')
        ex = 0.0  # Progress counter: incremented by 100 per entry so that ex / etot is a percentage
        etot = sdb.entryNum()
        sx = 0  # Number of entries written to the output files
        fx = 0  # Number of entries that failed to map
        minpep = 20  # Minimum peptide length for output
        ENST = ENSP = None
        try:
            ENST = open('IngExact.cdna.all.fa', 'w')
            ENSP = open('IngExact.pep.all.fa', 'w')
            for entry in sdb.entries():
                self.progLog('\r#ING', 'Mapping Ignolia Harrington Starts: %.2f%%' % (ex / etot))
                ex += 100.0
                entry[icon] = entry[icon].upper()
                gene = entry['Gene'].upper()
                mentry = mdb.data(gene)
                entry['ENST'] = entry['ENSI'] = ''
                cdnaseq = peptseq = ''
                if not mentry or not mentry['ENST']:
                    fx += 1
                    continue
                mtype = 'fail'
                for trans in mentry['ENST'].split(','):
                    tseq = iseq.getDictSeq(trans, format='tuple')[1]
                    context = tseq[entry[icod] - 3:][:7]
                    self.deBug('%s vs %s' % (context, entry[icon]))
                    if context == entry[icon]:
                        # Translate from the start codon and keep everything before the first stop
                        ipept = rje_sequence.dna2prot(tseq[entry[icod]:]).split('*')[0]
                        self.deBug(ipept)
                        if len(ipept) > len(peptseq):  # Keep the transcript with the longest peptide
                            entry['ENST'] = trans
                            cdnaseq = tseq
                            peptseq = ipept
                            mtype = 'exact'
                if not entry['ENST']:
                    self.printLog('\r#ING', 'Unable to find Harrington start for %s %s (%s)' %
                                  (gene, entry[icod], entry[icon]), screen=False)
                    fx += 1
                    continue
                elif len(peptseq) < minpep:
                    self.printLog('\r#ING', 'Peptide from mapped Harrington start for %s %s (%s) too short!' %
                                  (gene, entry[icod], entry[icon]), screen=False)
                    fx += 1
                    continue
                ingid = rje.preZero(int(ex / 100), etot)  # Entry number (1-based), passed to rje.preZero with etot
                entry['ENSI'] = 'ENSINGT%s' % ingid
                entry['ENSP'] = 'ENSINGP%s' % ingid
                ENST.write('>ENSINGT%s mtype:%s enst:%s gene:%s ingolia:%s mgi:%s\n%s\n' %
                           (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], entry['Gene'],
                            mentry['Gene'], cdnaseq))
                ENSP.write('>ENSINGP%s mtype:%s enst:%s gene:%s transcript:ENSINGT%s ingolia:%s mgi:%s\n%s\n' %
                           (ingid, mtype, entry['ENST'], mentry['EnsEMBL'], ingid, entry['Gene'],
                            mentry['Gene'], peptseq))
                sx += 1
        finally:
            # Close the output files even if mapping fails part-way through
            for handle in (ENST, ENSP):
                if handle:
                    handle.close()
        sdb.saveToFile('%s.mapped_exact.tdt' % self.baseFile())
        self.printLog('\r#ING', 'Output %s Ingolia peptides and transcripts. %s failed.' %
                      (rje.iStr(sx), rje.iStr(fx)))
    except:
        self.errorLog('%s.method error' % self)