def convert(self,filelist=None,outfile=None):   ### Converts scansite output files in FileList to Outfile
    '''
    Converts scansite output files in FileList to Outfile.
    >> filelist:list of scansite output file names [None] = use self.list['FileList']
    >> outfile:str of delimited output file name [None] = use self.info['Name']
    << returns True if conversion performed, False if there were no input files.
    '''
    try:
        ### Setup ###
        _stage = 'Setup'
        # Fix: default was a mutable list literal (filelist=[]); use None sentinel instead.
        if not filelist: filelist = self.list['FileList']
        if not outfile: outfile = self.info['Name']
        if not filelist:
            self.log.errorLog('No scansite files to convert! %s unchanged/not made.' % outfile,printerror=False)
            return False
        delimit = rje.getDelimit(self.cmd_list)
        ext = rje.delimitExt(delimit)
        if ext != outfile[-3:]:     # NOTE(review): assumes a 3-character file extension - confirm
            newfile = outfile[:-3] + ext
            if rje.yesNo('Change file name from %s to %s?' % (outfile,newfile)): outfile = newfile
        self.log.printLog('#OUT','Converting %d file(s), output to %s.' % (len(filelist),outfile))
        ### Output File ###
        _stage = 'Output File'
        if not self.opt['Append'] or not os.path.exists(outfile):   # Create with header
            OUTFILE = open(outfile,'w')
            headers = ['seq_id','enzyme','enz_group','aa','pos','score','percentile','matchseq','sa']
            rje.writeDelimit(OUTFILE,headers,delimit)
        else: OUTFILE = open(outfile,'a')
        ### Conversion ###
        _stage = 'Conversion'
        sx = 0      # Total results across all files
        for infile in filelist:
            if not os.path.exists(infile):
                self.log.errorLog('Input file %s does not exist! :o(' % infile,False,False)
                continue
            fx = 0  # Results from this file
            INFILE = open(infile,'r')
            inline = rje.nextLine(INFILE)
            while inline is not None:
                # Each line matching the scansite regex becomes one delimited output row
                if rje.matchExp(re_scansite,inline):
                    scanlist = rje.matchExp(re_scansite,inline)
                    rje.writeDelimit(OUTFILE,scanlist,delimit)
                    sx += 1
                    fx += 1
                rje.progressPrint(self,sx)
                inline = rje.nextLine(INFILE)
            self.log.printLog('#OUT','%s scansite results from %s. (%s Total.)' % (rje.integerString(fx),infile,rje.integerString(sx)))
            INFILE.close()
        ### End ###
        _stage = 'End'
        OUTFILE.close()
        self.log.printLog('#OUT','%s scansite results output to %s.' % (rje.integerString(sx),outfile))
        return True
    except:
        self.log.errorLog('Error in convert(%s)' % _stage,printerror=True,quitchoice=False)
        raise
def pileUpFDR(self):    ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''
    Calculates FDR-corrected p-values for the parsed pileup difference table.

    Reads '<basefile>.pdiff.tdt' (written by pileUpStats()), collects file positions of all rows with
    p <= 0.05, then writes '<basefile>.fdr.tdt' with an added 'p.FDR' column. An existing FDR file is
    kept unless force=T. Returns None on error; no return value otherwise.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        fdrfile = '%s.fdr.tdt' % self.baseFile()
        if not self.force() and os.path.exists(fdrfile): return     # Keep existing output unless force=T
        sigpval = {}    # pval:[fpos] = file positions of pdiff lines sharing each significant p-value
        npos = 0; nx = 0    # npos = total positions with reference data; NOTE(review): nx appears unused
        for locus in rje.sortKeys(self.dict['RefSeq']):
            # '?' marks reference positions without mapped data; exclude them from the test count
            npos += len(self.dict['RefSeq'][locus]) - self.dict['RefSeq'][locus].count('?')
        ### ~ [1] Parse out stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r')
        headers = string.split(SAMSIG.readline()) + ['p.FDR']
        # Record each line's start offset (fpos) so significant lines can be re-read via seek() below
        fpos = SAMSIG.tell(); fline = SAMSIG.readline(); px = 0
        while fline:
            self.progLog('\r#SIG','Reading Pvalues: %s p <= 0.05...' % rje.iStr(px))
            try: pval = float(string.split(fline)[-1])  # p.Diff is the final column
            except: break   # Assumes a malformed/blank line marks the end of usable data
            if pval <= 0.05:
                if pval not in sigpval: sigpval[pval] = []
                sigpval[pval].append(fpos); px += 1
            fpos = SAMSIG.tell(); fline = SAMSIG.readline()
        self.printLog('\r#SIG','Reading Pvalues complete: %s p <= 0.05.' % rje.iStr(px))
        ### ~ [2] Calculate FDR and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        SAMFDR = open(fdrfile,'w')
        rje.writeDelimit(SAMFDR, headers)
        px = 0; sx = 0.0; stot = len(sigpval)
        for pval in rje.sortKeys(sigpval):  # Ascending p-value order, so px accumulates the rank
            self.progLog('\r#FDR','Calculating FDR: %.2f%%' % (sx/stot)); sx += 100.0
            px += len(sigpval[pval])
            # FDR = p * (no. positions tested) / rank -- Benjamini-Hochberg-style; TODO confirm intent
            if pval: fdr = (pval * npos) / px
            else: fdr = 0.0
            for fpos in sigpval[pval]:
                SAMSIG.seek(fpos)   # Re-read the original pdiff line and append the FDR value
                rje.writeDelimit(SAMFDR,rje.readDelimit(SAMSIG.readline())+[rje.expectString(fdr)])
        SAMSIG.close()
        SAMFDR.close()
        self.printLog('\r#FDR','%s FDR lines output to %s' % (rje.iStr(px),fdrfile))
    except: self.errorLog('%s.pileUpFDR() error' % (self)); return None
def altPAM(self):   ### Alternative PAM matrix construction
    '''
    Alternative PAM matrix construction from a WAG-style substitution file.

    Loads self.info['AltPam'] (line 1 = amino acids, line 2 = codes, line 3 = aa frequencies,
    following lines = substitution probability rows -- TODO confirm exact file format), optionally
    replaces frequencies with counts from self.info['SeqIn'], then scans for the scaling factor s
    giving an expected self-substitution rate closest to 0.99 (PAM1-like) and writes the rescaled
    matrix to self.info['PamOut'].
    '''
    try:
        ### Setup ##
        wlines = self.loadFromFile(self.info['AltPam'])
        if not wlines: raise IOError
        aas = string.split(wlines[0].upper())   # Single-letter amino acid codes
        codes = string.split(wlines[1])         # Row labels for output
        rawfreqs = string.split(wlines[2])      # Background amino acid frequencies
        freq = {}
        for i in range(len(rawfreqs)): freq[aas[i]] = string.atof(rawfreqs[i])
        prob = {}   # Symmetric substitution probabilities keyed 'XY'
        for r in range(3, 22):  # NOTE(review): assumes exactly 19 substitution rows - confirm format
            subs = string.split(wlines[r])
            for i in range(len(subs)):
                prob['%s%s' % (aas[i], aas[r - 2])] = string.atof(subs[i])
                prob['%s%s' % (aas[r - 2], aas[i])] = string.atof(subs[i])
        ### Alternative freqs ###
        if self.info['SeqIn'].lower() not in ['', 'none'] and os.path.exists(self.info['SeqIn']):
            ## Clear freq ##
            freq = {}
            for a in aas: freq[a] = 0.0
            ## Count freq ##
            slines = self.loadFromFile(self.info['SeqIn'])
            for line in slines:
                if line[:1] == '>': continue    # Skip fasta name lines
                for a in aas: freq[a] += string.count(line.upper(), a)
            ## Convert to freq ##
            total = sum(freq.values())
            if total > 0:
                for a in aas: freq[a] = freq[a] / total
                self.log.printLog('#AA', 'Rescaling matrix based on %s aa from %s.' % (rje.integerString(total), self.info['SeqIn']))
        ### Calculate s ###
        s = 0.01            # Upper bound of the scaling factor search
        step = 0.000001     # Search granularity
        solve = True        # NOTE(review): never set False; the loop ends only when s < step
        bests = 1.000000    # Best scaling factor so far
        bestdif = -1        # Best |0.99 - expected self-substitution|; -1 = none yet
        while solve and s >= step:
            ## Scaler ##
            s = s - step
            self.log.printLog('\r#WAG', 'Considering s = %.6f; Best s = %.6f (Dif = %.6f)' % (s, bests, bestdif), log=False, newline=False)
            ## Self Subs ##
            newprobs = rje.scaledict(dict=prob, scale=s)
            toobig = False
            for a in aas:
                # Self-substitution = 1 - sum of scaled substitutions away from a
                newprobs['%s%s' % (a, a)] = 1.0
                for key in prob.keys():
                    if key[0] == a: newprobs['%s%s' % (a, a)] -= newprobs[key]
                    if newprobs['%s%s' % (a, a)] < 0.0:  # Overshot possibility
                        toobig = True
                        break
                if toobig: break
            if toobig: continue     # s too large for a valid matrix: try a smaller value
            #print 'PAM!!',
            ## PAM1 ##
            dsum = 0.0  # Expected self-substitution rate under the background frequencies
            for a in aas: dsum += freq[a] * newprobs['%s%s' % (a, a)]
            dif = 0.99 - dsum
            if dif < 0: dif = -dif
            if dif < bestdif or bestdif < 0:
                bestdif = dif
                bests = s
        ### Output best s ###
        self.log.printLog('\r#WAG', 'Considered all s <= 0.010000; Best s = %.6f (Dif = %.6f)' % (bests, bestdif))
        if self.info['PamOut'].lower() in ['', 'none']: self.info['PamOut'] = self.info['AltPam'] + '.pam'
        self.log.printLog('#PAM', 'Rescaled PAM matrix output to %s' % self.info['PamOut'])
        PAM = open(self.info['PamOut'], 'w')
        rje.writeDelimit(PAM, aas, ' ')
        newprobs = rje.scaledict(dict=prob, scale=bests)    # Rebuild the matrix at the best scaling
        for a in aas:
            newprobs['%s%s' % (a, a)] = 1.0
            for key in prob.keys():
                if key[0] == a: newprobs['%s%s' % (a, a)] -= newprobs[key]
        for i in range(len(aas)):
            out = [codes[i]]
            a = aas[i]
            for b in aas:
                out.append('%.6f' % newprobs['%s%s' % (a, b)])
            rje.writeDelimit(PAM, out, ' ')
        PAM.close()
        self.info['Name'] = self.info['PamOut']     # Downstream PAM processing uses the new file
    except: self.log.errorLog('Major Error with PamCtrl.altPAM().', quitchoice=True)
def convert(self, filelist=None, outfile=None):     ### Converts scansite output files in FileList to Outfile
    '''
    Converts scansite output files in FileList to Outfile.
    >> filelist:list of scansite output file names [None] = use self.list['FileList']
    >> outfile:str of delimited output file name [None] = use self.info['Name']
    << returns True if conversion performed, False if there were no input files.
    '''
    try:
        ### Setup ###
        _stage = 'Setup'
        # Fix: default was a mutable list literal (filelist=[]); use None sentinel instead.
        if not filelist: filelist = self.list['FileList']
        if not outfile: outfile = self.info['Name']
        if not filelist:
            self.log.errorLog('No scansite files to convert! %s unchanged/not made.' % outfile, printerror=False)
            return False
        delimit = rje.getDelimit(self.cmd_list)
        ext = rje.delimitExt(delimit)
        if ext != outfile[-3:]:     # NOTE(review): assumes a 3-character file extension - confirm
            newfile = outfile[:-3] + ext
            if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)): outfile = newfile
        self.log.printLog('#OUT', 'Converting %d file(s), output to %s.' % (len(filelist), outfile))
        ### Output File ###
        _stage = 'Output File'
        if not self.opt['Append'] or not os.path.exists(outfile):   # Create with header
            OUTFILE = open(outfile, 'w')
            headers = ['seq_id', 'enzyme', 'enz_group', 'aa', 'pos', 'score', 'percentile', 'matchseq', 'sa']
            rje.writeDelimit(OUTFILE, headers, delimit)
        else: OUTFILE = open(outfile, 'a')
        ### Conversion ###
        _stage = 'Conversion'
        sx = 0      # Total results across all files
        for infile in filelist:
            if not os.path.exists(infile):
                self.log.errorLog('Input file %s does not exist! :o(' % infile, False, False)
                continue
            fx = 0  # Results from this file
            INFILE = open(infile, 'r')
            inline = rje.nextLine(INFILE)
            while inline is not None:
                # Each line matching the scansite regex becomes one delimited output row
                if rje.matchExp(re_scansite, inline):
                    scanlist = rje.matchExp(re_scansite, inline)
                    rje.writeDelimit(OUTFILE, scanlist, delimit)
                    sx += 1
                    fx += 1
                rje.progressPrint(self, sx)
                inline = rje.nextLine(INFILE)
            self.log.printLog('#OUT', '%s scansite results from %s. (%s Total.)' % (rje.integerString(fx), infile, rje.integerString(sx)))
            INFILE.close()
        ### End ###
        _stage = 'End'
        OUTFILE.close()
        self.log.printLog('#OUT', '%s scansite results output to %s.' % (rje.integerString(sx), outfile))
        return True
    except:
        self.log.errorLog('Error in convert(%s)' % _stage, printerror=True, quitchoice=False)
        raise
def _pepStats(self):    ### Peptide Distance
    '''
    Peptide property statistics.

    Loads peptide sequences (autoload), converts the rje_aaprop property matrix values to integers in
    place, then writes one delimited line per peptide to 'hrb.pepstats.<ext>': summed value for each AA
    property, net charge, and positionally weighted charge/hydrophobicity balance statistics.
    NOTE(review): the balance statistics use a hard-coded (10 - r) denominator, so they assume
    10-residue peptides; an 11+-mer would divide by zero. Confirm inputs are always 10-mers.
    '''
    try:
        ### Setup ###
        seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
        aaprop = rje_aaprop.AAPropMatrix(self.log, self.cmd_list)
        aaprop.makePropDif()
        delimit = rje.getDelimit(self.cmd_list)
        ### Output File Setup ###
        OUTFILE = open('hrb.pepstats.%s' % rje.delimitExt(delimit), 'w')
        headlist = ['peptide']
        ## 10 Dimensional Peptide Property Output ##
        for property in rje.sortKeys(aaprop.prop):
            headlist.append(property.lower())
            # Convert property values from strings to integers in place ('-' and 'X' left as-is)
            for aa in aaprop.prop[property].keys():
                try:
                    if aa not in ['-', 'X']:
                        aaprop.prop[property][aa] = string.atoi(aaprop.prop[property][aa])
                except:
                    print aaprop.prop, property, aa, aaprop.prop[property][aa]
                    raise
        ## Additional Stats ##
        headlist.append('net_charge')
        #headlist.append('hydrophobicity')
        headlist.append('charge_balance')
        headlist.append('hydrophobic_balance')
        #headlist.append('hydrophobicity_balance')
        ## Output ##
        rje.writeDelimit(OUTFILE, headlist, delimit)
        ### Calculate stats ###
        for pep in seqlist.seq:
            pepname = pep.shortName()
            # Trim the name to its 'XXX_nC'/'XXX_nQ' stub where matched
            if rje.matchExp('^(\S+_\d[CQ])', pepname):
                pepname = rje.matchExp('^(\S+_\d[CQ])', pepname)[0]
            outlist = [pepname]
            pepseq = pep.info['Sequence']
            ## 10 Dimensional Peptide Property Output ##
            for property in rje.sortKeys(aaprop.prop):
                px = 0  # Summed property value across the peptide
                for aa in pepseq:
                    px += aaprop.prop[property][aa]
                outlist.append('%d' % px)
            ## Additional Stats ##
            net_charge = 0
            for aa in pepseq:
                net_charge += (aaprop.prop['Positive'][aa] - aaprop.prop['Negative'][aa])
            outlist.append('%d' % net_charge)
            # Balance stats weight each residue by proximity to the N-terminus (1/(r+1)) minus
            # proximity to the C-terminus (1/(10-r)): positive = property biased towards N-term.
            charge_balance = 0
            hydrophobic_balance = 0
            for r in range(len(pepseq)):
                charge_balance += aaprop.prop['Charged'][pepseq[r]] * (1.0 / (r + 1))
                charge_balance -= aaprop.prop['Charged'][pepseq[r]] * (1.0 / (10 - r))
                hydrophobic_balance += aaprop.prop['Hydrophobic'][pepseq[r]] * (1.0 / (r + 1))
                hydrophobic_balance -= aaprop.prop['Hydrophobic'][pepseq[r]] * (1.0 / (10 - r))
            outlist.append('%.3f' % charge_balance)
            outlist.append('%.3f' % hydrophobic_balance)
            rje.writeDelimit(OUTFILE, outlist, delimit)
        ### Finish ###
        OUTFILE.close()
    except:
        self.log.errorLog('Error in _pepStats', printerror=True, quitchoice=False)
        raise   # Delete this if method error not terrible
def altPAM(self):   ### Alternative PAM matrix construction
    '''
    Alternative PAM matrix construction from a WAG-style substitution file.

    Loads self.info['AltPam'] (line 1 = amino acids, line 2 = codes, line 3 = frequencies, later lines
    = substitution rows -- TODO confirm exact format), optionally recalculates frequencies from
    self.info['SeqIn'], then searches for the scaling factor s whose expected self-substitution rate is
    closest to 0.99 (PAM1-like) and writes the rescaled matrix to self.info['PamOut'].
    '''
    try:
        ### Setup ##
        wlines = self.loadFromFile(self.info['AltPam'])
        if not wlines: raise IOError
        aas = string.split(wlines[0].upper())   # Single-letter amino acid codes
        codes = string.split(wlines[1])         # Row labels for output
        rawfreqs = string.split(wlines[2])      # Background amino acid frequencies
        freq = {}
        for i in range(len(rawfreqs)): freq[aas[i]] = string.atof(rawfreqs[i])
        prob = {}   # Symmetric substitution probabilities keyed 'XY'
        for r in range(3,22):   # NOTE(review): assumes exactly 19 substitution rows - confirm format
            subs = string.split(wlines[r])
            for i in range(len(subs)):
                prob['%s%s' % (aas[i],aas[r-2])] = string.atof(subs[i])
                prob['%s%s' % (aas[r-2],aas[i])] = string.atof(subs[i])
        ### Alternative freqs ###
        if self.info['SeqIn'].lower() not in ['','none'] and os.path.exists(self.info['SeqIn']):
            ## Clear freq ##
            freq = {}
            for a in aas: freq[a] = 0.0
            ## Count freq ##
            slines = self.loadFromFile(self.info['SeqIn'])
            for line in slines:
                if line[:1] == '>': continue    # Skip fasta name lines
                for a in aas: freq[a] += string.count(line.upper(),a)
            ## Convert to freq ##
            total = sum(freq.values())
            if total > 0:
                for a in aas: freq[a] = freq[a] / total
                self.log.printLog('#AA','Rescaling matrix based on %s aa from %s.' % (rje.integerString(total),self.info['SeqIn']))
        ### Calculate s ###
        s = 0.01            # Upper bound of the scaling factor search
        step = 0.000001     # Search granularity
        solve = True        # NOTE(review): never set False; the loop ends only when s < step
        bests = 1.000000    # Best scaling factor so far
        bestdif = -1        # Best |0.99 - expected self-substitution|; -1 = none yet
        while solve and s >= step:
            ## Scaler ##
            s = s - step
            self.log.printLog('\r#WAG','Considering s = %.6f; Best s = %.6f (Dif = %.6f)' % (s,bests,bestdif),log=False,newline=False)
            ## Self Subs ##
            newprobs = rje.scaledict(dict=prob,scale=s)
            toobig = False
            for a in aas:
                # Self-substitution = 1 - sum of scaled substitutions away from a
                newprobs['%s%s' % (a,a)] = 1.0
                for key in prob.keys():
                    if key[0] == a: newprobs['%s%s' % (a,a)] -= newprobs[key]
                    if newprobs['%s%s' % (a,a)] < 0.0:  # Overshot possibility
                        toobig = True
                        break
                if toobig: break
            if toobig: continue     # s too large for a valid matrix: try a smaller value
            #print 'PAM!!',
            ## PAM1 ##
            dsum = 0.0  # Expected self-substitution rate under the background frequencies
            for a in aas: dsum += freq[a] * newprobs['%s%s' % (a,a)]
            dif = 0.99 - dsum
            if dif < 0: dif = -dif
            if dif < bestdif or bestdif < 0:
                bestdif = dif
                bests = s
        ### Output best s ###
        self.log.printLog('\r#WAG','Considered all s <= 0.010000; Best s = %.6f (Dif = %.6f)' % (bests,bestdif))
        if self.info['PamOut'].lower() in ['','none']: self.info['PamOut'] = self.info['AltPam'] + '.pam'
        self.log.printLog('#PAM','Rescaled PAM matrix output to %s' % self.info['PamOut'])
        PAM = open(self.info['PamOut'],'w')
        rje.writeDelimit(PAM,aas,' ')
        newprobs = rje.scaledict(dict=prob,scale=bests)     # Rebuild the matrix at the best scaling
        for a in aas:
            newprobs['%s%s' % (a,a)] = 1.0
            for key in prob.keys():
                if key[0] == a: newprobs['%s%s' % (a,a)] -= newprobs[key]
        for i in range(len(aas)):
            out = [codes[i]]
            a = aas[i]
            for b in aas:
                out.append('%.6f' % newprobs['%s%s' % (a,b)])
            rje.writeDelimit(PAM,out,' ')
        PAM.close()
        self.info['Name'] = self.info['PamOut']     # Downstream PAM processing uses the new file
    except: self.log.errorLog('Major Error with PamCtrl.altPAM().',quitchoice=True)
def run(self):  ### Main Run method
    '''
    Main Run method.

    Dispatches on run mode:
    * slimdisc=T : hands off to self.slimDisc().
    * teiresias=T : (re)runs TEIRESIAS pattern discovery on the input sequences, reads the results
      back, scores each pattern (information content & length stats) and saves pattern/occurrence
      tables (two MySQL-style tables, or one combined table).
    * info=FILE : loads motifs from FILE and outputs their information content scores.
    << returns False if expected input is missing; result of slimDisc() in SLiMDisc mode.
    '''
    try:
        ### SLiMDisc Run ###
        if self.opt['SLiMDisc']: return self.slimDisc()
        ### TEIRESIAS ###
        if self.opt['Teiresias']:
            ## Setup ##
            seqlist = rje_seq.SeqList(self.log,self.cmd_list)
            infile = '%s.teiresias.fas' % rje.baseFile(seqlist.info['Name'],True)
            outfile = '%s.teiresias.out' % rje.baseFile(seqlist.info['Name'],True)
            run_teiresias = True
            # Skip the (slow) external run if output is newer than input, unless the user insists
            if rje.isYounger(outfile,infile) == outfile:
                if self.stat['Interactive'] < 1 or not rje.yesNo('%s and %s exist already. Regenerate?' % (infile,outfile),'N'):
                    run_teiresias = False
            ## Run TEIRESIAS ##
            if run_teiresias:
                seqlist.saveFasta(seqfile=infile,name='Teiresias')  ### Saves sequences in fasta format
                command = rje.makePath(self.info['TeiresiasPath'],True)
                command += ' -i%s -o%s %s' % (infile,outfile,self.info['TeiresiasOpt'])
                self.log.printLog('#CMD',command)
                os.system(command)
            ## Read Results ##
            self.verbose(0,2,'Reading TEIRESIAS output from %s...' % outfile,1)
            self.list['Pattern'] = []
            RESULTS = open(outfile,'r')
            line = RESULTS.readline()
            while line:
                # TEIRESIAS data lines: <total occ> <seq occ> <pattern> <positions>
                if rje.matchExp('^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$',line):    # New pattern
                    self.addTeiresiasPattern(rje.matchExp('^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$',line))
                elif len(line) > 3 and line[0] != '#':  # Not a comment/blank line: unexpected content
                    self.log.errorLog('Did not recognise line: %s' % line,False,False)
                line = RESULTS.readline()
            RESULTS.close()
            patx = len(self.list['Pattern'])
            self.log.printLog('#PAT','%s TEIRESIAS patterns read from %s.' % (rje.integerString(patx),outfile))
            ## Calculate Information Content ##
            aafreq = seqlist.aaFreq()
            self.verbose(0,3,'Calculating Information Content & Length stats...',0)
            occx = 0
            for pattern in self.list['Pattern']:
                pattern.stat['Info'] = self.calculateScore(pattern.info['Pattern'],aafreq)
                pattern._makeLength()
                occx += 1
                rje.progressPrint(self,occx,patx/100,patx/10)
            self.verbose(0,1,'...Done!',2)
            ## Prepare Results ##
            delimit = rje.getDelimit(self.cmd_list)
            if self.info['Name'] == 'None':
                self.info['Name'] = '%s.teiresias.%s' % (rje.baseFile(seqlist.info['Name'],True),rje.delimitExt(delimit))
            if self.opt['MySQL']:   # Two tables: patterns and occurrences
                patfile = os.path.splitext(self.info['Name'])
                occfile = '%s.occ%s' % (patfile[0],patfile[1])
                patfile = '%s.patterns%s' % (patfile[0],patfile[1])
                if self.opt['Append']:
                    PATFILE = open(patfile,'a')
                    OCCFILE = open(occfile,'a')
                else:
                    PATFILE = open(patfile,'w')
                    rje.writeDelimit(PATFILE,['pattern','tot_occ','seq_occ','info','len','fix','wild'],delimit)
                    OCCFILE = open(occfile,'a')     # NOTE(review): 'a' despite fresh header write - confirm 'w' not intended
                    rje.writeDelimit(OCCFILE,['seq_id','pos','pattern','pat_match'],delimit)
            else:   # Single combined results table
                if self.opt['Append']:
                    RESFILE = open(self.info['Name'],'a')
                else:
                    # Bug fix: was open(patfile,'w'), but patfile is only bound in the MySQL branch
                    # (NameError here); results belong in self.info['Name'], which is the file
                    # closed and reported below.
                    RESFILE = open(self.info['Name'],'w')
                    rje.writeDelimit(RESFILE,['Sequence Name','Position','Pattern','Match','Total Occurrences','Num Sequences','Information Content','Length','Fixed','Wildcard'],delimit)
            ## Save Results ##
            occx = 0
            for pattern in self.list['Pattern']:
                patstats = []
                for stat in ['OccCount','SeqCount','Info','Length','Fixed','Wildcards']:
                    patstats.append('%d' % pattern.stat[stat])
                patstats[2] = '%.3f' % pattern.stat['Info']     # Info is a float, not an integer
                if self.opt['MySQL']:   # Two tables
                    rje.writeDelimit(PATFILE,[pattern.info['Pattern']] + patstats,delimit)
                for occ in rje.sortKeys(pattern.occ):
                    seq = seqlist.seq[occ]
                    for pos in pattern.occ[occ]:
                        match = seq.info['Sequence'][pos:(pos+pattern.stat['Length'])]
                        outlist = [seq.shortName(),'%d' % pos,pattern.info['Pattern'],match]
                        if self.opt['MySQL']:   # Two tables
                            rje.writeDelimit(OCCFILE,outlist,delimit)
                        else:
                            rje.writeDelimit(RESFILE,outlist+patstats,delimit)
                        occx += 1
            if self.opt['MySQL']:   # Two tables
                PATFILE.close()
                OCCFILE.close()
                self.log.printLog('#OUT','%s patterns output to %s.' % (rje.integerString(patx),patfile))
                self.log.printLog('#OUT','%s pattern occurrences output to %s.' % (rje.integerString(occx),occfile))
            else:
                RESFILE.close()
                self.log.printLog('#OUT','%s occurrences of %s patterns output to %s.' % (rje.integerString(occx),rje.integerString(patx),self.info['Name']))
        ### InfoContent ###
        elif self.info['Info'] != 'None':
            ## Setup ##
            alphabet = rje_seq.alph_protx
            if not os.path.exists(self.info['Info']):
                self.log.errorLog('Input file %s missing!' % self.info['Info'],False,False)
                return False
            else:
                mypresto = presto.Presto(self.log,self.cmd_list)
                mypresto.loadMotifs(file=self.info['Info'],clear=True)
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
            if seqlist.seqNum() > 0:
                aafreq = seqlist.aaFreq(alphabet=None,fromfile=None,loadfile=None,total=False)  ### Returns dictionary of AA (& gap etc.) frequencies
            else:   # No sequences loaded: assume uniform amino acid frequencies
                aafreq = {}
                for aa in alphabet: aafreq[aa] = 1.0 / len(alphabet)
            alphabet = aafreq.keys()
            maxinfo = 0
            for aa in alphabet:
                maxinfo += (aafreq[aa] * math.log(aafreq[aa],2))
            ## Output ##
            delimit = rje.getDelimit(self.cmd_list)
            ext = rje.delimitExt(delimit)
            outfile = '%s.info.%s' % (rje.baseFile(self.info['Info'],True,['.txt','.%s' % ext]),ext)
            if self.opt['Append']: OUTFILE = open(outfile,'a')
            else:
                OUTFILE = open(outfile,'w')
                rje.writeDelimit(OUTFILE,['motif','pattern','info'],delimit)
            ## Calculate Information Scores ##
            for motif in mypresto.motif:
                self.verbose(2,4,motif.info['Sequence'],0)
                pattern = string.replace(motif.info['Sequence'],'X','.')
                elements = string.split(pattern,'-')
                pattern = ''
                for el in elements:
                    if el.find('.{') == 0:  # Ambiguous spacer length - compress to single wildcard
                        pattern += '.'
                    else: pattern += el
                self.verbose(2,2,'=> %s' % pattern,1)
                motif.stat['Info'] = self.calculateInformationContent(pattern,aafreq,maxinfo,self.stat['InfoGapPen'])
                self.verbose(0,3,'%s (%s) = %.2f' % (motif.info['Name'],pattern,motif.stat['Info']),1)
                ## Output ##
                rje.writeDelimit(OUTFILE,[motif.info['Name'],pattern,'%.2f' % motif.stat['Info']],delimit)
            ## Finish ##
            OUTFILE.close()
    except:
        self.log.errorLog('Error in run().',printerror=True,quitchoice=False)
        raise   # Delete this if method error not terrible
def _pepStats(self):    ### Peptide Distance
    '''
    Peptide property statistics.

    Loads peptide sequences (autoload), converts the rje_aaprop property matrix values to integers in
    place, then writes one delimited line per peptide to 'hrb.pepstats.<ext>': summed value for each AA
    property, net charge, and positionally weighted charge/hydrophobicity balance statistics.
    NOTE(review): the balance statistics use a hard-coded (10 - r) denominator, so they assume
    10-residue peptides; an 11+-mer would divide by zero. Confirm inputs are always 10-mers.
    '''
    try:
        ### Setup ###
        seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
        aaprop = rje_aaprop.AAPropMatrix(self.log,self.cmd_list)
        aaprop.makePropDif()
        delimit = rje.getDelimit(self.cmd_list)
        ### Output File Setup ###
        OUTFILE = open('hrb.pepstats.%s' % rje.delimitExt(delimit),'w')
        headlist = ['peptide']
        ## 10 Dimensional Peptide Property Output ##
        for property in rje.sortKeys(aaprop.prop):
            headlist.append(property.lower())
            # Convert property values from strings to integers in place ('-' and 'X' left as-is)
            for aa in aaprop.prop[property].keys():
                try:
                    if aa not in ['-','X']: aaprop.prop[property][aa] = string.atoi(aaprop.prop[property][aa])
                except:
                    print aaprop.prop, property, aa, aaprop.prop[property][aa]
                    raise
        ## Additional Stats ##
        headlist.append('net_charge')
        #headlist.append('hydrophobicity')
        headlist.append('charge_balance')
        headlist.append('hydrophobic_balance')
        #headlist.append('hydrophobicity_balance')
        ## Output ##
        rje.writeDelimit(OUTFILE,headlist,delimit)
        ### Calculate stats ###
        for pep in seqlist.seq:
            pepname = pep.shortName()
            # Trim the name to its 'XXX_nC'/'XXX_nQ' stub where matched
            if rje.matchExp('^(\S+_\d[CQ])',pepname): pepname = rje.matchExp('^(\S+_\d[CQ])',pepname)[0]
            outlist = [pepname]
            pepseq = pep.info['Sequence']
            ## 10 Dimensional Peptide Property Output ##
            for property in rje.sortKeys(aaprop.prop):
                px = 0  # Summed property value across the peptide
                for aa in pepseq: px += aaprop.prop[property][aa]
                outlist.append('%d' % px)
            ## Additional Stats ##
            net_charge = 0
            for aa in pepseq: net_charge += (aaprop.prop['Positive'][aa] - aaprop.prop['Negative'][aa])
            outlist.append('%d' % net_charge)
            # Balance stats weight each residue by proximity to the N-terminus (1/(r+1)) minus
            # proximity to the C-terminus (1/(10-r)): positive = property biased towards N-term.
            charge_balance = 0
            hydrophobic_balance = 0
            for r in range(len(pepseq)):
                charge_balance += aaprop.prop['Charged'][pepseq[r]] * (1.0 / (r+1))
                charge_balance -= aaprop.prop['Charged'][pepseq[r]] * (1.0 / (10-r))
                hydrophobic_balance += aaprop.prop['Hydrophobic'][pepseq[r]] * (1.0 / (r+1))
                hydrophobic_balance -= aaprop.prop['Hydrophobic'][pepseq[r]] * (1.0 / (10-r))
            outlist.append('%.3f' % charge_balance)
            outlist.append('%.3f' % hydrophobic_balance)
            rje.writeDelimit(OUTFILE,outlist,delimit)
        ### Finish ###
        OUTFILE.close()
    except:
        self.log.errorLog('Error in _pepStats',printerror=True,quitchoice=False)
        raise   # Delete this if method error not terrible
def pileUpStats(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''
    Calculates statistics of genetic differences from parsed PileUp tables.

    Reads '<basefile>.WT.tdt' and '<basefile>.Mut.tdt' (from parsePileup()), pads positions with no
    mapped reads, then writes per-position binomial p-values comparing the mutant WT-allele frequency
    against the wild-type major-allele frequency to '<basefile>.pdiff.tdt'. Finally runs the FDR
    correction via self.pileUpFDR(). Skips straight to the FDR step if output exists (unless force=T).
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        statfile = '%s.pdiff.tdt' % self.baseFile()
        if not self.force() and os.path.exists(statfile): return self.pileUpFDR()
        ## ~ [0a] Load WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        wtdata = {}     # {locus:{field:per-position value list}} - Load lists of data for compiling
        for locus in self.dict['RefSeq']:
            wtdata[locus] = {}
            for field in ['N','QN','MajFreq']: wtdata[locus][field] = []
        WTDATA = open('%s.WT.tdt' % self.baseFile(),'r'); wx = 1    # wx = next expected position
        fields = []     # Populated from the header line on the first iteration
        for line in WTDATA:
            data = rje.readDelimit(line)
            if fields:
                locus = data[0]
                pos = int(data[1])
                # Pad positions with no mapped reads with zero data
                # NOTE(review): wx does not reset per locus - assumes positions keep increasing; confirm
                while pos > wx:
                    wtdata[locus]['N'].append(0); wtdata[locus]['QN'].append(0); wtdata[locus]['MajFreq'].append(0.0); wx += 1
                for field in ['N','QN']: wtdata[locus][field].append(int(data[fields.index(field)]))
                for field in ['MajFreq']: wtdata[locus][field].append(string.atof(data[fields.index(field)]))
                wx += 1
            else: fields = data[0:]     # Header line
        WTDATA.close()
        ## ~ [0b] Load Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        mutdata = {}    # As wtdata, plus the Major and WTFreq fields
        for locus in self.dict['RefSeq']:
            mutdata[locus] = {}
            for field in ['N','QN','Major','MajFreq','WTFreq']: mutdata[locus][field] = []
        MUTDATA = open('%s.Mut.tdt' % self.baseFile(),'r'); mx = 1  # mx = next expected position
        fields = []
        for line in MUTDATA:
            data = rje.readDelimit(line)
            if fields:
                locus = data[0]
                self.str['RefSeq'] = self.dict['RefSeq'][locus]
                pos = int(data[1])
                # Extend/patch the reference sequence with this position's reference base
                try:
                    if pos > len(self.str['RefSeq']):
                        while (pos-1) > len(self.str['RefSeq']): self.str['RefSeq'] += '?'  # '?' = unknown base
                        self.str['RefSeq'] += data[2]
                        self.dict['RefSeq'][locus] = self.str['RefSeq']
                    elif self.str['RefSeq'][pos-1] == '?':
                        self.str['RefSeq'] = self.str['RefSeq'][:pos-1] + data[2] + self.str['RefSeq'][pos:]
                        self.dict['RefSeq'][locus] = self.str['RefSeq']
                except: self.warnLog('Problem mapping Pos %s onto %snt %s RefSeq' % (rje.iStr(pos),locus,rje.iLen(self.str['RefSeq'])))
                # Pad positions with no mapped reads with zero/placeholder data
                while pos > mx:
                    mutdata[locus]['N'].append(0); mutdata[locus]['QN'].append(0); mutdata[locus]['Major'].append('-'); mutdata[locus]['MajFreq'].append(0.0); mutdata[locus]['WTFreq'].append(0.0); mx += 1
                for field in ['N','QN']: mutdata[locus][field].append(int(data[fields.index(field)]))
                for field in ['MajFreq','WTFreq']: mutdata[locus][field].append(string.atof(data[fields.index(field)]))
                for field in ['Major']: mutdata[locus][field].append(data[fields.index(field)])
                mx += 1
            else: fields = data[0:]     # Header line
        MUTDATA.close()
        ## ~ [0c] Integrity check ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #!# Need a new check with locus info #!#
        #for field in wtdata:   #!# Won't be true - not all reference genome positions present in output (0 mapped reads)
        #    if len(wtdata[field]) != len(self.str['RefSeq']): self.errorLog('Data length mismatch for WT %s' % field,printerror=False); raise ValueError
        #for field in mutdata:  #!# Won't be true - not all reference genome positions present in output (0 mapped reads)
        #    if len(mutdata[field]) != len(self.str['RefSeq']): self.errorLog('Data length mismatch for Mutant %s' % field,printerror=False); raise ValueError
        #self.printLog('#REF','WT and Mutant data for %s reference positions' % rje.iLen(self.str['RefSeq']))
        ### ~ [1] Assess and output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'w')
        headers = ['Locus','Pos','Ref','WT.N','WT.QN','WT.Major','WT.MajFreq','Mut.N','Mut.QN','Mut.Major','Mut.MajFreq','Mut.WTFreq','p.Over','p.Under','p.Diff']
        SAMSIG.write('%s\n' % string.join(headers,'\t'))
        nodifx = 0; nomutx = 0; sx = 0  # Counts of skipped (majdif/majmut) and output positions
        for locus in rje.sortKeys(self.dict['RefSeq']):
            self.str['RefSeq'] = self.dict['RefSeq'][locus]
            self.list['WTMajor'] = self.dict['WTMajor'][locus]
            for i in range(len(self.str['RefSeq'])):
                # Gather per-position data; skip positions missing from either dataset
                try: sigdata = [locus,i+1,self.str['RefSeq'][i],wtdata[locus]['N'][i],wtdata[locus]['QN'][i],self.list['WTMajor'][i],wtdata[locus]['MajFreq'][i],
                                mutdata[locus]['N'][i],mutdata[locus]['QN'][i],mutdata[locus]['Major'][i],mutdata[locus]['MajFreq'][i],mutdata[locus]['WTFreq'][i]]
                except: self.warnLog('Incomplete data for %s:%s (no pdiff output)' % (locus,rje.iStr(i+1))); continue
                if self.getBool('MajDif') and self.list['WTMajor'][i] == mutdata[locus]['Major'][i]: nodifx += 1; continue  # Was: sigdata += [1.0,1.0]
                elif self.getBool('MajMut') and self.str['RefSeq'][i] == mutdata[locus]['Major'][i]: nomutx += 1; continue
                elif not wtdata[locus]['MajFreq'][i]:   # No Data for WT
                    if mutdata[locus]['WTFreq'][i]: sigdata += [0.0,1.0]
                    else: sigdata += [1.0,1.0]
                elif mutdata[locus]['WTFreq'][i] > wtdata[locus]['MajFreq'][i]:     # One-tailed over-representation test
                    obs = int((mutdata[locus]['QN'][i] * mutdata[locus]['WTFreq'][i]) + 0.5)
                    sigdata.append(rje.binomial(obs,mutdata[locus]['QN'][i],wtdata[locus]['MajFreq'][i],usepoisson=False,callobj=self))
                    sigdata.append(1.0)
                elif mutdata[locus]['WTFreq'][i] < wtdata[locus]['MajFreq'][i]:     # One-tailed under-representation test
                    obs = int((mutdata[locus]['QN'][i] * mutdata[locus]['WTFreq'][i]) + 0.5)
                    sigdata.append(1.0)
                    sigdata.append(1.0 - rje.binomial(obs+1,mutdata[locus]['QN'][i],wtdata[locus]['MajFreq'][i],usepoisson=False,callobj=self))
                else: sigdata += [1.0,1.0]
                sigdata.append(min(1.0,2*min(sigdata[-2:])))    # p.Diff = two-tailed, capped at 1.0
                rje.writeDelimit(SAMSIG,sigdata); sx += 1
        SAMSIG.close()
        ptxt = '%s lines output to *.pdiff.txt' % rje.iStr(sx)
        if self.getBool('MajDif'): ptxt += '; %s positions skipped where WTMajor==MutMajor (majdif=T)' % rje.iStr(nodifx)
        if self.getBool('MajMut'): ptxt += '; %s positions skipped where Ref==MutMajor (majmut=T)' % rje.iStr(nomutx)
        self.printLog('#PDIFF','%s.' % ptxt)
        ### ~ [2] FDR Correction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.pileUpFDR()
    except: self.errorLog('%s.pileUpStats() error' % (self)); return None
def parsePileup(self,tname,filename,wtdb=None):     ### Extracts, filters and processes PileUp data
    '''
    Extracts, filters and processes SAMtools pileup data.
    >> tname:str = name for the new (empty) database table
    >> filename:str = pileup file to parse
    >> wtdb [None] = if set, a WTFreq field is added using self.dict['WTMajor'] data
    << returns the db table object on success; None on error.

    Writes parsed entries directly to '<basefile>.<tname>.tdt' (NOT into the returned table) and a
    quality-score histogram to '<basefile>.<tname>.QC.tdt'.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        table = self.db().addEmptyTable(tname,['Locus','Pos','Seq','N','QN','Major','MajFreq'],keys=['Locus','Pos'])
        qc = []     # qc[i] = count of reads with quality score i+1
        if wtdb: table.addField('WTFreq')
        PILEUP = open(filename,'r'); px = 0; ex = 0
        PILEOUT = open('%s.%s.tdt' % (self.baseFile(),tname),'w')
        rje.writeDelimit(PILEOUT,outlist=table.fields(),delimit='\t')
        locus = None
        refseq = ''     #? What is this used for?
        majors = []     #? What is this used for?
        ### ~ [2] Process each entry ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for line in PILEUP:
            # Split line up into data. Should be: locus, position, reference, no. reads, read data, qualscores
            data = string.split(rje.chomp(line))
            if not data: break
            self.progLog('\r#PARSE','Parsing %s: %s pos...' % (filename,rje.iStr(px)),rand=0.01); px += 1
            ## ~ [2a] Extract Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            entry = {'Locus':data[0],'Pos':int(data[1]),'Seq':data[2],'N':int(data[3]),'QN':0}
            if entry['Locus'] != locus: locus = entry['Locus']; refseq = ''; majors = []    # New locus: reset
            refseq += data[2]
            #entry => 'Ref','Pos','Seq','N','Reads','Qual'
            rseq = data[4]  # Raw pileup read-base string
            reads = []      # One element per read covering this position
            delx = 0        # No. of deletion events (which add extra elements to reads)
            while rseq:
                try:
                    if rseq[:1] in ['.',',']:   # Match to reference base (fwd/rev strand)
                        reads.append(entry['Seq']); rseq = rseq[1:]
                    elif rseq[:1] == '^': rseq = rseq[2:]   # Read start marker + mapping quality char
                    #elif rseq[:1] == '*':
                    #    reads.append('-1%s' % entry['Seq'].upper())
                    #    rseq = rseq[1:]
                    elif rseq[:1] in ['-','+']:     # Indel: [+-]<length><bases>
                        ilen = string.atoi(rje.matchExp('^(\d+)',rseq[1:])[0])
                        indel = rseq[len('%s' % ilen)+1:][:ilen]
                        #self.deBug('%s: %s' % (rseq,indel))
                        if rseq[:1] == '-':
                            delx += 1
                            reads.append(rseq[:len('%s' % ilen)+ilen+1].upper())    # Deletion becomes its own element
                        else: reads[-1] += indel.upper()    # Insertion appended to previous read
                        #self.deBug(reads[-1])
                        rseq = rseq[len('%s' % ilen)+ilen+1:]
                    elif rseq[:1] in ['$']: rseq = rseq[1:]     # Read end marker
                    else:   # Mismatch base (or '*' deletion placeholder)
                        if rseq[0].upper() not in 'ATGCN*': print ' ???', rseq[0].upper(), '???'
                        reads.append(rseq[0].upper()); rseq = rseq[1:]
                except:
                    self.errorLog('!')
                    self.deBug(rseq)
                    raise ValueError
            # Sanity check: parsed read elements must match pileup read count plus deletions
            if len(reads) != (entry['N'] + delx):
                self.deBug('%s = %d' % (data[4],entry['N']))
                self.deBug('%s = %d' % (reads,len(reads)))
                self.errorLog('Read versus Read Count mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                raise ValueError
            ## ~ [2b] Convert Quality Scores ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            qual = []
            for q in data[5]:
                # Gaps do not have a quality score, so fill these in first
                while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
                # Then append actual qv (Phred+33 encoding)
                qual.append(ord(q) - 33)
                qc += [0] * (qual[-1] - len(qc)); qc[qual[-1]-1] += 1   # Grow histogram as needed
            # Fill in quality scores for any trailing deletion elements
            while len(qual) < len(reads) and reads[len(qual)][0] == '-': qual.append(self.getInt('QCut'))
            while '*' in reads: reads[reads.index('*')] = '-'   #'-1%s' % entry['Seq'].upper()
            if len(reads) != len(qual):
                self.deBug('%s = %d' % (reads,len(reads)))
                self.deBug('%s = %d' % (qual,len(qual)))
                self.deBug(data)
                self.errorLog('Read versus Quality length mismatch for %s Pos %s' % (table.name(),entry['Pos']),printerror=False)
                raise ValueError
            ## ~ [2c] Filter low quality ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if entry['Pos'] in [190359]:    #100,98901,183697,169284,  (hard-coded debug positions)
                self.deBug(qual)
                self.deBug(reads)
                self.deBug(qc)
            # Remove (from back) any reads than do not meet QV cutoff
            for r in range(len(qual)-1,-1,-1):
                if qual[r] < self.getInt('QCut'): qual.pop(r); reads.pop(r)
            entry['QN'] = len(reads)    # No. of quality-filtered reads
            ## ~ [2d] Major Allele ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            alleles = {}    # Dictionary of {nt:count}
            # Setup major allele
            if reads: major = reads[0]
            else: major = '-'
            alleles[major] = 0
            # Cycle through reads. Keep most abundant allele as major - or reference allele if tied.
            for read in reads:
                if read in alleles: alleles[read] += 1
                else: alleles[read] = 1
                if alleles[read] > alleles[major] or (read == entry['Seq'] and alleles[read] == alleles[major]): major = read
            entry['Major'] = major
            majors.append(major)
            # MajFreq = major allele frequency; minor-allele total is floored at MinFreq via max()
            if reads: entry['MajFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[major]) / float(len(reads)))
            else: entry['MajFreq'] = 0.0
            if wtdb:    # Also record the frequency of the wild-type major allele at this position
                try:
                    wtmajor = self.dict['WTMajor'][locus][entry['Pos']-1]
                    if wtmajor in alleles and reads: entry['WTFreq'] = 1.0 - max(self.getNum('MinFreq'),(len(reads) - alleles[wtmajor]) / float(len(reads)))
                    else: entry['WTFreq'] = 0.0
                    if wtmajor != major: self.debug(entry)
                    elif locus == 'chrIV_S288C__BK006938.2' and entry['Pos'] == 271733: self.debug(entry)   # Hard-coded debug position
                except: self.warnLog('WTFreq Error (%s:Pos=%d) [Probably no WT read mapped]' % (locus,entry['Pos'])); entry['WTFreq'] = 0.0
            if entry['Pos'] in [190359]:    #100,98901,183697,169284,
                self.deBug(qual)
                self.deBug(reads)
                self.deBug(alleles)
                self.deBug(entry)
                self.deBug(line)
            #table.addEntry(entry)
            outlist = []
            for field in table.fields(): outlist.append(entry[field])
            rje.writeDelimit(PILEOUT,outlist,delimit='\t'); ex += 1
        self.printLog('\r#PARSE','Parsed %s: %s entries from %s lines.' % (filename,rje.iStr(ex),rje.iStr(px)))
        PILEOUT.close()
        PILEUP.close()
        ### ~ [3] Save QC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        QC = open('%s.%s.QC.tdt' % (self.baseFile(),tname),'w')
        QC.write('Qual\tCount\n')
        for q in range(len(qc)):
            try: QC.write('%d\t%d\n' % (q+1,qc[q]))
            except: self.errorLog('!')
        QC.close()
        return table
    except: self.errorLog('%s.parsePileup(%s) error' % (self,filename)); return None