def _setupOutput(self):     ### Sets up output files self.str['MapFas','MissFas','MapRes']
    '''Sets up output files self.str['MapFas','MissFas','MapRes'].'''
    ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    delimit = rje.getDelimit(self.cmd_list)
    if self.str['StartFrom'].lower() in ['','none']: self.str['StartFrom'] = ''
    else:
        self.bool['Append'] = True
        self.printLog('#CMD','StartFrom = "%s" so Append=T' % self.str['StartFrom'])
    ### ~ [1] General ResFile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    files = {'MapFas':'mapping.fas','MissFas':'missing.fas','MapRes':'mapping.%s' % rje.delimitExt(delimit)}
    if self.getBool('Combine'): files.pop('MissFas')
    if self.str['ResFile'].lower() in ['','none']:
        self.str['ResFile'] = '%s.%s' % (rje.baseFile(self.str['SeqIn']),rje.baseFile(self.str['MapDB'],strip_path=True))
    for file in files.keys():
        self.setStr({file: self.getStr('ResFile') + '.' + files[file]})
        rje.backup(self,self.getStr(file))
    ### ~ [2] Headers for MapRes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    #!# Consider replacing with rje_db object? #!#
    self.list['Headers'] = ['Query','Hit','Method','MapRank','BlastRank','EVal','Score']
    for qh in ['Query','Hit']:
        self.list['Headers'] += ['%s_Species' % qh]
        if self.bool['GablamOut']:
            for st in ['Len','Sim','ID']: self.list['Headers'] += ['%s_%s' % (qh,st)]
    rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],delimit)
def blast2fas(self):    ### Executes BLAST2FAS and copies results files
    '''Executes BLAST2FAS and copies results files.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        need2blast = self.opt['Force']
        null_file = '%s.blast2fas_null.txt' % self.baseFile(); nx = 0; null_list = []
        if os.path.exists(null_file): null_list = string.split(open(null_file,'r').read(),'\n')
        self.debug(null_file)
        for seq in self.seqs():
            if seq.info['AccNum'] in null_list: nx += 1; continue
            hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
            for db in self.obj['SeqList'].list['Blast2Fas']:
                self.debug(rje.isYounger(hfile,db))
                self.debug(rje.isYounger(hfile,db) == hfile)
                need2blast = need2blast or not rje.isYounger(hfile,db) == hfile
        if not need2blast:
            self.printLog('#BLAST','All HAQESAC input files found (%s w/o BLAST hits) - no BLAST2Fas (force=F)' % nx)
            return False
        ### ~ [2] Execute ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rje.backup(self,null_file); nx = 0
        if self.getInt('MultiCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('MultiCut'),'blastv=%d' % self.getInt('MultiCut')]
        elif self.getInt('BlastCut'): self.obj['SeqList'].cmd_list += ['blastb=%d' % self.getInt('BlastCut'),'blastv=%d' % self.getInt('BlastCut')]
        if self.getInt('Forks'): self.obj['SeqList'].cmd_list += ['blasta=%d' % self.getInt('Forks')]
        rje_seq.Blast2Fas(self.obj['SeqList'],self.getStr('HAQBLASTDir'))
        for seq in self.seqs():
            sbfile = '%s%s.blast.fas' % (self.getStr('HAQBLASTDir'),seq.info['AccNum'])
            if os.path.exists(sbfile):
                hfile = rje.makePath('%s%s.fas' % (self.info['HaqDir'],seq.info['AccNum']),wholepath=True)
                os.rename(sbfile,hfile)
                if os.path.exists('%s.pickle' % rje.baseFile(hfile)): os.unlink('%s.pickle' % rje.baseFile(hfile))
                if os.path.exists('%s.pickle.gz' % rje.baseFile(hfile)): os.unlink('%s.pickle.gz' % rje.baseFile(hfile))
            else: open(null_file,'a').write('%s\n' % seq.info['AccNum']); nx += 1
        if nx: self.printLog('#BLAST','%s Accession Numbers without BLAST2Fas hits output to %s' % (nx,null_file))
        self.printLog('#BLAST','%s HAQESAC input files made using BLAST2Fas' % (self.seqNum()-nx))
        return True
    except: self.errorLog('Major problem with MultiHAQ.blast2fas'); raise
def setup(self):    ### Main class setup method. Makes sumfile if necessary.
    '''Main class setup method. Makes sumfile if necessary.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
        if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
        if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
        self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
        if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
            if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
        mapgi = False
        ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for mfile in self.list['ResFiles']:
            bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
            bud.info['Name'] = mfile
            bud.readMascot()
            self.dict['Searches'][mfile] = bud.dict['Hits']
            protacclist = rje.sortKeys(bud.dict['Hits'])
            for protacc in protacclist:
                if rje.matchExp('gi\|(\d+)',protacc): mapgi = True
            accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
            self.debug(accfile)
            open(accfile,'w').write(string.join(protacclist,'\n'))
            self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
        ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #if mapgi:
        #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
        #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
        ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
        self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
        ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sumhead = string.split('search,prot_hit_num,prot_acc,prot_desc,pep_seq',',')
        rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
        for mfile in rje.sortKeys(self.dict['Searches']):
            bud = self.dict['Searches'][mfile]
            for protacc in rje.sortKeys(bud)[0:]:
                protname = bud[protacc]['prot_acc']
                protdesc = bud[protacc]['prot_desc']
                if rje.matchExp('gi\|(\d+)',protacc):
                    gi = rje.matchExp('gi\|(\d+)',protacc)[0]
                    try:
                        protname = self.dict['Acc2Seq'][gi].shortName()
                        protdesc = self.dict['Acc2Seq'][gi].info['Description']
                    except: protname = 'gi_UNK__%s' % gi
                #x#print protname, protdesc, bud[protacc]
                for pep in bud[protacc]['Peptides']:
                    data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                            'pep_seq':pep,'prot_hit_num':bud[protacc]['prot_hit_num']}
                    rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def run(self):  ### Main run method
    '''Main run method.'''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.setup()
        if self.getBool('TaxTable'): self.setBool({'BatchMode':True})
        ### ~ [2] ~ Single Mode ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.getBool('BatchMode'):
            return self.mapTaxa(self.list['TaxIn'],self.list['TaxOut'],self.getBool('NodeOnly'),self.getBool('RankOnly'))
        ### ~ [3] ~ Batch Mode ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getBool('TaxTable'):
            tdb = self.db().addEmptyTable('taxtable',['TaxIn']+self.list['TaxOut'],['TaxIn'])
        basefile = self.baseFile()
        for taxa in self.list['TaxIn'][0:]:
            self._cmdReadList('taxin=%s' % taxa,'list',['TaxIn'])   # List of strings (split on commas or file lines)
            self.setBaseFile('%s.%s' % (basefile,rje.baseFile(taxa,strip_path=True)))
            taxdict = self.mapTaxa(self.list['TaxIn'],self.list['TaxOut'],self.getBool('NodeOnly'),self.getBool('RankOnly'),savetaxout=not self.getBool('TaxTable'))
            if self.getBool('TaxTable'):
                tentry = {'TaxIn':taxa}
                for tfield in taxdict: tentry[tfield] = string.join(taxdict[tfield],'|')
                tdb.addEntry(tentry)
            self.baseFile(basefile)
        if self.getBool('TaxTable'): tdb.saveToFile()
        return True
    except:
        self.errorLog(self.zen())
        raise   # Delete this if method error not terrible
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.db().addTable(filename=self.getStr('TDTFile'),mainkeys=self.list['TDTFields'],name='input',expect=True)
        if not self.baseFile(return_none=None): self.baseFile(rje.baseFile(self.getStr('TDTFile')))
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self.prog()); return False   # Setup failed
def legacySetup(self):  ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        ## Set Basefile
        if not self.basefile(return_none=None): self.basefile(rje.baseFile(self.getStr('OccFile')))
        tabkeys = {'OccFile':['dataset','runid','motif','seq','start_pos','end_pos','variant'],
                   'DomFile':['domain','uniprot'],
                   'DMIFile':['motif','domain'],
                   'PPIFile':['hub','spoke']}
        ## Load Tables
        for dfile in ['DomFile','DMIFile','OccFile','PPIFile']:
            dbtable = db.addTable(self.getStr(dfile),mainkeys=tabkeys[dfile],name=dfile,expect=True,replace=False,uselower=True)
            self.tidyMotifNames(dbtable)
            if dfile == 'OccFile':
                #dbtable.addField('uniprot')
                dbtable.addField('gene')
                for entry in dbtable.entries():
                    #entry['uniprot'] = string.split(entry['seq'],'_')[-1]  # Don't want this: uniprot is spoke!
                    entry['gene'] = string.split(entry['seq'],'_')[0]
            elif dfile == 'DomFile':
                dbtable.compress(['domain','uniprot'],default='str')
                dbtable.keepFields(['domain','uniprot'])
            elif dfile == 'DMIFile':
                dbtable.compress(['motif','domain'],default='str')
                dbtable.keepFields(['motif','domain'])
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self.prog()); return False   # Setup failed
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [0] Setup File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getStr('SaveDis').lower() in ['','none']:
            base = 'peptides'
            if rje.checkForFile(self.getStr('Peptides')): base = rje.baseFile(self.getStr('Peptides'))
            if self.baseFile().lower() not in ['','none']: base = self.baseFile()
            self.baseFile(base)
            self.setStr({'SaveDis':'%s.%s.%s' % (base,self.getStr('PeptDis'),self.getStr('PeptCluster'))})
            if self.getStr('OutMatrix') in ['tdt','csv','png','phylip']: self.str['SaveDis'] += '.%s' % self.getStr('OutMatrix')[:3]
            else: self.str['SaveDis'] += '.txt'
        self.dict['Output']['peptides'] = string.join(self.list['Peptides'],'\n')
        ### ~ [1] Setup Distance Matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['AADis'] = rje_dismatrix.DisMatrix(self.log,['nsf2nwk=T']+self.cmd_list)
        self.obj['AADis'].info['Name'] = 'Pairwise AA distances'
        self.obj['PeptDis'] = rje_dismatrix.DisMatrix(self.log,['nsf2nwk=T']+self.cmd_list)
        self.obj['PeptDis'].info['Name'] = 'Pairwise peptide distances'
        ### ~ [2] Optional loading of AA Distance Matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getStr('AADis').lower() not in ['','none']: self.obj['AADis'].loadMatrix(self.getStr('AADis'))
        else:
            self.obj['AAProp'] = aaprop = rje_aaprop.AAPropMatrix(self.log,self.cmd_list)
            #aaprop.readAAProp()    # Does this on loading!
            for aa in aaprop.pdif: self.obj['AADis'].addDis(aa[0],aa[1],aaprop.pdif[aa])
        return True
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def run(self,imenu=False,outputmap=True,returndict=False):  ### Main controlling run Method
    '''
    Main controlling run Method.
    >> imenu:boolean = Whether to initiate interactive menu if appropriate [False].
    >> outputmap:boolean = Whether to output mapping into a file [True]
    >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False]
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.setup(imenu): raise ValueError
        seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file'])
        if not seqlist.seqNum(): self.warnLog('No sequences loaded for mapping.'); return {}
        ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1'])
        blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'),
                      'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))})
        blast.setStat({'HitAln':blast.getStat('OneLine')})
        blast.list['ResTab'] = ['Search','Hit','GABLAM']
        if seqlist.nt(): blast.str['Type'] = 'blastx'
        ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if outputmap: self._setupOutput()   ## Output Files ##
        if returndict: mapdict = {}
        else: self._setupMapped()           ## Previously Mapped Sequences ##
        seqx = seqlist.seqNum()             ## Number of sequences ##
        ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False)
        ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        blast.run(format=True)
        self.obj['DB'] = blast.obj['DB']
        ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.debug(self.getStr('MapDB'))
        self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']])
        self.obj['MapDB'].loadSeq(self.getStr('MapDB'))
        self.debug('%s' % self.obj['MapDB'].list['Seq'])
        sx = 0
        while seqlist.nextSeq() != None:
            search = seqlist.getSeq(format='short')
            sx += 1
            ## Check StartFrom ##
            if self.str['StartFrom']:
                if self.str['StartFrom'] != search:
                    self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx))
                    continue
                self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (self.str['StartFrom'],sx))
                self.str['StartFrom'] = ''
            ## Check if in Mapped ##
            if search in self.list['Mapped']:
                resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'}
                self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas']))
                rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
                continue
            ### Map Sequence ###
            self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx)))
            mapname = self.mapSeq(seqlist,blast,search)
            if returndict: mapdict[search] = mapname
        ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#MAP','Mapping of %s (%s seqs) complete.' % (self.str['SeqIn'],rje.integerString(seqx)))
        if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()): os.unlink(blast.str['Name'])   #!# Add option to keep BLAST! #!#
        if returndict: return mapdict
    except: self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True); raise
def combineSNPs(self):  ### Calculates statistics of genetic differences from parsed PileUp Tables
    '''Calculates statistics of genetic differences from parsed PileUp Tables.'''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.list['SNPTables']: self.printLog('\r#SNP','No SNP tables to add.'); return False
        fdb = self.db().addTable(name='fdr',expect=True,mainkeys=['Locus','Pos'])
        fdb.remakeKeys()    #!# Delete once tuple thing OK
        fdbkeys = fdb.dataKeys()
        self.debug(fdbkeys[:100])
        snps = []
        snppos = []
        for snptable in self.list['SNPTables']:
            snps.append(self.db().addTable(snptable,name=rje.baseFile(snptable,True),expect=True,mainkeys=['Locus','Pos']))
            snps[-1].addField('SNP',evalue="YES")
            self.debug(snps[-1].dataKeys()[:100])
            snps[-1].remakeKeys()   #!# Delete once tuple thing OK
            self.debug(snps[-1].dataKeys()[:100])
            px = 0; ptot = snps[-1].entryNum(); sx = 0
            for pos in snps[-1].dataKeys():     # This should be a (Locus,Pos) tuple
                self.progLog('\r#SNP','Scanning %s for extra SNP positions: %.2f%%' % (snps[-1].name(),px/ptot)); px += 100.0
                if pos not in snppos + fdbkeys: snppos.append(pos); sx += 1
            self.printLog('\r#SNP','Scanned %s for extra SNP positions: %s to add.' % (snps[-1].name(),rje.iStr(sx)))
        ## ~ [0a] Add missing data from other tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if snppos:
            SAMSIG = open('%s.pdiff.tdt' % self.baseFile(),'r'); px = 0; ptot = len(snppos); ix = 0
            fline = SAMSIG.readline(); headers = rje.readDelimit(fline)
            fline = SAMSIG.readline()
            self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
            while fline:
                data = rje.readDelimit(fline); ix += 1
                if (data[0],data[1]) in snppos:
                    entry = {'p.FDR':'-'}
                    for i in range(len(data)): entry[headers[i]] = data[i]
                    fdb.addEntry(entry); px += 1
                    snppos.remove((data[0],data[1]))
                self.progLog('\r#SNP','%s/%s SNP positions added from %s PDiff filelines.' % (rje.iStr(px),rje.iStr(ptot),rje.iStr(ix)))
                if not snppos: break
                fline = SAMSIG.readline()
            SAMSIG.close()
            self.printLog('\r#SNP','%s/%s SNP positions added from PDiff file.' % (rje.iStr(px),rje.iStr(ptot)))
        else: self.printLog('\r#SNP','No SNP positions to add.'); return False
        ### ~ [1] Join Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        temp = fdb
        temp.makeField('#Locus#|#Pos#')
        for snptable in snps:
            snptable.makeField('#Locus#|#Pos#')
            newtemp = self.db().joinTables(name='newtemp',join=[(temp,'#Locus#|#Pos#'),(snptable,'#Locus#|#Pos#',['SNP'])],newkey=['Locus','Pos'],keeptable=True)
            self.printLog('#SNP','Added SNPs from %s' % snptable.name())
            self.db().deleteTable(temp)
            temp = newtemp
            temp.renameField('SNP',snptable.name())
            temp.setStr({'Name':'temp'})
        temp.dropField('#Locus#|#Pos#')
        self.db().list['Tables'].append(temp)
        temp.setStr({'Name':'SNPs'})
        temp.saveToFile()
        return temp
    except: self.errorLog('%s.pileUpStats() error' % (self)); return None
def batchRun(self,returnobj=False):     ### Execute batch mode runs
    '''Execute batch mode runs.'''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        barg = self.getStrLC('BatchArg')
        if not barg: raise ValueError('Cannot use batchrun=FILELIST if batcharg=None.')
        batchfiles = self.list['BatchRun'][0:]
        self.list['BatchRun'] = []      # Avoid recursive running!
        blog = self.getStr('BatchLog')
        if not blog.startswith('.'): blog = '.%s' % blog
        if not blog.endswith('.log'): blog = '%s.log' % blog
        rawcmd = self.cmd_list[0:]
        rawlog = self.log
        batchobj = []
        ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        bx = 0
        for bfile in batchfiles:
            bx += 1
            self.printLog('#BATCH','Batch running %s of %s: %s=%s' % (rje.iStr(bx),rje.iLen(batchfiles),barg,bfile))
            ## Setup parameters
            bbase = rje.baseFile(bfile,strip_path=True)
            bcmd = ['%s=%s' % (barg,bfile)]
            if self.getBool('BatchBase'):
                if blog == '.log': bcmd += ['basefile=%s' % bbase]
                else: bcmd += ['basefile=%s%s' % (bbase,rje.baseFile(blog))]
            elif self.getStrLC('BatchLog'): bcmd += ['log=%s%s' % (bbase,blog)]
            else: bcmd += ['newlog=F']
            #self.debug(bcmd)
            ## Setup Seqsuite object
            self.cmd_list = rawcmd + bcmd
            self.log = rje.setLog(self.log.obj['Info'],self,self.cmd_list)  # Sets up Log object for controlling log file output
            ## Run
            batchobj.append(self.run())
            ## Finish and Tidy
            self.log = rawlog
            runobj = batchobj[-1]
            if runobj:
                if not returnobj: batchobj[-1] = True
                info = runobj.log.obj['Info']
                self.printLog('#RUN','%s V%s run finished.' % (info.program,info.version))
            else: self.warnLog('Batch run failed (%s=%s).' % (barg,bfile))
        ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        failx = batchobj.count(False)
        self.printLog('#BATCH','%s batch runs complete: %s failed.' % (rje.iLen(batchfiles),rje.iStr(failx)))
        self.list['BatchRun'] = batchfiles
        return batchobj
    except: self.errorLog('%s.batchRun error' % self); return False
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Setup Database ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
        db = self.db().addEmptyTable('ProDigIS',['AccNum','Protease','PepCount'],['AccNum','Protease'])
        if self.getInt('MinPepLen') > 0: db.addField('MinPepLen')
        if self.getBool('NRPep'): db.addField('NRPep')
        if rje.exists(self.getStr('Source')):
            fdb = self.db().addTable(self.getStr('Source'),mainkeys=['AccNum'],name='Source')
            fdb.addField('File')
            fdb.addField('ProtMWt')
        else: fdb = self.db().addEmptyTable('Source',['AccNum','File','ProtMWt'],['AccNum'])
        for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i)
        if self.getBool('PepMWt'):
            for i in range(1,self.getInt('MaxPepLen')+1): db.addField(i*100.0)
        ### ~ [2] Load Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autoload=F'])
        self.obj['SeqList'].seq = fullseq = []
        for seqfile in self.list['SeqFiles']:
            file = rje.baseFile(seqfile,True)
            seqlist = rje_seq.SeqList(self.log,['autofilter=T','gnspacc=T','seqnr=F']+self.cmd_list+['seqin=%s' % seqfile,'autoload=T'])
            fullseq += seqlist.seqs()
            for seq in seqlist.seqs():
                accnum = seq.getStr('AccNum')
                try:
                    entry = fdb.data()[accnum]
                    if 'File' in entry and entry['File']:
                        self.errorLog('%s found in %s AND %s!' % (accnum,entry['File'],file),printerror=False)
                    entry['File'] = file
                    entry['ProtMWt'] = seq.MWt()
                except:
                    entry = {'AccNum':accnum,'File':file,'ProtMWt':seq.MWt()}
                    fdb.addEntry(entry)
                self.deBug(fdb.dict['Data'][seq.getStr('AccNum')])
        self.printLog('#SEQ','%s sequences to analyse in total' % rje.iLen(fullseq))
        fdb.fillBlanks()
        ### ~ [3] Setup Peptide Probabilities ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self._peptideProbabilities():
            db.addField('LenExp','PepCount')
            if self.getBool('PepMWt'):
                db.addField('MWtExp','LenExp'); db.addField('Len7Exp','MWtExp')
            else: db.addField('Len7Exp','LenExp')
            db.addField('Len37','Len7Exp')
            if self.getBool('PepMWt'):
                db.addField('Len5','MWtExp'); db.addField('MWt5','Len5')
                db.addField('Len3','MWtExp'); db.addField('MWt3','Len3')
            else: db.addField('Len5','LenExp'); db.addField('Len3','LenExp')
        return
        ### ~ [4] Temp GABLAM Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdb = self.db().addTable('Chlam_Pos.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GABLAM')
        ndb = self.db().addTable('Chlam_Neg.vs.embl_bacteria.hitsum.tdt',['Qry'],name='GNeg')
        self.db().mergeTables(gdb,ndb,overwrite=True,matchfields=True)
        gdb.renameField('Qry','AccNum')
        tmp = self.db().joinTables(name='blast',join=[('Source','AccNum'),('GABLAM','AccNum')],newkey=['AccNum','File'],keeptable=False)
        tmp.saveToFile()
        tmp.compress(['File'],default='mean')
        tmp.dropFields(['AccNum'])
        tmp.info['Name'] = 'blastsum'
        tmp.saveToFile()
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def run(self,batch=False):  ### Main run method
    '''Main run method.'''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not batch: self.setupResults()
        ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not batch and not self.obj['SeqList'].seqs():    ### Look for batch files and run for each
            batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0)
            self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles)))
            if not batchfiles: self.errorLog('No input files found!',printerror=False)
            else:
                bx = 0
                for infile in batchfiles:
                    bx += 1
                    self.printLog('#BATCH','Batch running %s' % infile)
                    bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile]
                    self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd)
                    self.run(batch=True)
                    self.opt['Append'] = True
                    self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False)
            if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1)     # Optional pause for win32
            return
        ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.info['Special'].lower() == 'allbyall':
            self.printLog('#RUN','Performing special "all-by-all" pairwise run')
            self.info['Special'] = ''
            for i in range(len(self.seqs())-1):
                self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i]
                for j in range(i+1,len(self.seqs())):
                    self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1)
                    self.run(batch=True)
                    self.opt['Append'] = True
            self.info['Special'] = 'allbyall'; return
        ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.setup()
        ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.fitness()
        self.phenotype()
        self.grouping()
        for vector in ['Fitness','Phenotype','SeqGroup']:
            if len(self.list[vector]) != self.qry().seqLen():
                self.errorLog('%s vector length (%s) does not match query sequence length (%s)' % (vector,len(self.list[vector]),self.qry().seqLen()),printerror=False)
                raise ValueError
        results = self.price()
        ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True)
        results['Query'] = self.qry().shortName()
        results['Fitness'] = self.info['Fmethod']
        results['Phenotype'] = self.info['Pmethod']
        results['SeqGroup'] = self.info['SeqGroup']
        rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results)
        self.printLog('#OUT','Results output to %s' % self.info['ResFile'])
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def parse(self):    ### Parse REST file into dictionaries
    '''Parse REST file into dictionaries.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['RestKeys'] = []
        rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
        if rje.exists(self.getStr('RestIn')): restin = open(self.getStr('RestIn'),'r').read()
        elif rje.matchExp('^(\d+)$',self.getStr('RestIn')):
            url = '%sretrieve&jobid=%s&password=%s' % (self.getStr('RestURL'),self.getStr('RestIn'),self.getStr('Password'))
            if self.getBool('PureAPI') and self.getStrLC('Rest'): url += '&rest=%s' % (self.getStr('Rest'))
            else: url += '&rest=full'
            restin = urllib2.urlopen(url).read()
            if self.getBool('PureAPI'): return restin
        else: raise IOError('%s not found!' % self.getStr('RestIn'))
        jobid = None
        ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for restdata in string.split(restin,'###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'):
            if not jobid:
                self.dict['Output']['intro'] = restdata
                prog = rje.matchExp('Output for (\S+)',restdata)[0]
                self.dict['Output']['prog'] = prog
                jobid = rje.matchExp('JobID: (\d+)',restdata)[0]
                self.dict['Output']['jobid'] = jobid
                if not self.getStrLC('RestBase'): rbase = '%s%s' % (self.getStr('RestOutDir'),jobid)
                self.dict['Outfile']['jobid'] = '%s.jobid' % (rbase)
                continue
            restlines = string.split(restdata,'\n')
            rparse = string.split(restlines.pop(0))
            if rparse[0] != '#': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
            if rparse[1][-1] != ':': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
            rkey = rparse[1][:-1]
            try: rfile = '%s.%s' % (rbase,rje.baseFile(rparse[2],strip_path=True,keepext=True))
            except: rfile = ''
            if not rfile: rfile = '%s.%s' % (rbase,rkey)
            rfile = string.replace(rfile,'%s.%s.' % (jobid,jobid),'%s.' % jobid)
            self.dict['Output'][rkey] = string.join(restlines,'\n')
            self.dict['Outfile'][rkey] = rfile
            self.list['RestKeys'].append(rkey)
        self.printLog('#PARSE','Parsed %s: %d REST outputs.' % (self.getStr('RestIn'),len(self.dict['Output'])))
        return True
    except: self.errorLog('%s.parse error' % self); return False
def hmmSearch(self,hmm,dbase=None,outfile=None,wait=True):  ### Performs HMMer Search using object attributes
    '''
    Performs HMMer Search using object attributes.
    >> hmm:str = Name of HMM file
    >> dbase:str = Name of DBase file [self.info['SearchDB']]
    >> outfile:str = Name of Output file [self.info['HMMOut']]
    >> wait:boolean = whether to wait for HMMer. [True]
    << returns outfile or None if fails
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not rje.checkForFile(hmm): self.printLog('#ERR','HMM file %s is missing!' % hmm); return None
        if not dbase: dbase = self.info['SearchDB']
        if not rje.checkForFile(dbase): self.printLog('#ERR','Database file "%s" is missing!' % dbase); return None
        ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not outfile or outfile.lower() in ['','none']:   # Make an outfile per search
            outfile = '%s.%s.hmmer' % (rje.baseFile(hmm,True),rje.baseFile(dbase,True))
        resfile = outfile
        if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists('%s.gz' % outfile) and not self.opt['Force']:
            resfile = '%s.gz' % outfile
        if not self.opt['Force'] and rje.isYounger(resfile,hmm) == resfile and rje.isYounger(resfile,dbase) == resfile:
            self.printLog('#HMM','HMM results file "%s" exists.' % resfile)
            return outfile      # Already exists
        else: rje.backup(self,outfile,unlink=True)
        ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['HMMPFam']:
            _command = 'hmmpfam --cut_ga %s %s %s > %s' % (string.join(self.list['HMMOptions']),hmm,dbase,outfile)
        else: _command = 'hmmsearch %s %s %s > %s' % (string.join(self.list['HMMOptions']),hmm,dbase,outfile)
        self.log.printLog('#HMM',_command)
        if not wait: os.system(self.info['HMMerPath'] + _command + ' &')
        elif not os.path.exists(outfile) or self.opt['Force']:
            open(outfile,'a').write(os.popen(self.info['HMMerPath'] + _command).read())
        self.printLog('#HMM','Outfile produced for %s: %s.' % (hmm,outfile))
        if self.opt['GZip']:
            rje.backup(self,'%s.gz' % outfile,unlink=True)
            os.system('gzip %s' % outfile)
            self.printLog('#GZIP','%s gzipped to save space' % outfile)
        return outfile
    except:
        self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
        return None
def setup(self,gtext=''):   ### Main class setup method. gtext will over-ride input file.
    '''Main class setup method. gtext will over-ride input file.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['HTML'] = rje_html.HTML(self.log,self.cmd_list)
        ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('InFile')))
        if self.getStr('OutFile').lower() in ['','none']: self.str['OutFile'] = '%s.html' % self.basefile()
        ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        interms = []
        if gtext:
            delimit = self.getStr('TermSplit')
            if delimit.lower() == 'tab': delimit = '\t'
            if delimit.lower() == 'space': delimit = ' '
            if delimit.lower() == 'comma': delimit = ','
            if delimit.lower() == 'period (.)': delimit = '.'
            if delimit.lower() == 'colon': delimit = ':'
            glossary = {}
            for line in string.split(gtext,'\n'):
                splitline = string.split(line,delimit)
                if delimit == '.' and (splitline[-1] in ['',' ']): splitline = splitline[:-1]
                if not splitline: continue
                (term,definition) = (splitline[0],string.join(splitline[1:],delimit))
                if term == 'Term' and not glossary: continue
                if term:
                    glossary[term] = {'Definition':definition}
                    interms.append(term)
        else:
            try:
                if not self.getBool('KeepOrder') and open(self.getStr('InFile'),'r').readline()[:4] == 'Term':
                    glossary = rje.dataDict(self,self.getStr('InFile'),mainkeys=['Term'],datakeys=['Term','Definition'])
                else: return self.setup(open(self.getStr('InFile'),'r').read())
            except:
                self.errorLog('Problem reading input as dataDict(). Will try as text.')
                return self.setup(open(self.getStr('InFile'),'r').read())
        if self.list['Terms']:
            for term in glossary.keys():
                if term not in self.list['Terms']: glossary.pop(term)
        elif self.getBool('KeepOrder'): self.list['Terms'] = interms
        else: self.list['Terms'] = rje.sortKeys(glossary)
        for term in glossary: glossary[term] = glossary[term]['Definition']
        ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nested = {}
        for term in glossary:
            tdict = nested
            for word in string.split(term.lower()):
                if word not in tdict: tdict[word] = {}
                tdict = tdict[word]
            tdict['='] = glossary[term]
        self.dict['Glossary'] = nested
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
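# A minimal sketch of the nested glossary dictionary that setup() builds above, with
# hypothetical example terms (not data from the class). Each word of a term nests one
# level deeper and the special '=' key holds the definition, so multi-word terms that
# share a prefix share a branch of the dictionary.
def _example_glossary_nest():
    glossary = {'open reading frame':'A stretch of codons with no stop codon.',
                'open access':'Freely available publication.'}
    nested = {}
    for term in glossary:
        tdict = nested
        for word in term.lower().split():
            if word not in tdict: tdict[word] = {}
            tdict = tdict[word]
        tdict['='] = glossary[term]
    # nested == {'open': {'reading': {'frame': {'=': 'A stretch of codons with no stop codon.'}},
    #                     'access': {'=': 'Freely available publication.'}}}
    return nested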
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = self.obj['SeqList']
        if self.getStr('Basefile').lower() in ['','none']: self.str['Basefile'] = rje.baseFile(seqlist.getStr('Name'))
        self.obj['DB'].setInfo({'Basefile':self.str['Basefile']})
        ## ~ [1a] Genetic Code ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        cdb = self.db().addEmptyTable('Code',['Codon','AA'],['Codon'])
        for codon in rje_sequence.genetic_code: cdb.addEntry({'Codon':codon,'AA':rje_sequence.genetic_code[codon]})
        cdb.index('AA')
        ### ~ [2] Calculate Codon Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        codons = rje.sortKeys(rje_sequence.genetic_code)
        db = self.db().addEmptyTable('Codons',['Seq','Len']+codons,['Seq'])
        sx = 0.0; seqx = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#COD','Calculating codon usage: %.2f%%' % (sx/seqx)); sx += 100.0
            entry = rje_sequence.codons(seq.getSequence(),{})
            #self.deBug(entry); self.deBug(entry.values())
            entry['Len'] = sum(entry.values())
            entry['Seq'] = seq.getStr('AccNum')
            db.addEntry(entry)
        self.printLog('\r#COD','Codon usage calculated for %s sequences' % rje.iStr(seqx))
        db.fillBlanks(blank=0,fillempty=True)
        db.saveToFile()
        ### ~ [3] Calculate NT Count Tables ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nt = ['C','A','G','U']
        for i in [1,2,3]:
            for n in ['C','A','G','U']: nt.append('%s|%d' % (n,i))
        ndb = self.db().addEmptyTable('NT',['Seq','Len']+nt,['Seq'])
        sx = 0.0; seqx = seqlist.seqNum()
        for seq in seqlist.seqs():
            self.progLog('\r#NT','Calculating NT Counts: %.2f%%' % (sx/seqx)); sx += 100.0
            entry = rje_sequence.aaFreq(string.replace(seq.getSequence(),'T','U'),{'C':0,'A':0,'G':0,'U':0},False)
            entry['Len'] = sum(entry.values())
            entry['Seq'] = seq.getStr('AccNum')
            centry = db.data(entry['Seq'])
            for i in [1,2,3]:
                for n in ['C','A','G','U']: entry['%s|%d' % (n,i)] = 0
            for codon in codons:
                for i in [1,2,3]:
                    n = codon[i-1]
                    entry['%s|%d' % (n,i)] += centry[codon]
            ndb.addEntry(entry)
        self.printLog('\r#NT','NT Counts calculated for %s sequences' % rje.iStr(seqx))
        ndb.saveToFile()
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def runJobs(self):  ### Runs all the jobs in self.list['SubJobs'] #V1.0
    '''Runs all the jobs in self.list['SubJobs'].'''
    ### ~ [1] ~ Start first set of jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    for j in range(self.getInt('KeepFree'),self.nprocs()): self.nextJob(j)  # Skip first node(s)
    pidcheck = '%s.pid' % rje.baseFile(self.log.info['LogFile'])
    ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    while self.dict['Running']:
        PIDCHECK = open(pidcheck,'w')
        for j in rje.sortKeys(self.dict['Running']):
            if not self.dict['Running'][j]: self.dict['Running'].pop(j); continue   # No more jobs
            try:
                pid = self.dict['Running'][j]['PID']
                PIDCHECK.write('%s: %s\n' % (j,pid))
                if string.split('%s' % pid)[0] == 'WAIT': status = 1
                else: (status,exit_stat) = os.waitpid(pid,os.WNOHANG)
            except: status = 1
            if status > 0: self.endJob(j)   # subjob on processor j has finished: can replace with processing
        PIDCHECK.close()
        time.sleep(self.getInt('SubSleep'))
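# Sketch of the non-blocking child-process check used by runJobs() above, assuming a
# POSIX system: os.waitpid(pid,os.WNOHANG) returns (0,0) while the child is still
# running and (pid,status) once it has exited. The pid and poll interval here are
# hypothetical stand-ins for the values tracked in self.dict['Running'].
import os, time

def _example_poll_child(pid,poll=1):
    while True:
        (status,exit_stat) = os.waitpid(pid,os.WNOHANG)
        if status > 0: return exit_stat     # Child pid has finished
        time.sleep(poll)                    # Still running: sleep before re-polling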
def peptCluster(self):  ### Performs actual peptide clustering and stores results in self.obj['Tree']
    '''Performs actual peptide clustering and stores results in self.obj['Tree'].'''
    try:### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        base = rje.baseFile(self.getStr('SaveDis'))
        pretree = ['treeformats=nwk,text','basefile=%s' % base]
        ### ~ [1] ~ Phylip Neighbor method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getStr('PeptCluster') == 'neighbor':
            disfile = '%s.phy' % base
            fasfile = '%s.fas' % base
            treecmd = ['autoload=T','maketree=neighbor','disin=%s' % disfile,'seqin=%s' % fasfile]
            pretree += ['root=mid']
            if disfile != self.getStr('SaveDis'):
                rje.backup(self,disfile)
                self.obj['PeptDis'].saveMatrix(filename=disfile,format='phylip')    ### Saves matrix
            if 'peptides=%s' % fasfile not in self.cmd_list:
                rje.backup(self,fasfile)
                FAS = open(fasfile,'w')
                for pep in self.list['Peptides']: FAS.write('>%s\n%s\n' % (pep,pep))
                FAS.close()
            tree = self.obj['Tree'] = rje_tree.Tree(self.log,pretree+self.cmd_list+treecmd)
        ### ~ [2] ~ UPGMA method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        else:
            if self.getStr('PeptCluster') not in ['wpgma','upgma']:
                self.errorLog('PeptCluster method "%s" not recognised. Will use UPGMA' % self.getStr('PeptCluster'),printerror=False)
                base = string.replace(base,self.getStr('PeptCluster'),'upgma')
                pretree += ['basefile=%s' % base]
            if self.getStr('PeptCluster') == 'wpgma': nsftree = self.obj['PeptDis'].wpgma()
            else: nsftree = self.obj['PeptDis'].upgma()
            #nwkfile = '%s.nwk' % base
            #treecmd += ['nsfin=%s' % nwkfile]
            #rje.backup(self,nwkfile)
            #open(nwkfile,'w').write(nsftree)
            treecmd = ['autoload=F']
            tree = self.obj['Tree'] = rje_tree.Tree(self.log,pretree+self.cmd_list+treecmd)
            tree.buildTree(nsftree)
        ### ~ [3] ~ Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for node in tree.node:
            if node.info['Name'] in self.list['Peptides']:
                node.stat['ID'] = self.list['Peptides'].index(node.info['Name']) + 1
        tree.saveTrees()
        for outfmt in tree.list['TreeFormats']:
            treefile = '%s.%s' % (tree.info['Basefile'],rje_tree.formatext[outfmt])
            self.dict['Output'][outfmt] = treefile
    except: self.errorLog('%s.peptDis error' % self)
def setup(self):    ### Main class setup method.
    '''
    Main class setup method. This will load sequences into a SeqList object, gaps into a 'gaps' database table,
    and check or generate a PAF file from the mapped long reads.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
        if not self.getStrLC('SeqIn'): raise ValueError('seqin=FILE must be set')
        if not rje.exists(self.getStr('SeqIn')): raise IOError('Unable to read seqin=FILE: "{0}"'.format(self.getStr('SeqIn')))
        seqbase = rje.baseFile(self.getStr('SeqIn'),strip_path=True)
        if not self.getStrLC('Basefile'): self.baseFile(seqbase)
        if rje.checkForFiles(filelist=['.gaps.tdt'],basename=seqbase,log=self.log) and not self.force():
            self.cmd_list.append('gapstats=F')
        else: self.cmd_list.append('gapstats=T')
        seqin = self.seqinObj()
        gapdb = self.db().addTable('%s.gaps.tdt' % seqbase,mainkeys=['seqname','start','end'],name='gaps',ignore=[],expect=True)
        gapdb.dataFormat({'start':'int','end':'int'})
        ### ~ [2] PAF File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not self.getStrLC('PAF'): self.setStr({'PAF':self.baseFile() + '.paf'})
        pfile = self.getStr('PAF')
        if self.force() or not rje.exists(pfile):
            paf = rje_paf.PAF(self.log,self.cmd_list)
            paf.longreadMinimapPAF(pfile)
        if not rje.exists(self.getStr('PAF')): raise IOError('Unable to read or create PAF file: {0}'.format(pfile))
        return True
    except: self.errorLog('Problem during %s setup.' % self.prog()); return False   # Setup failed
def buildHMM(self,seqfile,hmmfile=None):    ### Makes an HMM from a sequence alignment file
    '''
    Makes an HMM from a sequence alignment file.
    >> seqfile:str = Name of sequence file
    >> hmmfile:str = Name of HMM file [*.hmm]
    << hmmfile if made, None if failed.
    '''
    try:
        ### Setup ###
        _hmmpath = self.info['HMMerPath']
        if not hmmfile: hmmfile = '%s.hmm' % rje.baseFile(seqfile)
        ### Build HMM ###
        os.system('%shmmbuild %s %s' % (_hmmpath,hmmfile,seqfile))
        if self.opt['HMMCalibrate']: os.system('%shmmcalibrate %s' % (_hmmpath,hmmfile))
        return hmmfile
        #!# Add error catching during build/calibrate (How?!) #!#
    except:
        self.log.errorLog('Oh my, what a calamity during buildHMM(%s)!' % seqfile)
        return None
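# Illustrative use of buildHMM() and hmmSearch() together (a sketch: `hmmer` is a
# hypothetical instance of this class with HMMerPath and SearchDB configured, and the
# file names are placeholders):
#   hmmfile = hmmer.buildHMM('family.aln.fas')      # runs hmmbuild -> family.aln.hmm
#   if hmmfile: hmmer.hmmSearch(hmmfile,dbase='proteome.fas',wait=True)
# Depending on self.opt['HMMPFam'], the search shells out to one of:
#   hmmpfam --cut_ga <HMMOptions> <hmm> <dbase> > <outfile>
#   hmmsearch <HMMOptions> <hmm> <dbase> > <outfile>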
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Read in Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqfile = self.getStr('SeqIn')
        seqs = rje_seq.SeqList(log=self.log,cmd_list=['i=0']+self.cmd_list+['autofilter=F','autoload=F','seqin=None'])
        self.printLog('#SEQS','Loading sequences from %s' % seqfile)
        if not seqs.loadSeqs(seqfile=seqfile,seqtype='protein',aln=True): raise IOError('Cannot load from %s' % seqfile)
        seqfile = seqs.info['Name']
        basefile = rje.baseFile(seqfile)
        if not self.getStrLC('Basefile'): self.baseFile(basefile)
        self.printLog('#SEQ',"%s protein sequences read from %s\n" % (str(seqs.seqNum()),seqfile),1)
        #?# Add option to generate alignment?
        self.printLog('#SEQ',"Alignment = %s. (%d aa)\n" % (seqs.opt['Aligned'],seqs.seq[0].seqLen()),1)
        self.dict['Output']['seqin'] = seqfile
        ### ~ [2] Read in Tree ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getStrLC('NSFIn'): nsfin = self.getStr('NSFIn')
        else: nsfin = basefile + '.nsf'
        while not os.path.exists(nsfin):
            if self.i() >= 0:
                nsfin = rje.choice(text='Input tree file "%s" not found. Input filename? (Blank to exit.)' % nsfin)
                if nsfin == '': raise KeyboardInterrupt
            else: raise IOError('File %s not found. Cannot load tree!' % nsfin)
        self.dict['Output']['nsfin'] = nsfin
        self.cmd_list.append('nsfin=' + nsfin)
        self.printLog('#TREE','Loading tree from %s' % nsfin)
        self.obj['Tree'] = mytree = rje_tree.Tree(log=self.log,cmd_list=['root=yes']+self.cmd_list)
        mytree.mapSeq(seqlist=seqs)
        mytree.textTree()
        if mytree.opt['ReRooted']: mytree.saveTree(filename='%s.nsf' % basefile)
        return True     # Setup successful
    except KeyboardInterrupt: self.printLog('#CANCEL','User terminated.'); return False
    except: self.errorLog('Problem during %s setup.' % self.prog()); return False   # Setup failed
def classify(self):     ### Generate summary tables for each protein class
    '''Generate summary tables for each protein class.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        db = self.db()
        rankdb = self.db('taxamap')
        for cfile in self.list['Classify']:
            pclass = rje.baseFile(cfile,strip_path=True)
            clist = []
            for fline in open(cfile,'r').readlines():
                prot = string.split(rje.chomp(fline),maxsplit=1)[0]
                if prot: clist.append(prot)
            self.printLog('#CLASS','%s "%s" class proteins read from %s' % (rje.iLen(clist),pclass,cfile))
            if not clist:
                self.warnLog('No proteins read from %s' % (cfile))
                continue
            classdb = db.copyTable(rankdb,pclass)
            classdb.dropEntriesDirect('protein',clist,inverse=True)
            if not classdb.entries():
                self.warnLog('No "%s" proteins found in TaxaMap table' % (pclass))
                continue
            self.summaryScores(classdb,pclass,'MinClass')
    except: self.errorLog('%s.classify() error' % self.prog())
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Sequence file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)
        #!# Add code for memsaver/autoload=F #!#
        self.printLog('#SCAP','%s sequences loaded for SCAP analysis' % rje.integerString(seqlist.seqNum()))
        ## ~ [1b] ~ Xmer background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        mseqfile = self.info['XmerBack']
        if mseqfile.lower() in ['','none']: mseqfile = self.info['XmerBack'] = seqlist.info['Name']
        markov = self.obj['Markov'] = rje_markov.Markov(self.log,['autoload=T','accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % mseqfile,'direction=both','markov=F','scap=T'])
        markov.setup()
        maxx = markov.stat['MaxXmer']
        if self.info['Basefile'].lower() in ['','none']:
            self.info['Basefile'] = '%s.scap' % rje.baseFile(seqlist.info['Name'],True)
            if markov.opt['Sorted']: self.info['Basefile'] = '%s.sorted' % self.info['Basefile']
        basefile = self.info['Basefile']
        self.printLog('#MARKOV','Markov setup complete')
        ## ~ [1c] ~ SCAP Background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        scapfile = self.info['ScapBack']
        if scapfile.lower() in ['','none',seqlist.info['Name'].lower()]: self.obj['ScapBack'] = self.obj['SeqList']
        elif scapfile == mseqfile: self.obj['ScapBack'] = markov.obj['SeqList']
        else: self.obj['ScapBack'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % scapfile])
        self.printLog('#SCAP','%s sequences for SCAP Background' % rje.integerString(seqlist.seqNum()))
        ### ~ [2] Markov Chains ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if mseqfile == seqlist.info['Name']: markov.obj['SeqList'] = seqlist
        elif mseqfile == self.obj['ScapBack'].info['Name']: markov.obj['SeqList'] = self.obj['ScapBack']
        mpickle = markov.unpickleMe()
        if mpickle: markov = self.obj['Markov'] = mpickle
        if not markov.suftree() or not markov.pretree() or maxx > markov.stat['MaxXmer']:
            markov.run()
            markov.pickleMe()
        markov.opt['DeBug'] = self.opt['DeBug']
        self.deBug(markov.opt)
        self.deBug(markov.stat)
        #self.deBug(markov.suftree())
        #self.deBug(markov.pretree())
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def rmdKnit(self,rmdfile,document='html',stdout=False):     ### Knit Rmd to HTML/PDF file
    '''
    Knit Rmd to HTML/PDF file.
    >> rmdfile:str = R markdown file to knit
    >> document:str ['html'] = type of document to knit into
    << success:bool = whether output is generated
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        outfile = '%s.%s' % (rje.baseFile(rmdfile),document)
        rcmd = 'Rscript -e \'library(rmarkdown); rmarkdown::render("%s", "%s_document")\'' % (rmdfile,document)
        self.printLog('#RCMD',rcmd)
        rcmd += ' 2>&1'
        if self.v() < 2 and not stdout: os.popen(rcmd).read()
        else:
            self.progLog('#RCMD','Knitting %s...' % (rmdfile))
            os.system(rcmd)
        success = rje.exists(outfile)
        if success: self.printLog('#RCMD','%s generated from %s' % (outfile,rmdfile))
        else:
            self.printLog('#SYS','If pandoc error, try setting global variable: export RSTUDIO_PANDOC=/Applications/RStudio.app/Contents/MacOS/pandoc')
            self.printLog('#SYS','If no pandoc error, check that required libraries in %s are installed' % rmdfile)
            raise IOError('%s not created' % outfile)
        return True
    except:
        self.errorLog('%s.rmdKnit error: check R installation' % self.prog())
        return False
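# Standalone sketch of the knit call that rmdKnit() wraps, runnable if Rscript and the
# rmarkdown package are installed (the file name here is a hypothetical placeholder):
import os

def _example_knit(rmdfile='report.Rmd',document='html'):
    rcmd = 'Rscript -e \'library(rmarkdown); rmarkdown::render("%s", "%s_document")\'' % (rmdfile,document)
    os.system(rcmd + ' 2>&1')
    outfile = '%s.%s' % (os.path.splitext(rmdfile)[0],document)
    return os.path.exists(outfile)      # True if the document was generated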
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Setup SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['SeqList'] = rje_seq.SeqList(self.log,['keepblast=T']+self.cmd_list+['autofilter=F','align=F','haqbat=None'])
        self.obj['SeqList']._checkForDup(True)
        if not self.seqNum(): self.errorLog('No sequences loaded!',printerror=False); return False
        if self.opt['AddQueries'] and self.name() not in self.obj['SeqList'].list['Blast2Fas']:
            self.obj['SeqList'].list['Blast2Fas'].append(self.name())
        ### ~ [2] Setup Results Directory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.info['HaqDir'].lower() in ['','none']: self.info['HaqDir'] = '%s_HAQESAC/' % rje.baseFile(self.name(),strip_path=True)
        rje.mkDir(self,self.info['HaqDir'])
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def forking(self):  ### Keeps forking out and processing jobs until no more jobs in self.list['Forked'].
    '''Keeps forking out and processing jobs until no more jobs in self.list['Forked'].'''
    ### ~ [1] ~ Start first set of jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    if self.getBool('PIDCheck') or self.dev(): pidcheck = '%s.pid' % rje.baseFile(self.log.info['LogFile'])     # Set *.pid object to match log
    else: pidcheck = False
    #self.deBug(pidcheck)
    ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    while self.list['Forked']:
        if pidcheck: PIDCHECK = open(pidcheck,'w')
        for fdict in self.list['Forked'][0:]:
            try:
                pid = fdict['PID']
                if pidcheck: PIDCHECK.write('%s: %s\n' % (self.list['Forked'].index(fdict),pid))
                if string.split('%s' % pid)[0] == 'WAIT': status = 1
                else: (status,exit_stat) = os.waitpid(pid,os.WNOHANG)
            except:
                self.errorLog('!')
                status = 1
            if status > 0:
                self.list['Forked'].remove(fdict)
                self.endFork(fdict)     # Fork has finished: can replace with processing
        if pidcheck:
            PIDCHECK.close()
            #self.deBug(open(pidcheck,'r').read())
        ## ~ [2a] Look for eternal hanging of threads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if time.time() - self.getNum('KillTime') > self.getNum('KillForks'):
            self.verbose(0,1,'\n%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'),len(self.list['Forked'])),1)
            for fdict in self.list['Forked']:
                self.verbose(0,2,' => Fork %s, PID %d still Active!' % (fdict['ID'],fdict['PID']),1)
            if self.i() < 0 or rje.yesNo('Kill Main Thread?'):
                raise ValueError('%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'),len(self.list['Forked'])))
            elif rje.yesNo('Kill hanging forks?'):
                for fdict in self.list['Forked']:
                    self.printLog('#KILL','Killing Fork %s, PID %d.' % (fdict['ID'],fdict['PID']))
                    os.system('kill %d' % fdict['PID'])
            else: self.setNum({'KillTime':time.time()})
        ## ~ [2b] Sleep ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        time.sleep(self.getNum('ForkSleep'))
def setup(self):    ### Main class setup method.
    '''Main class setup method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        fixfields = ['Location','Name','Artist','Composer','Album']
        db = self.obj['DB'] = rje_db.Database(self.log,self.cmd_list)
        #self.deBug(self.list['iTunes'])
        for ifile in self.list['iTunes']:
            #self.deBug(string.split(open(ifile,'r').readline(),'\t'))
            idb = db.addTable(ifile,mainkeys=['Location'],name=rje.baseFile(ifile,True))
            for field in iformat:
                if iformat[field] == 'del' and field in idb.fields(): idb.dropField(field)
            idb.dataFormat(iformat)
            idb.addField('Album_Artist','Album')
            idb.addField('Tracks',evalue=1)
            if self.getBool('AddScore'): idb.addField('Score',evalue=0)
            for entry in idb.entries():
                for field in fixfields:
                    newval = ''
                    for x in entry[field]:
                        if x.isalnum() or x in '\\/: -_()[].~$': newval += x
                    entry[field] = newval
                entry['Album_Artist'] = entry['Artist']
                try:
                    for divider in ['\\\\','\\',':','/']:
                        if len(string.split(entry['Location'],divider)) > 2:
                            entry['Album_Artist'] = string.split(entry['Location'],divider)[-3]
                            break
                except:
                    self.errorLog('!')
                    self.deBug(entry['Location'])
                if not entry['Plays']: entry['Plays'] = 0
                if not entry['Skips']: entry['Skips'] = 0
                if self.getBool('AddScore'):
                    if entry['My Rating']: entry['Score'] = (entry['My Rating'] - 60) / 20.0
            idb.remakeKeys()
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def hmmTable(self,outfile='',append=False,delimit=None):    ### Outputs results table
    '''
    Outputs results table.
    >> outfile:str = Name of output file
    >> append:boolean = whether to append file
    >> delimit:str = Delimiter to use [\t]
    '''
    try:
        ### Setup ###
        if not outfile: outfile = self.info['HMMTab']
        if outfile.lower() == 'none':
            self.log.printLog('#TAB','HMMTab = "None": No table output')
            return False
        if not delimit: delimit = rje.getDelimit(self.cmd_list,'\t')
        if not outfile: outfile = '%s.hmmer.%s' % (rje.baseFile(self.info['SearchDB'],True),rje.delimitExt(delimit))
        self.readResults()
        self.log.printLog('#TAB','Tabulating results for %s searches into %s' % (len(self.search),outfile),log=False)
        ### Setup Resfile ###
        if self.opt['MySQL']: headers = ['HMM','Hit','Hit_Start','Hit_End','Eval','Score']
        else: headers = ['Type','Name','Start','End','Eval','Score']
        if not append or not os.path.exists(outfile): rje.delimitedFileOutput(self,outfile,headers,delimit,rje_backup=True)
        ### Output Search details ###
        for search in self.search:
            for hit in search.hit:
                for aln in hit.aln:
                    out = {'HMM':search.info['Name'],'Type':search.info['Name'],
                           'Name':hit.info['Name'],'Hit':hit.info['Name'],
                           'Start':'%d' % aln.stat['SbjStart'],'End':'%d' % aln.stat['SbjEnd'],
                           'Hit_Start':'%d' % aln.stat['SbjStart'],'Hit_End':'%d' % aln.stat['SbjEnd'],
                           'Eval':'%.2e' % aln.stat['Expect'],'Score':'%.1f' % aln.stat['BitScore']}
                    rje.delimitedFileOutput(self,outfile,headers,delimit,out)
        self.log.printLog('#OUT','Results for %s searches output to %s.' % (len(self.search),outfile))
    except:
        self.log.errorLog('Fatal Error during hmmTable(%s).' % outfile)
        raise
def save(self):     ### Saves parsed REST output to files
    '''Saves parsed REST output to files.'''
    rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
    rje.mkDir(self,self.getStr('RestOutDir'))
    outputs = rje.sortKeys(self.dict['Output'])
    if self.getStrLC('Rest') in outputs: outputs = [self.getStrLC('Rest')]
    elif self.getStrLC('Rest') in ['full','text']:
        outfile = '%s.rest' % rbase
        open(outfile,'w').write(self.restFullOutput())
        self.printLog('#OUT','%s: %s' % (self.getStrLC('Rest'),outfile))
        return True
    elif self.getStrLC('Rest'):
        self.printLog('#OUTFMT','REST output format "%s" not recognised.' % self.getStrLC('Rest'))
        if self.i() < 0 or not rje.yesNo('Output all parsed outputs?'): return False
        outfile = '%s.rest' % rbase
        open(outfile,'w').write(self.restFullOutput())
        self.printLog('#OUT','full: %s' % (outfile))
        return True
    for rkey in outputs:
        if rkey in self.dict['Outfile']:
            rje.backup(self,self.dict['Outfile'][rkey])
            open(self.dict['Outfile'][rkey],'w').write(self.dict['Output'][rkey])
            self.printLog('#OUT','%s: %s' % (rkey,self.dict['Outfile'][rkey]))
        elif rkey not in ['intro']: self.warnLog('No outfile parsed/generated for %s output' % rkey)
def katKmers(self,assembly=None,kmerfiles=None,basefile=None,force=None,trim10x=True): ### Performs read kmer kat sect analysis ''' Performs read kmer kat sect analysis. Generates: - '{0}.kat-stats.tsv'.format(basefile) = kmer summary per sequence - '{0}.kat-counts.cvg'.format(basefile) = kmer counts per position (CVG format) >> assembly:str [None] = Assembly file. Will use self.getStr('SeqIn') if None >> kmerfiles:list [None] = files for setting kmers to count (self.list['KmerReads'] if None) >> basefile:str [None] = output file prefix (self.baseFile() if None) >> force:bool [None] = whether to overwrite existing files (self.force() if None) >> trim10x:bool [True] = Whether to check 10xtrim setting. << katfile or None if failed ''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not self.checkForKat(report=True): return None if not assembly: assembly = self.getStr('SeqIn') seqin = assembly if kmerfiles: if type(kmerfiles) == type('str'): kmerfiles = [kmerfiles] else: if not self.list['KmerReads']: self.printLog('#KAT','Cannot use KAT kmer analysis without KmerReads data') return None kmerfiles = self.list['KmerReads'] rje.checkForFiles(filelist=[seqin]+kmerfiles,basename='',log=self.log,cutshort=False,ioerror=True,missingtext='Not found: aborting KAT run.') if not basefile: basefile = self.baseFile(return_none=None) if not basefile: basefile = rje.baseFile(assembly,strip_path=True); self.baseFile(basefile) # Default to assembly name and keep basefile and self.baseFile() in step if force == None: force = self.force() katfile = '{}.kat-stats.tsv'.format(basefile) # seq_name median mean gc% seq_length kmers_in_seq invalid_kmers %_invalid non_zero_kmers %_non_zero %_non_zero_corrected katcvg = '{}.kat-counts.cvg'.format(basefile) #i# Check for files if not force and rje.checkForFiles(filelist=[katfile,katcvg],basename='',log=self.log,cutshort=False,ioerror=False,missingtext='Not found: will generate.'): return katfile self.backup(katfile,appendable=False) self.backup(katcvg,appendable=False) ### ~ [2] Run KAT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### katcall = 'kat sect -t {} -o {}.kat {} {}'.format(self.threads(),basefile,seqin,' '.join(kmerfiles)) if trim10x and self.getBool('10xTrim'): trim5 = ['16'] + ['0'] * (len(self.list['KmerReads'])-1) trim5 = ','.join(trim5) katcall = 'kat sect -t {} --5ptrim {} -o {}.kat {} {}'.format(self.threads(),trim5,basefile,seqin,' '.join(kmerfiles)) self.printLog('#SYS',katcall) #i# Catching completion in case KAT hangs after running KAT = os.popen(katcall) katline = KAT.readline() while katline and not katline.startswith('Total runtime'): katline = KAT.readline() # An empty readline() means EOF: stop waiting if KAT dies early KAT.close() ### ~ [3] Check for outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if rje.checkForFiles(filelist=[katfile,katcvg],basename='',log=self.log,cutshort=False,ioerror=True,missingtext='Not found: KAT failed?'): return katfile except: self.errorLog('%s.katKmers error' % self.prog()) return None
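#i# Minimal sketch (assumptions: `kat` on the PATH; file names hypothetical) of the `kat sect` call that
#i# katKmers() assembles. With 10xtrim=T the first (barcoded) read file gets a 16 bp 5' trim and the rest get zero.
def katSectCall(threads, basefile, seqin, kmerfiles, trim10x=False):
    '''Return the kat sect command line as a string.'''
    if trim10x:
        trim5 = ','.join(['16'] + ['0'] * (len(kmerfiles) - 1))
        return 'kat sect -t %d --5ptrim %s -o %s.kat %s %s' % (threads, trim5, basefile, seqin, ' '.join(kmerfiles))
    return 'kat sect -t %d -o %s.kat %s %s' % (threads, basefile, seqin, ' '.join(kmerfiles))
#i# katSectCall(4, 'assembly', 'assembly.fasta', ['reads1.fq', 'reads2.fq'], True)
#i# -> 'kat sect -t 4 --5ptrim 16,0 -o assembly.kat assembly.fasta reads1.fq reads2.fq'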
def inSilicoHybrid(self): ### Filter and combine subreads from parents and output to fasta file. ''' Filter and combine subreads from parents and output to fasta file. This module generates balanced "in silico diploid" PacBio subread data from two sequenced haploid parents. Each parent must first be run through SMRTSCAPE to generate subread summary data. (This will be performed if missing. Each parent needs a `*.fofn` file of subread file names, a `*.unique.tdt` unique subreads table and a `*.smrt.tdt` SMRT cell identifier table.) A new set of subreads is then generated from the combined set of parent subreads. This is done by first ranking the unique subreads from each parent by length. First, the longest subreads from each parent are compared and the shorter selected to be the first subread of the diploid. (The shorter is taken to minimise length differences between the two parents.) Next, the longest subread from the other parent that is no longer than the previous subread is added. This cycles, picking a read from the parent with fewest cumulative bases each cycle. The longest subread that is no longer than the previous subread is selected. This continues until one parent runs out of subreads. Additional subreads will be added from the other parent if they reduce the difference in cumulative output for each parent. Final output will be a `*.subreads.fasta` file in which each parent has a similar total sequence content and for which the subread length distributions should also be similar. This is to overcome biases in resulting diploid assemblies, where one parent has higher quality data than the other. NOTE: If performing downstream filtering by Read Quality (RQ), this might reintroduce a bias if one parent has much higher RQ values than the other. The `rqfilter=X` setting can therefore be used to restrict output to reads with a minimum RQ value. By default this is 0.84. If you do not get enough sequence output, this setting may need to be relaxed.
''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [0a] Parent 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.printLog('#~~#','# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 1 ~~~~~~~~~~~~~~~~~~~~ #') self.printLog('#FOFN','Parent1: %s' % self.getStr('Parent1')) base1 = rje.baseFile(self.getStr('Parent1')) parent1 = smrtscape.SMRTSCAPE(self.log,['genomesize=13.1e6']+self.cmd_list+['batch=%s' % self.getStr('Parent1'),'basefile=%s' % base1]) parent1.setup() udb1 = parent1.udb() cdb = parent1.db('smrt',add=True,mainkeys=['Name']) cdb.dataFormat({'SMRT':'int'}) cx = cdb.entryNum() ## ~ [0b] Parent 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.printLog('#~~#','# ~~~~~~~~~~~~~~~~~~~~ SETUP PARENT 2 ~~~~~~~~~~~~~~~~~~~~ #') self.printLog('#FOFN','Parent2: %s' % self.getStr('Parent2')) base2 = rje.baseFile(self.getStr('Parent2')) parent2 = smrtscape.SMRTSCAPE(self.log,['genomesize=13.1e6']+self.cmd_list+['batch=%s' % self.getStr('Parent2'),'basefile=%s' % base2]) parent2.setup() udb2 = parent2.udb() cdb2 = parent2.db('smrt',add=True,mainkeys=['Name']) cdb2.dataFormat({'SMRT':'int'}) # Shift all of the Parent2 SMRT IDs to avoid conflict with Parent1 for entry in cdb2.entries() + udb2.entries(): entry['SMRT'] = entry['SMRT'] + cx cdb = parent1.db().mergeTables(cdb,cdb2) ## ~ [0c] Output Sequence File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.printLog('#~~#','# ~~~~~~~~~~~~~~~~~~~~ DIPLOIDOCUS SUBREADS ~~~~~~~~~~~~~~~~~~~~ #') minlen = self.getInt('LenFilter') minrq = self.getNum('RQFilter') rqstr = '%s' % minrq filtfile = '%s.L%sRQ%s.fasta' % (self.baseFile(),minlen,rqstr[2:]) ## ~ [0d] Input Sequence Files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## seqbatch = [] # List of SeqList objects self.printLog('#BATCH','%s sequence files to process.' % rje.iLen(parent1.list['Batch'] + parent2.list['Batch'])) for seqfile in parent1.list['Batch'] + parent2.list['Batch']: seqcmd = self.cmd_list + ['seqmode=file','autoload=T','summarise=F','seqin=%s' % seqfile,'autofilter=F'] seqbatch.append(rje_seqlist.SeqList(self.log,seqcmd)) self.printLog('#BATCH','%s sequence files to summarise.' % rje.iLen(seqbatch)) if not seqbatch: raise IOError('No batch input fasta files found! Make sure parent1=FILE and parent2=FILE settings give *.fofn files of subread fasta files.'
) ## ~ [0e] Setup subread lists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## elists = [ udb1.sortedEntries('Len', reverse=True), udb2.sortedEntries('Len', reverse=True) ] plen = [0, 0] # Summed lengths for each parent pseq = [0, 0] # Total sequence number for each parent prq = [0, 0] # Total sequence RQ for each parent (convert to mean) if not elists[0] or not elists[1]: raise ValueError( 'No Unique ZMW subreads for one or both parents!') lastlen = max(elists[0][0]['Len'], elists[1][0]['Len']) # Length of last selected read for elist in elists: while elist and elist[0]['RQ'] < minrq: elist.pop(0) if not elists[0] or not elists[1]: raise ValueError( 'No Unique ZMW subreads for one or both parents!') nextp = 0 # Index of next parent to use if elists[0][0]['Len'] < elists[1][0]['Len']: nextp = 1 ### ~ [1] Filter and Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] Filter Unique Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## zmwlist = [] # List of (smrt,zmw) meeting filtering criteria ux = 0.0 utot = len(elists[0]) + len(elists[1]) while lastlen: self.progLog('\r#DIP', 'Diploidising subreads: %.2f%%' % (ux / utot)) elist = elists[nextp] while elist and elist[0]['RQ'] < minrq: elist.pop(0) ux += 100.0 if elist and elist[0]['Len'] < minlen: ux += 100.0 * len(elist) elist = [] if not elist: nextp = 1 - nextp break # Finish entry = elist.pop(0) ux += 100.0 zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos'])) plen[nextp] += entry['Len'] prq[nextp] += entry['RQ'] pseq[nextp] += 1 if plen[1 - nextp] <= plen[nextp]: nextp = 1 - nextp lastlen = entry['Len'] ## ~ [1b] Final processing of last reads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## while elists[nextp]: elist = elists[nextp] while elist and elist[0]['RQ'] < minrq: self.progLog('\r#DIP', 'Diploidising subreads: %.2f%%' % (ux / utot)) elist.pop(0) ux += 100.0 while elist and elist[0]['Len'] >= minlen: self.progLog('\r#DIP', 'Diploidising subreads: %.2f%%' % (ux / utot)) entry = elist.pop(0) ux += 100.0 pdiff = rje.modulus(plen[0] - plen[1]) ediff = rje.modulus(plen[nextp] + entry['Len'] - plen[1 - nextp]) if ediff >= pdiff: elists[nextp] = [] break #Finish! zmwlist.append((entry['SMRT'], entry['ZMW'], entry['Pos'])) plen[nextp] += entry['Len'] prq[nextp] += entry['RQ'] pseq[nextp] += 1 self.printLog( '\r#DIP', 'Diploidising subreads complete: %s subreads to output.' % rje.iLen(zmwlist)) self.printLog( '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' % (self.getStr('Parent1'), rje.iStr(pseq[0]), rje.iStr(plen[0]), 1.0 * plen[0] / self.getInt('GenomeSize'), prq[0] / pseq[0])) self.printLog( '\r#DIP', '%s: %s seq; %s bp (%.1fX); %.3f mean RQ.' 
% (self.getStr('Parent2'),rje.iStr(pseq[1]),rje.iStr(plen[1]),1.0*plen[1]/self.getInt('GenomeSize'),prq[1]/pseq[1])) ## ~ [1c] Extract Filtered Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## rje.backup(self,filtfile) SEQOUT = open(filtfile,'w') sx = 0.0 stot = 0 sn = len(seqbatch) fx = 0 for seqlist in seqbatch: #>m150625_001530_42272_c100792502550000001823157609091582_s1_p0/9/0_3967 RQ=0.784 si = 100.0/seqlist.seqNum() stot += seqlist.seqNum() for seq in seqlist.seqs(): self.progLog('\r#OUT','Extracting subreads: %.2f%%' % (sx/sn)) sx += si (name,sequence) = seqlist.getSeq(seq) try: [smrt,zmw,pos,rq] = string.split(string.replace(name,'/',' ')) except: [smrt,zmw,pos] = string.split(string.replace(name,'/',' ')) rq = minrq if (cdb.data(smrt)['SMRT'],int(zmw),pos) not in zmwlist: continue SEQOUT.write('>%s\n%s\n' % (name,sequence)) fx += 1 self.printLog('\r#OUT','Saved %s filtered subreads to %s.' % (rje.iStr(fx),filtfile)) SEQOUT.close() # Flush and close the output before it is re-read for summarising ### ~ [2] Summarise Filtered File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### seqcmd = self.cmd_list + ['seqmode=file','autoload=T','summarise=T','seqin=%s' % filtfile,'autofilter=F'] rje_seqlist.SeqList(self.log,seqcmd) return True except: self.errorLog('%s.run error' % self.prog()) return False
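#i# Simplified sketch (toy lengths, not the production code above) of the core balancing loop: each parent's
#i# unique subread lengths are sorted longest first, and the parent with fewer cumulative bases supplies the
#i# next pick, starting with the shorter of the two longest reads.
def balanceSubreads(lens1, lens2):
    '''Interleave two subread length lists into a balanced selection per parent.'''
    elists = [sorted(lens1, reverse=True), sorted(lens2, reverse=True)]
    picked = [[], []]; plen = [0, 0]
    nextp = 0 if elists[0][0] <= elists[1][0] else 1    # Shorter "longest read" goes first
    while elists[0] and elists[1]:
        entry = elists[nextp].pop(0)
        picked[nextp].append(entry); plen[nextp] += entry
        if plen[1 - nextp] <= plen[nextp]: nextp = 1 - nextp    # Parent with fewer bases picks next
    return picked, plen
#i# balanceSubreads([9, 7, 5, 3], [8, 6, 4, 2]) -> ([[9, 7, 5], [8, 6, 4, 2]], [21, 20])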
def mapPhosByBLAST(self,fasfile): ### BLAST sequences against phosphoDB, align hits & mark sites (ID & Homology) '''BLAST sequences against phosphoDB, align hits and mark phosphosites (ID & Homology).''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] Setup fasfile ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## scmd = self.cmd_list + ['seqin=%s' % fasfile,'autoload=T','autofilter=F'] qseqlist = rje_seq.SeqList(self.log,scmd) qdict = qseqlist.seqNameDic() ## ~ [1b] Setup results files/directories ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## basefile = rje.baseFile(fasfile) if self.info['PhosRes'].lower() in ['','none']: self.info['PhosRes'] = '%s.phosres.tdt' % basefile headers = ['Name','Pos','AA','PELM','PELMPos','Evidence'] delimit = rje.getDelimit(self.cmd_list,rje.delimitFromExt(filename=self.info['PhosRes'])) rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,rje_backup=True) ppath = rje.makePath('PhosALN') rje.mkDir(self,ppath) ## ~ [1c] Setup BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## pblast = rje_blast.BLASTRun(self.log,self.cmd_list+['formatdb=F']) pblast.setInfo({'Name':'%s.p.blast' % rje.baseFile(fasfile),'DBase':self.info['PELMFas'],'InFile':fasfile}) pblast.setStat({'HitAln':pblast.stat['OneLine']}) pblast.opt['Complexity Filter'] = False pblast.formatDB(force=False) ## ~ [1d] Setup GABLAM Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## gkey = 'GABLAMO ID' #x# % self.info['GABLAMO Key'] for g in ['ID','Hom']: if self.stat['%sSim' % g] < 1.0: self.stat['%sSim' % g] *= 100.0 self.stat['%sSim' % g] = max(0.0,self.stat['%sSim' % g]) ### ~ [2] PhosphoBLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### pblast.blast(use_existing=True,log=True) # BLAST pblast.readBLAST(gablam=True) # Read in while pblast.search: ## ~ [2a] Align relevant hits from each BLAST ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## search = pblast.search.pop(0) qseq = qdict[search.info['Name']] idlist = [] qlen = qseq.aaLen() hitdict = search.hitSeq(self.obj['SeqList']) aln = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F','autofilter=F']) aln.seq = [qseq] pdict = {} # Dictionary of {hseq:[poslist]} rdict = {qseq:0} # Dictionary of {hseq:res} for hit in search.hit[0:]: hseq = hitdict[hit] pdict[hseq] = [] for pos in rje.sortKeys(self.dict['PhosphoSites'][hseq.info['AccNum']]): pdict[hseq].append(pos) if hit.info['Name'] == search.info['Name']: if qseq.getSequence(case=False,gaps=False) != hseq.getSequence(case=False,gaps=False): self.log.errorLog('Major problem: Search/Hit sequence mismatch for same sequence "%s"' % hit.info['Name']) idlist.append(qseq) pdict[qseq] = pdict.pop(hseq) continue gdict = hit.globalFromLocal(qlen) qvh = float(100 * gdict['Query'][gkey]) / float(qlen) if qvh < self.stat['HomSim']: pdict.pop(hseq) continue aln.seq.append(hseq) if (qseq.sameSpec(hseq) or not self.opt['UseSpec']) and qvh >= self.stat['IDSim']: idlist.append(hseq) rdict[hseq] = 0 aln.muscleAln() #x#outfile='%s%s.phosaln.fas' % (ppath,qseq.info['AccNum'])) aln._addSeq('PhosAln','-' * qseq.seqLen()) aln.info['Name'] = '%s%s.phosaln.fas' % (ppath,qseq.info['AccNum']) ## ~ [2b] Map phosphorylations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## #x#print '>>>\n', aln.seq, pdict.keys(), rdict.keys() for a in range(qseq.seqLen()): if qseq.info['Sequence'][a] != '-': rdict[qseq] += 1 for hseq in pdict: if
hseq.info['Sequence'][a] == '-': continue if hseq != qseq: rdict[hseq] += 1 if rdict[hseq] in pdict[hseq] and qseq.info['Sequence'][a] == hseq.info['Sequence'][a]: # Phosphosite pdata = {'Name':search.info['Name'],'Pos':rdict[qseq],'AA':qseq.info['Sequence'][a], 'PELM':hseq.shortName(),'PELMPos':rdict[hseq],'Evidence':'Hom'} if hseq == qseq: pdata['Evidence'] = 'Self' elif hseq in idlist: pdata['Evidence'] = 'ID' rje.delimitedFileOutput(self,self.info['PhosRes'],headers,delimit,pdata) self.addPhos(aln.seq[-1],a,pdata['Evidence']) ## ~ [2c] Add Scansite/NetPhos if made? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ## ~ [2d] Save alignment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## aln.saveFasta() # Align hits for each > X %ID # Map phosphosites onto alignment and output # return except: self.log.errorLog('Problem during PhosphoSeq.mapPhosByBLAST')
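#i# Minimal sketch (hypothetical sequences) of the residue-numbering trick in [2b] above: walking aligned
#i# sequences column by column, a per-sequence counter is advanced on non-gap characters so that alignment
#i# columns can be converted into ungapped residue positions for site mapping.
def mapColumns(qaln, haln):
    '''Return {query_pos: hit_pos} for columns where both sequences have residues.'''
    mapping = {}
    qpos = hpos = 0
    for (qa, ha) in zip(qaln, haln):
        if qa != '-': qpos += 1
        if ha != '-': hpos += 1
        if qa != '-' and ha != '-': mapping[qpos] = hpos
    return mapping
#i# mapColumns('MK-LT', 'MKPL-') -> {1: 1, 2: 2, 3: 4}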
def batchRun(self, returnobj=False): ### Execute batch mode runs '''Execute batch mode runs.''' try: ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### barg = self.getStrLC('BatchArg') if not barg: raise ValueError( 'Cannot use batchrun=FILELIST if batcharg=None.') batchfiles = self.list['BatchRun'][0:] self.list['BatchRun'] = [] # Avoid recursive running! blog = self.getStr('BatchLog') if not blog.startswith('.'): blog = '.%s' % blog if not blog.endswith('.log'): blog = '%s.log' % blog rawcmd = self.cmd_list[0:] rawlog = self.log batchobj = [] ### ~ [1] Batch Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### bx = 0 for bfile in batchfiles: bx += 1 self.printLog( '#BATCH', 'Batch running %s of %s: %s=%s' % (rje.iStr(bx), rje.iLen(batchfiles), barg, bfile)) ## Setup parameters bbase = rje.baseFile(bfile, strip_path=True) bcmd = ['%s=%s' % (barg, bfile)] if self.getBool('BatchBase'): if blog == '.log': bcmd += ['basefile=%s' % bbase] else: bcmd += ['basefile=%s%s' % (bbase, rje.baseFile(blog))] elif self.getStrLC('BatchLog'): bcmd += ['log=%s%s' % (bbase, blog)] else: bcmd += ['newlog=F'] #self.debug(bcmd) ## Setup Seqsuite object self.cmd_list = rawcmd + bcmd self.log = rje.setLog( self.log.obj['Info'], self, self.cmd_list ) # Sets up Log object for controlling log file output ## Run batchobj.append(self.run()) ## Finish and Tidy self.log = rawlog runobj = batchobj[-1] if runobj: if not returnobj: batchobj[-1] = True info = runobj.log.obj['Info'] self.printLog( '#RUN', '%s V%s run finished.' % (info.program, info.version)) else: self.warnLog('Batch run failed (%s=%s).' % (barg, bfile)) ### ~ [2] Finish and Return ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### failx = batchobj.count(False) self.printLog( '#BATCH', '%s batch runs complete: %s failed.' % (rje.iLen(batchfiles), rje.iStr(failx))) self.list['BatchRun'] = batchfiles return batchobj except: self.errorLog('%s.batchRun error' % self) return False
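#i# Sketch (hypothetical settings, simplified defaults) of the per-file command expansion in batchRun() above:
#i# each batch file is substituted into the batcharg setting and given its own log file.
import os
def batchCommands(batchfiles, barg, blog='.log'):
    '''Return the extra command list generated for each batch file.'''
    cmds = []
    for bfile in batchfiles:
        bbase = os.path.splitext(os.path.basename(bfile))[0]
        cmds.append(['%s=%s' % (barg, bfile), 'log=%s%s' % (bbase, blog)])
    return cmds
#i# batchCommands(['dataA.fas', 'dataB.fas'], 'seqin') ->
#i# [['seqin=dataA.fas', 'log=dataA.log'], ['seqin=dataB.fas', 'log=dataB.log']]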
def setup(self,gtext=''): ### Main class setup method. gtext will over-ride input file. '''Main class setup method. gtext will over-ride input file.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.obj['HTML'] = rje_html.HTML(self.log,self.cmd_list) ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('InFile'))) if self.getStr('OutFile').lower() in ['','none']: self.str['OutFile'] = '%s.html' % self.basefile() ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## interms = [] if gtext: delimit = self.getStr('TermSplit') if delimit.lower() == 'tab': delimit = '\t' if delimit.lower() == 'space': delimit = ' ' if delimit.lower() == 'comma': delimit = ',' if delimit.lower() == 'period (.)': delimit = '.' if delimit.lower() == 'colon': delimit = ':' glossary = {} for line in string.split(gtext,'\n'): splitline = string.split(line,delimit) if delimit == '.' and (splitline[-1] in ['',' ']): splitline = splitline[:-1] if not splitline: continue (term,definition) = (splitline[0],string.join(splitline[1:],delimit)) if term == 'Term' and not glossary: continue if term: glossary[term] = {'Definition':definition} interms.append(term) else: try: if not self.getBool('KeepOrder') and open(self.getStr('InFile'),'r').readline()[:4] == 'Term': glossary = rje.dataDict(self,self.getStr('InFile'),mainkeys=['Term'],datakeys=['Term','Definition']) else: return self.setup(open(self.getStr('InFile'),'r').read()) except: self.errorLog('Problem reading input as dataDict(). Will try as text.') return self.setup(open(self.getStr('InFile'),'r').read()) if self.list['Terms']: for term in list(glossary.keys()): # Iterate over a copy of the keys so terms can be popped without breaking dict iteration if term not in self.list['Terms']: glossary.pop(term) elif self.getBool('KeepOrder'): self.list['Terms'] = interms else: self.list['Terms'] = rje.sortKeys(glossary) for term in glossary: glossary[term] = glossary[term]['Definition'] ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### nested = {} for term in glossary: tdict = nested for word in string.split(term.lower()): if word not in tdict: tdict[word] = {} tdict = tdict[word] tdict['='] = glossary[term] self.dict['Glossary'] = nested return True # Setup successful except: self.errorLog('Problem during %s setup.' % self) return False # Setup failed
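#i# Sketch of the nested glossary structure built in [2] above: each word of a term keys one level of nesting
#i# and '=' holds the definition, so multi-word terms share prefixes. (Hypothetical terms for illustration.)
def nestGlossary(glossary):
    '''Convert {term: definition} into the nested word dictionary.'''
    nested = {}
    for term in glossary:
        tdict = nested
        for word in term.lower().split():
            tdict = tdict.setdefault(word, {})
        tdict['='] = glossary[term]
    return nested
#i# nestGlossary({'open reading frame': 'ORF', 'open source': 'OSS'}) ->
#i# {'open': {'reading': {'frame': {'=': 'ORF'}}, 'source': {'=': 'OSS'}}}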
def setup(self): ### Main class setup method. '''Main class setup method.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.obj['DB'] = rje_db.Database(self.log,self.cmd_list+['tuplekeys=T']) if self.baseFile().lower() in ['','none']: self.baseFile('%s.vs.%s.Q%d' % (rje.baseFile(self.getStr('MutPileup'),True),rje.baseFile(self.getStr('WTPileup'),True),self.getInt('QCut'))) if not self.force() and os.path.exists('%s.fdr.tdt' % self.baseFile()): return ### ~ [2] Look for/process WT Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if self.force() or not os.path.exists('%s.WT.tdt' % self.baseFile()): self.parsePileup('WT',self.getStr('WTPileup')) ### ~ [3] Generate Reference sequences and Major Alleles (by locus) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### refseq = {}; rx = 0 majors = {} locus = None WTDATA = open('%s.WT.tdt' % self.baseFile(),'r'); wx = 0 for line in WTDATA: self.progLog('\r#WT','Reading WT data: Reference seq length = %s nt' % (rje.iStr(rx)),rand=0.01) data = rje.readDelimit(line); wx += 1 if data[0] == 'Locus': continue else: if data[0] != locus: locus = data[0]; refseq[locus] = ''; majors[locus] = [] pos = int(data[1]) while (pos - 1) > len(refseq[locus]): refseq[locus] += '?'; rx += 1 while (pos - 1) > len(majors[locus]): majors[locus].append('-') refseq[locus] += data[2]; majors[locus].append(data[5]); rx += len(data[2]) WTDATA.close() self.printLog('\r#WT','%s lines read from WT data: Reference seq length = %s nt' % (rje.iStr(wx),rje.iStr(rx))) for locus in rje.sortKeys(majors): if len(majors[locus]) != len(refseq[locus]): self.errorLog('%s WTMajor versus RefSeq length mismatch!' % locus,printerror=False); raise ValueError self.dict['WTMajor'] = majors self.dict['RefSeq'] = refseq ### ~ [4] Look for/process Mutant Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if self.force() or not os.path.exists('%s.Mut.tdt' % self.baseFile()): self.parsePileup('Mut',self.getStr('MutPileup'),True) return True # Setup successful except: self.errorLog('Problem during %s setup.' % self); return False # Setup failed
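#i# Sketch (toy rows) of the reference reconstruction in [3] above: pileup rows may skip positions, so the
#i# reference string is padded with '?' up to each reported 1-based position before the base is appended.
def buildRefseq(rows):
    '''rows = [(pos, base)] with 1-based positions, possibly with gaps; returns the padded string.'''
    refseq = ''
    for (pos, base) in rows:
        while (pos - 1) > len(refseq): refseq += '?'
        refseq += base
    return refseq
#i# buildRefseq([(1, 'A'), (2, 'C'), (5, 'G')]) -> 'AC??G'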
def treeListSPCode(self): ### Main taxa mapping from list of tree files '''Main taxa mapping from list of tree files.''' try: ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### db = self.db() specdb = self.db('spcode', add=True, forcecheck=True, mainkeys=['protein']) if not specdb and self.getStrLC('TaxBase') and not self.force(): spfile = '%s.spcode.tdt' % self.getStr('TaxBase') specdb = db.addTable(spfile, mainkeys=['protein'], name='spcode', expect=False) if specdb: specdb.dataFormat({'boot': 'num'}) return True specdb = db.addEmptyTable( 'spcode', ['protein', 'boot', 'spcode', 'inpara', 'paralogues'], ['protein']) #dupdb = db.addEmptyTable('para',['protein','paralogues'],['protein']) self.dict['Duplicates'] = {} # {prot1:[dups]} ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for nwkfile in self.list['NwkList']: tree = rje_tree.Tree(self.log, self.cmd_list) tree.loadTree(nwkfile, seqlist=None, postprocess=False) seqacc = rje.baseFile(nwkfile, strip_path=True) # Identify node corresponding to query sequence seqnode = None for node in tree.nodes(): try: if string.split(node.shortName(), '__')[1] == seqacc: seqnode = node except: pass # Internal node or bad sequence format if not seqnode: self.warnLog('Could not find %s in %s nodes!' % (seqacc, nwkfile)) continue # Get species code for query sequence seqspec = tree.cladeSpec(seqnode) if len(seqspec) != 1: self.warnLog('Could not find species in %s node!' % (seqacc)) continue seqspec = seqspec.keys()[0] if seqspec != string.split(seqnode.shortName(), '_')[1]: raise ValueError('Species mismatch for %s & %s' % (seqacc, seqnode.shortName())) # Find ancestor with closest orthologue outgroup rootnode = tree._getRootNode() if not rootnode: self.warnLog('Could not find root node in %s!' % (nwkfile)) continue ancnode = seqnode.ancNode() try: bootx = float(ancnode.ancBranch().stat['Bootstrap'] ) / tree.stat['Bootstraps'] except: bootx = 1.0 inparanode = None # Node to define in-paralogues ancspec = tree.cladeSpec(ancnode) while len(ancspec) < 2 or bootx < self.getNum('MinBoot'): inparanode = ancnode # All same species if ancnode == rootnode: break ancnode = ancnode.ancNode() ancspec = tree.cladeSpec(ancnode) try: bootx = float(ancnode.ancBranch().stat['Bootstrap'] ) / tree.stat['Bootstraps'] except: bootx = 1.0 ancspec.pop( seqspec) # Now only have counts of closest other species # Update table, replacing species codes with genera? 
sentry = { 'protein': seqacc, 'spcode': rje.sortUnique(ancspec.keys()) } sentry['boot'] = bootx if not ancspec: sentry['spcode'] = ['None'] sentry['boot'] = self.getNum('NoneBoot') sentry['spcode'] = string.join(sentry['spcode'], '|') # Establish list of duplicate proteins inpara = [] # List of in-paralogue nodes inparacc = [] # List of in-paralogue accnum if inparanode: inpara = tree._nodeClade(inparanode, internal=False) self.dict['Duplicates'][seqacc] = [] for node in tree._nodeClade(rootnode, internal=False): if node == seqnode: continue if len(string.split(node.shortName(), '_')) < 2: continue if string.split(node.shortName(), '_')[1] == seqspec: paracc = string.split(node.shortName(), '__')[1] if node in inpara: inparacc.append(paracc) else: self.dict['Duplicates'][seqacc].append(paracc) sentry['inpara'] = string.join(inparacc, '|') sentry['paralogues'] = string.join( self.dict['Duplicates'][seqacc], '|') specdb.addEntry(sentry) ## Update specdb and save specdb.saveToFile() #dupdb.saveToFile() return True except: self.errorLog(self.zen()) return False
def _pepDis(self): ### Peptide Distance ''' Peptide Distance. ''' try: ### <0> ### Setup seqlist = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T']) dismatrix = rje_dismatrix.DisMatrix(self.log, self.cmd_list) dismatrix.info['Name'] = self.info['Method'] dismatrix.opt['Symmetric'] = True if self.info['Method'] in ['ds_prop', 'tot_prop', 'best_prop']: aaprop = rje_aaprop.AAPropMatrix(self.log, self.cmd_list) #aaprop.readAAProp() aaprop.makePropDif() elif self.info['Method'] == 'pam': pam = rje_pam.PamCtrl(log=self.log, cmd_list=self.cmd_list) ### <1> ### Make DisMatrix for seq1 in seqlist.seq: for seq2 in seqlist.seq: if seqlist.seq.index(seq1) > seqlist.seq.index( seq2): # No need to calculate - symmetrical! continue dis = 0 if seq1 == seq2 and self.info['OutMatrix'] == 'phylip': dis = 0 elif self.info['Method'] in ['ds_prop', 'ds_id']: (self_dis1, self_dis2) = (0, 0) for r1 in range(seq1.seqLen()): for r2 in range(r1, seq2.seqLen()): (a1, a2) = (seq1.info['Sequence'][r1], seq2.info['Sequence'][r2]) (s1, s2) = (seq1.info['Sequence'][r2], seq2.info['Sequence'][r1]) phys_dis = r2 - r1 if self.info['Method'] == 'ds_prop': dis += (aaprop.pdif['%s%s' % (a1, a2)] * (seq1.seqLen() - phys_dis)) self_dis1 += (aaprop.pdif['%s%s' % (a1, s1)] * (seq1.seqLen() - phys_dis)) self_dis2 += (aaprop.pdif['%s%s' % (a2, s2)] * (seq1.seqLen() - phys_dis)) elif self.info[ 'Method'] == 'ds_id' and a1 != a2: dis += (seq1.seqLen() - phys_dis) if self.info['Method'] == 'ds_id' and a1 != s1: self_dis1 += (seq1.seqLen() - phys_dis) if self.info['Method'] == 'ds_id' and a2 != s2: self_dis2 += (seq1.seqLen() - phys_dis) dis -= (self_dis1 + self_dis2) / 2.0 elif self.info['Method'] == 'tot_prop': proptot = {} for property in aaprop.prop.keys(): proptot[property] = {seq1: 0.0, seq2: 0.0} for seq in [seq1, seq2]: for r in range(seq.seqLen()): aa = seq.info['Sequence'][r] for property in aaprop.prop.keys(): proptot[property][seq] += string.atof( aaprop.prop[property][aa]) for property in aaprop.prop.keys(): if proptot[property][seq1] > proptot[property][ seq2]: dis += (proptot[property][seq1] - proptot[property][seq2]) else: dis += (proptot[property][seq2] - proptot[property][seq1]) elif self.info['Method'] == 'pam': dis = pam.pamML(ancseq=seq1.info['Sequence'], descseq=seq2.info['Sequence']) elif self.info['Method'] == 'best_prop': min_dis = seq1.seqLen() * len(aaprop.prop) pepseq1 = seq1.info['Sequence'] for c in range(seq1.seqLen()): # Circular start dis = 0 pepseq2 = seq2.info['Sequence'][c:] + seq2.info[ 'Sequence'][:c] for r in range(seq1.seqLen()): (a1, a2) = (pepseq1[r], pepseq2[r]) dis += aaprop.pdif['%s%s' % (a1, a2)] if dis < min_dis: min_dis = dis dis = min_dis dismatrix.addDis(seq1, seq2, dis) ### <2> ### Output if self.info['OutMatrix'] == 'phylip': delimit = ' ' format = 'phylip' else: delimit = rje.getDelimit(self.cmd_list, ',') format = 'None' outfile = '%s.%s.%s' % (rje.baseFile( seqlist.info['Name'], True), self.info['Method'], rje.delimitExt(delimit)) dismatrix.saveMatrix(seqlist.seq, outfile, delimit, format=format) except: self.log.errorLog('Error in _pepDis', printerror=True, quitchoice=False) raise # Delete this if method error not terrible
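#i# Toy sketch of the 'best_prop' circular comparison in _pepDis() above: every rotation of the second peptide
#i# is scored against the first and the minimum distance kept (simple mismatch counts stand in here for the
#i# amino acid property distances used by the real method).
def bestCircularDis(pep1, pep2):
    '''Minimum mismatch count over all circular rotations of pep2.'''
    best = len(pep1)
    for c in range(len(pep2)):
        rotated = pep2[c:] + pep2[:c]
        dis = sum(1 for (a1, a2) in zip(pep1, rotated) if a1 != a2)
        best = min(best, dis)
    return best
#i# bestCircularDis('PEPTIDE', 'TIDEPEP') -> 0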
def forking( self ): ### Keeps forking out and processing jobs until no more jobs in self.list['Forked']. '''Keeps forking out and processing jobs until no more jobs in self.list['Forked'].''' ### ~ [1] ~ Start first set of jobs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if self.getBool('PIDCheck') or self.dev(): pidcheck = '%s.pid' % rje.baseFile( self.log.info['LogFile']) # Set *.pid object to match log else: pidcheck = None #self.deBug(pidcheck) ### ~ [2] ~ Monitor jobs and set next one running as they finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### while self.list['Forked']: if not self.getBool('LogFork'): self.progLog( '\r#FORK', 'Forking jobs: {0} running; {1} remain.'.format( len(self.list['Forked']), rje.iLen(self.list['ToFork']))) if pidcheck: PIDCHECK = open(pidcheck, 'w') for fdict in self.list['Forked'][0:]: try: pid = fdict['PID'] if pidcheck: PIDCHECK.write('%s: %s\n' % (self.list['Forked'].index(fdict), pid)) if string.split('%s' % pid)[0] == 'WAIT': status = 1 else: (status, exit_stat) = os.waitpid(pid, os.WNOHANG) except: self.errorLog('!') status = 1 if status > 0: self.list['Forked'].remove(fdict) self.endFork( fdict ) # Fork has finished: can replace with processing if pidcheck: PIDCHECK.close() #self.deBug(open(pidcheck,'r').read()) ## ~ [2a] Look for eternal hanging of threads ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if time.time() - self.getNum('KillTime') > self.getNum( 'KillForks'): self.verbose( 0, 1, '\n%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'), len(self.list['Forked'])), 1) for fdict in self.list['Forked']: self.verbose( 0, 2, ' => Fork %s, PID %d still Active!' % (fdict['ID'], fdict['PID']), 1) if (self.i() < 0 and self.getBool('KillMain')) or rje.yesNo( 'Kill Main Thread?', default={ True: 'N', False: 'Y' }[self.getBool('KillMain')]): raise ValueError( '%d seconds of main thread inactivity. %d forks still active!' % (self.getNum('KillForks'), len(self.list['Forked']))) elif self.i() < 0 or rje.yesNo('Kill hanging forks?'): self.printLog( '#KILL', 'KillForks=%d seconds walltime reached.' % (self.getNum('KillForks'))) for fdict in self.list['Forked']: self.printLog( '#KILL', 'Killing Fork %s, PID %d.' % (fdict['ID'], fdict['PID'])) os.system('kill %d' % fdict['PID']) else: self.setNum({'KillTime': time.time()}) ## ~ [2b] Sleep ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## time.sleep(self.getNum('ForkSleep'))
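#i# Minimal sketch (POSIX only; assumes the pids are children of this process) of the non-blocking poll used in
#i# forking() above: os.waitpid with os.WNOHANG returns (0, 0) while a child is still running, so finished forks
#i# can be reaped without blocking the main thread.
import os
def pollForks(pids):
    '''Return the subset of child pids that have finished.'''
    finished = []
    for pid in pids:
        try:
            (done, status) = os.waitpid(pid, os.WNOHANG)
            if done > 0: finished.append(pid)
        except OSError: finished.append(pid)    # Already reaped or not a child of this process
    return finished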
def picsi(self): ### Cleans up cross-species search results '''Cleans up cross-species search results.''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### datafile = self.info['SumFile'] delimit = rje.delimitFromExt(filename=self.info['SumFile']) data = {} # search:{hit:{???}} pep2prot = {} # search:{peptide:[hits]} id2prot = {} # search:{id:hit} prot2desc = {} fullpeplist = {} pepcon = {} # Convert pep:longer pep speclist = [] # List of species codes ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True) for ikey in rje.sortKeys(indata): (search,id) = string.split(ikey,delimit) prot = indata[ikey]['prot_acc'][0] desc = string.replace(indata[ikey]['prot_desc'][0],'Full=','') if desc[3:7] == 'Name': desc = desc[9:] prot2desc[prot] = desc; self.printLog('#DESC','%s = %s' % (prot,desc)) indata[ikey]['pep_seq'] = string.join(indata[ikey]['pep_seq'],'|') pepconv = string.replace(indata[ikey]['pep_seq'],'I','L') pepconv = string.replace(pepconv,'Q','K') peplist = rje.sortUnique(string.split(pepconv,'|')) indata[ikey]['pep_seq'] = string.join(rje.sortUnique(string.split(indata[ikey]['pep_seq'],'|')),'|') if search not in data: data[search] = {} pep2prot[search] = {} id2prot[search] = {} fullpeplist[search] = [] pepcon[search] = {} fullpeplist[search] += peplist id2prot[search][id] = prot spec = string.split(prot,'_')[1] if spec not in speclist: speclist.append(spec) data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':id,'desc':desc,'spec':spec, 'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:], 'pep_rem':0} try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum'] except: data[search][prot]['accnum'] = string.split(prot,'__')[-1] for pep in peplist: if pep not in pep2prot[search]: pep2prot[search][pep] = [] pep2prot[search][pep].append(prot) ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## for search in fullpeplist: fullpeplist[search] = rje.sortUnique(fullpeplist[search]) for pep in fullpeplist[search][0:]: for pep2 in fullpeplist[search]: if pep != pep2 and pep in pep2: pepcon[search][pep] = pep2 fullpeplist[search].remove(pep) break for pep in pepcon[search]: while pepcon[search][pep] in pepcon[search]: pepcon[search][pep] = pepcon[search][pepcon[search][pep]] # Follow the conversion chain through to the longest peptide self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search)) #self.deBug(pepcon[search]) #self.deBug(rje.sortKeys(pep2prot[search])) pp = 0; pm = 0 for prot in data[search]: for pep in data[search][prot]['conpep'][0:]: if pep in pepcon[search]: newpep = pepcon[search][pep] if newpep not in data[search][prot]['conpep']: data[search][prot]['conpep'].append(newpep); pp += 1 data[search][prot]['conpep'].remove(pep); pm += 1 # Count each removed (converted) peptide if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot) if pep in pep2prot[search]: pep2prot[search].pop(pep) data[search][prot]['pep_con'] = len(data[search][prot]['conpep']) self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm)) ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for search in pep2prot: ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## remx = 0 for prot in data[search]: if data[search][prot]['spec'] != self.info['QrySpec']: continue for pep in
data[search][prot]['conpep']: for prot2 in pep2prot[search][pep][0:]: if data[search][prot2]['spec'] == self.info['QrySpec']: continue pep2prot[search][pep].remove(prot2) data[search][prot2]['conpep'].remove(pep) data[search][prot2]['pep_rem'] += 1; remx += 1 self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,self.info['QrySpec'])) ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## for prot in data[search]: if len(data[search][prot]['conpep']) < 2: for pep in data[search][prot]['conpep']: #if pep in pep2prot[search] and prot in pep2prot[search][pep]: pep2prot[search][pep].remove(prot) ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ux = 0 for pep in pep2prot[search]: #self.deBug(pep) if len(pep2prot[search][pep]) == 1: data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1 self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search)) ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0} rx = 0 for prot in data[search]: #if data[search][prot]['unique']: data[search][prot]['red'] = False; continue data[search][prot]['pep_red'] = 0 # Redundant peptides found in proteins with unique peptides data[search][prot]['pep_nr'] = 0 # Redundant peptides found only in proteins without unique peptides for pep in data[search][prot]['conpep']: if pep2prot[search][pep] == [prot]: continue upep = False for prot2 in pep2prot[search][pep]: if data[search][prot2]['pep_uniq']: upep = True; break if upep: data[search][prot]['pep_red'] += 1 # Redundant peptide found in unique protein else: data[search][prot]['pep_nr'] += 1 # Redundant peptide NOT found in unique protein if len(data[search][prot]['conpep']) < 2: data[search][prot]['class'] = 'REJECT'; rx += 1 elif data[search][prot]['pep_uniq']: data[search][prot]['class'] = 'UNIQUE' elif data[search][prot]['pep_nr']: data[search][prot]['class'] = 'NR' else: data[search][prot]['class'] = 'REDUNDANT'; rx += 1 summary[data[search][prot]['class']] += 1 self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search)) for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x)) ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### speclist.sort() species = {} for spec in speclist: try: grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read() species[spec] = string.split(grep,':')[-4] self.printLog('#SPEC','%s = %s' % (spec,species[spec])) except: species[spec] = '?' 
### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile']) headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep'] if self.dict['Acc2Seq']: headers.insert(3,'cluster') rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True) for search in rje.sortKeys(data): if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search]) for prot in rje.sortKeys(data[search]): if rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc']): data[search][prot]['species'] = rje.matchExp('^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])[1] else: data[search][prot]['species'] = species[data[search][prot]['spec']] rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot]) except: self.errorLog('Problem during %s.picsi()' % self)
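#i# Compact restatement (toy function, not from the module) of the [2d] classification above: hits with fewer
#i# than two peptides are rejected; any unique peptide makes a hit UNIQUE; a shared peptide absent from all
#i# unique-bearing hits makes it NR; otherwise it is REDUNDANT.
def classifyHit(npep, pep_uniq, pep_nr):
    '''Classify a protein hit from its peptide counts.'''
    if npep < 2: return 'REJECT'
    if pep_uniq: return 'UNIQUE'
    if pep_nr: return 'NR'
    return 'REDUNDANT'
#i# classifyHit(5, 0, 2) -> 'NR'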
def hmmSearch( self, hmm, dbase=None, outfile=None, wait=True): ### Performs HMMer Search using object attributes ''' Performs HMMer Search using object attributes. >> hmm:str = Name of HMM file >> dbase:str = Name of DBase file [self.info['SearchDB']] >> outfile:str = Name of Output file file [self.info['HMMOut']] >> wait:boolean = whether to wait for HMMer. [True] << returns outfile or None if fails ''' try: ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if not rje.checkForFile(hmm): self.printLog('#ERR', 'HMM file %s is missing!' % hmm) return None if not dbase: dbase = self.info['SearchDB'] if not rje.checkForFile(dbase): self.printLog('#ERR', 'Database file "%s" is missing!' % dbase) return None ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if not outfile or outfile.lower() in [ '', 'none' ]: # Make an outfile per search outfile = '%s.%s.hmmer' % (rje.baseFile( hmm, True), rje.baseFile(dbase, True)) resfile = outfile if not os.path.exists( outfile) and self.opt['GZip'] and os.path.exists( '%s.gz' % outfile) and not self.opt['Force']: resfile = '%s.gz' % outfile if not self.opt['Force'] and rje.isYounger( resfile, hmm) == resfile and rje.isYounger( resfile, dbase) == resfile: self.printLog('#HMM', 'HMM results file "%s" exists.' % resfile) return outfile # Already exists else: rje.backup(self, outfile, unlink=True) ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if self.opt['HMMPFam']: _command = 'hmmpfam --cut_ga %s %s %s > %s' % (string.join( self.list['HMMOptions']), hmm, dbase, outfile) else: _command = 'hmmsearch %s %s %s > %s' % (string.join( self.list['HMMOptions']), hmm, dbase, outfile) self.log.printLog('#HMM', _command) if not wait: os.system(self.info['HMMerPath'] + _command + ' &') elif not os.path.exists(outfile) or self.opt['Force']: open(outfile, 'a').write( os.popen(self.info['HMMerPath'] + _command).read()) self.printLog('#HMM', 'Outfile produced for %s: %s.' % (hmm, outfile)) if self.opt['GZip']: rje.backup(self, '%s.gz' % outfile, unlink=True) os.system('gzip %s' % outfile) self.printLog('#GZIP', '%s gzipped to save space' % outfile) return outfile except: self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm) return None
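#i# Sketch of the command assembly in hmmSearch() above (paths hypothetical): hmmpfam is used for PFam searches
#i# with the gathering threshold, hmmsearch otherwise, with stdout redirected to the results file.
def hmmCommand(hmmpfam, options, hmm, dbase, outfile):
    '''Return the HMMer command line as a string.'''
    if hmmpfam:
        return 'hmmpfam --cut_ga %s %s %s > %s' % (' '.join(options), hmm, dbase, outfile)
    return 'hmmsearch %s %s %s > %s' % (' '.join(options), hmm, dbase, outfile)
#i# hmmCommand(False, ['-E', '0.01'], 'my.hmm', 'db.fas', 'out.hmmer')
#i# -> 'hmmsearch -E 0.01 my.hmm db.fas > out.hmmer'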
def uniFake( self, seqs=[], store=False ): ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs. '''Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### unifake = string.split(string.join(self.list['UniFake']).lower()) seqlist = self.obj['SeqList'] if seqs: seqlist.seq = seqs else: seqs = seqlist.seq (sx, seqnum) = (0, seqlist.seqNum()) ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## uniprot = rje_uniprot.UniProt( self.log, self.cmd_list) # UniProt object for saving data if self.info['DatOut'].lower() in ['', 'none']: self.info['DatOut'] = rje.baseFile( seqlist.info['Name']) + '.dat' datfile = self.info['DatOut'] if os.path.exists(datfile): rje.backup(self, datfile) if store: seqlist.obj['UniProt'] = uniprot ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if 'pfam' in unifake: hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T']) hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile) if os.path.exists(hmmfile): rje.backup(self, hmmfile) hmm.list['HMM'] = [self.info['PFam']] hmm.opt['HMMPFam'] = True else: hmm = None ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if 'signalp' in unifake: tm = rje_tm.TM(self.log, self.cmd_list) else: tm = None ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for seq in seqs: sx += 1 name = seq.shortName() self.printLog( '#SEQ', 'Processing %s (%s aa) %s...' % (seq.shortName(), rje.integerString( seq.aaLen()), seq.info['Description'][:50])) try: ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## utmp = 'tmp%s.%s' % (rje.randomString(5), seq.info['AccNum']) open('%s.fas' % utmp, 'w').write( '>%s\n%s\n' % (seq.shortName(), seq.info['Sequence'])) udata = { 'CC': ['-!- Features generated using unifake.py'], 'AC': [] } if seq.info['SpecCode'] in ['Unknown', 'UNK']: seq.info['SpecCode'] = self.info['SPCode'] #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']] #!# Check how well this works. Add spectable? 
#!# ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if self.opt['EnsDat'] and rje.matchExp( '\[acc:(\S+) pep:(\S+) gene:(\S+)\]', seq.info['Name']): details = rje.matchExp( '\[acc:(\S+) pep:(\S+) gene:(\S+)\]', seq.info['Name']) self.addAlias(seq.info['AccNum'], details[0]) self.addAlias(seq.info['AccNum'], details[1]) self.addAlias(seq.info['AccNum'], details[2]) udata['GN'] = [details[2]] for id in [seq.shortName(), seq.info['AccNum']]: if id in self.dict['Aliases']: udata['AC'].append( '%s;' % string.join(self.dict['Aliases'][id], '; ')) ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## ft = [] # List of features for sequence for id in [ seq.shortName(), seq.info['AccNum'], seq.info['ID'] ]: if id in self.dict['Features']: ft += self.dict['Features'][id] ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if 'disorder' in self.list['UniFake']: try: seq.disorder() dis = seq.obj['Disorder'] for disorder in seq.obj['Disorder'].list[ 'RegionDisorder']: ft.append({ 'Type': 'DISORDER', 'Desc': 'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'], 'Start': disorder[0], 'End': disorder[1] }) if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % ( ft[-1]['Desc'], dis.stat['IUCut']) for fold in seq.obj['Disorder'].list['RegionFold']: ft.append({ 'Type': 'ORDER', 'Desc': 'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'], 'Start': fold[0], 'End': fold[1] }) if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % ( ft[-1]['Desc'], dis.stat['IUCut']) except: self.log.errorLog( 'UniFake disorder problem for %s.' % name) ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if hmm: try: hmm.setInfo({ 'SearchDB': '%s.fas' % utmp, 'HMMOut': '%s.hmm.out' % utmp }) # This will be made for each sequence hmm.search = [] hmm.list['HMMRes'] = [ hmm.hmmSearch(self.info['PFam'], outfile=hmm.info['HMMOut']) ] # Used in hmmTable hmm.hmmTable(outfile=hmmfile, append=True) if 'disorder' in self.list['UniFake']: disorder = seq.obj['Disorder'].list[ 'ResidueDisorder'] # individual (IUPRed) residue results else: disorder = [] if hmm.search: udata['CC'].append( 'PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'], time.ctime( os.path.getmtime(self.info['PFam'])))) else: udata['CC'].append( '-!- ERROR: PFam HMMer Search failure!') out = {'Type': '!ERROR!', 'Name': name} rje.delimitedFileOutput( self, hmmfile, [ 'Type', 'Name', 'Start', 'End', 'Eval', 'Score' ], datadict=out) for search in hmm.search: for hit in search.hit: for aln in hit.aln: pfamft = { 'Start': aln.stat['SbjStart'], 'End': aln.stat['SbjEnd'], 'Type': 'PFAM', 'Desc': '%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'], aln.stat['Expect'], aln.stat['BitScore']) } if disorder: region = disorder[ aln.stat['SbjStart'] - 1:aln.stat['SbjEnd']] hmmdisorder = float( sum(region)) / len(region) pfamft[ 'Desc'] = '%s; IUPRed: %.2f' % ( pfamft['Desc'], hmmdisorder) if hmmdisorder < self.stat[ 'DisDom']: pfamft['Type'] = 'DOMAIN' ft.append(pfamft) except: self.log.errorLog( 'UniFake PFam HMM problem for %s.' 
% name) ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if 'tmhmm' in unifake: try: tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'],utmp)).readlines() domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0])) for tmdom in domlist: ft.append(tmdom) ft[-1]['Desc'] = 'TMHMM topology prediction' ft[-1]['Start'] = string.atoi(ft[-1]['Start']) ft[-1]['End'] = string.atoi(ft[-1]['End']) if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist)-1)/2,domlist[0]['Type'])) else: udata['CC'].append('TMHMM: 0 TM domains') except: self.log.errorLog('UniFake TMHMM problem for %s.' % name) ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if 'signalp' in unifake: try: os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'],utmp,utmp)) tm.signalp = {} tm.parseSignalP('%s.signalp' % utmp) sigp = tm.signalp.pop(seq.shortName()) cpos = 0 if sigp['nn_ymax?'] == 'Y': cpos = string.atoi(sigp['nn_ymaxpos']) desc = 'SignalP NN prediction' if sigp['hmm_cmax?'] == 'Y': hmm_c = string.atoi(sigp['hmm_cmaxpos']) if cpos == 0: cpos = hmm_c desc = 'SignalP HMM prediction' else: if hmm_c < cpos: cpos = hmm_c desc = 'SignalP HMM prediction (NN also Y)' else: desc += ' (HMM also Y)' if cpos > 0: ft.append({'Type':'SIGNALP','Desc':desc,'Start':1,'End':cpos}) except: self.log.errorLog('UniFake SignalP problem for %s.' % name) ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.addRealUniProt(seq,udata,ft) self.deBug(ft) if not store: uniprot.list['Entry'] = [] if uniprot.addFromSeq(seq,data=udata,ft=ft): ### Converts into UniProtEntry object if not store: uniprot.saveUniProt(datfile,append=True) #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName()) ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## except: self.log.errorLog('Problem during UniFake(%s)' % name) for tmp in glob.glob('%s*' % utmp): os.unlink(tmp) self.printLog('#UNIFAKE','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx),rje.integerString(seqnum-sx)),log=False) if store: uniprot.saveUniProt(datfile,append=False) if self.opt['CleanUp']: for tmp in glob.glob('TMHMM*'): if os.path.isdir(tmp): os.rmdir(tmp) except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
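#i# Sketch (field names as parsed from SignalP short output above) of the cleavage-site logic in [2g]: the NN
#i# prediction is used unless the HMM also fires, in which case the earlier of the two C-positions wins and
#i# the description records both.
def signalpCleavage(nn_y, nn_pos, hmm_y, hmm_pos):
    '''Return (cleavage_pos, desc); cleavage_pos 0 = no signal peptide predicted.'''
    (cpos, desc) = (0, '')
    if nn_y == 'Y': (cpos, desc) = (nn_pos, 'SignalP NN prediction')
    if hmm_y == 'Y':
        if cpos == 0: (cpos, desc) = (hmm_pos, 'SignalP HMM prediction')
        elif hmm_pos < cpos: (cpos, desc) = (hmm_pos, 'SignalP HMM prediction (NN also Y)')
        else: desc += ' (HMM also Y)'
    return (cpos, desc)
#i# signalpCleavage('Y', 24, 'Y', 22) -> (22, 'SignalP HMM prediction (NN also Y)')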
def rfAtt(self): ### Observed vs expected amino acid and dipeptide frequencies per reading frame ''' Calculates observed and expected amino acid and dipeptide frequencies for each of the six reading frames of the loaded nucleotide sequences and outputs observed/expected ratios to *.rf.tdt. ''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### rfhead = ['Att','RF1','RF2','RF3','RF-1','RF-2','RF-3','ObsRF1','ObsRF2','ObsRF3','ObsRF-1','ObsRF-2','ObsRF-3','ExpRF1','ExpRF2','ExpRF3','ExpRF-1','ExpRF-2','ExpRF-3'] rfdata = {} rfobs = {} rfexp = {} ntfreq = {} for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']: rfdata[rf] = {} rfobs[rf] = {} rfexp[rf] = {} for x in rje_seq.alph_protx[:-1] + ['*']: rfdata[rf][x] = 0 rfobs[rf][x] = 0 rfexp[rf][x] = 0 for a1 in rje_seq.alph_protx[:-1] + ['*']: for a2 in rje_seq.alph_protx[:-1] + ['*']: rfdata[rf]['%s%s' % (a1,a2)] = 0 rfobs[rf]['%s%s' % (a1,a2)] = 0 rfexp[rf]['%s%s' % (a1,a2)] = 0 for x in rje_seq.alph_dna[:-1]: ntfreq[x] = 0 seqlist = self.obj['SeqList'] ### ~ [2] Count sequence attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (sx,stot) = (0.0,seqlist.seqNum()) for seq in seqlist.seq: self.progLog('\r#ATT','Counting sequence attributes: %.2f%%' % (sx/stot)) sx += 100.0 for x in seq.info['Sequence']: if x in ntfreq: ntfreq[x] += 1 rf6 = rje_sequence.sixFrameTranslation(seq.info['Sequence']) for r in rf6: rseq = rf6[r] rf = 'RF%d' % r for i in range(len(rseq)): a = rseq[i] dia = rseq[i:i+2] if a in rfdata[rf]: rfdata[rf][a] += 1 if dia in rfdata[rf]: rfdata[rf][dia] += 1 self.printLog('\r#ATT','Counting sequence attributes complete.') ### ~ [3] Calculate Observed & Expected ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ntobs = rje.dictFreq(ntfreq,total=True,newdict=True) ntcomp = {'Total':ntobs['Total']} for xy in ['AT','GC']: ntcomp[xy[0]] = ntobs[xy[1]] ntcomp[xy[1]] = ntobs[xy[0]] for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']: aafreq = {} for a in rje_seq.alph_protx[:-1] + ['*']: aafreq[a] = rfdata[rf][a] aafreq = rje.dictFreq(aafreq,total=True,newdict=True) for a in rje_seq.alph_protx[:-1] + ['*']: rfobs[rf][a] = rfdata[rf][a] rfexp[rf][a] = 0 for n1 in 'GATC': for n2 in 'GATC': for n3 in 'GATC': codon = '%s%s%s' % (n1,n2,n3) aa = rje_sequence.dna2prot(codon) if rf[-2] == '-': rfexp[rf][aa] += (int(ntobs['Total']/3.0) * ntcomp[n1] * ntcomp[n2] * ntcomp[n3]) else: rfexp[rf][aa] += (int(ntobs['Total']/3.0) * ntobs[n1] * ntobs[n2] * ntobs[n3]) #self.deBug('%s: %s x %s x %s x %s' % (aa,(ntobs['Total'] - 2), rfobs[rf][n1], rfobs[rf][n2], rfobs[rf][n3])) #self.deBug('%s: %s' % (aa,rfexp[rf][aa])) for a1 in rje_seq.alph_protx[:-1] + ['*']: for a2 in rje_seq.alph_protx[:-1] + ['*']: rfexp[rf]['%s%s' % (a1,a2)] = (aafreq['Total'] - 1) * aafreq[a1] * aafreq[a2] rfobs[rf]['%s%s' % (a1,a2)] = rfdata[rf]['%s%s' % (a1,a2)] ### ~ [4] Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### rfile = rje.baseFile(seqlist.info['Name']) + '.rf.tdt' rje.delimitedFileOutput(self,rfile,rfhead,rje_backup=True) for a in rje_seq.alph_protx[:-1] + ['*']: data = {'Att':a} for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']: data['Obs%s' % rf] = rfobs[rf][a] data['Exp%s' % rf] = '%.2f' % rfexp[rf][a] data[rf] = rje.expectString(rfobs[rf][a]/rfexp[rf][a]) rje.delimitedFileOutput(self,rfile,rfhead,datadict=data) for a1 in rje_seq.alph_protx[:-1] + ['*']: for a2 in rje_seq.alph_protx[:-1] + ['*']: a = '%s%s' % (a1,a2) data = {'Att':a} for rf in ['RF1','RF2','RF3','RF-1','RF-2','RF-3']: data['Obs%s' % rf] = rfobs[rf][a]
data['Exp%s' % rf] = '%.2f' % rfexp[rf][a] data[rf] = rje.expectString(rfobs[rf][a] / rfexp[rf][a]) rje.delimitedFileOutput(self, rfile, rfhead, datadict=data) self.printLog('#TDT', 'TDT output complete.') except: self.errorLog(rje_zen.Zen().wisdom()) raise # Delete this if method error not terrible
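    # --- Illustrative sketch (not part of the original module) of the expectation logic in
    # [3] above: the expected count of a residue in one forward frame is the number of codons
    # in that frame times the product of the nucleotide frequencies, summed over all codons
    # translating to that residue (reverse frames swap in complemented frequencies, as the
    # ntcomp dictionary does above). `translate` stands in for rje_sequence.dna2prot();
    # _demoExpectedAA is a hypothetical name.
    def _demoExpectedAA(ntfreq, ncodons, translate):
        '''Expected amino acid counts for a forward reading frame of ncodons codons.'''
        exp = {}
        for n1 in 'GATC':
            for n2 in 'GATC':
                for n3 in 'GATC':
                    aa = translate(n1 + n2 + n3)
                    exp[aa] = exp.get(aa,0.0) + ncodons * ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
        return exp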
    def setup(self):    ### Main class setup method.
        '''Main class setup method.'''
        try:### ~ [1] Setup SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.obj['SeqList'] = rje_seq.SeqList(self.log,['keepblast=T']+self.cmd_list+['autofilter=F','align=F','haqbat=None'])
            self.obj['SeqList']._checkForDup(True)
            if not self.seqNum(): self.errorLog('No sequences loaded!',printerror=False); return False
            if self.opt['AddQueries'] and self.name() not in self.obj['SeqList'].list['Blast2Fas']:
                self.obj['SeqList'].list['Blast2Fas'].append(self.name())
            ### ~ [2] Setup Results Directory ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.info['HaqDir'].lower() in ['','none']:
                self.info['HaqDir'] = '%s_HAQESAC/' % rje.baseFile(self.name(),strip_path=True)
            rje.mkDir(self,self.info['HaqDir'])
            return True     # Setup successful
        except: self.errorLog('Problem during %s setup.' % self); return False     # Setup failed
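    # --- Illustrative sketch (not part of the original module) of the rje command-list
    # convention relied on above: settings are read left to right, so defaults prepended
    # before self.cmd_list can be overridden by the user, while forced settings appended
    # after it cannot. _demoCmdLayering is a hypothetical name.
    def _demoCmdLayering(defaults, user_cmds, forced):
        '''Returns the final key=value settings under left-to-right precedence.'''
        final = {}
        for cmd in defaults + user_cmds + forced:   # later occurrences win
            if '=' in cmd:
                key, val = cmd.split('=',1)
                final[key.lower()] = val
        return final
    # e.g. _demoCmdLayering(['keepblast=T'],['keepblast=F'],['align=F']) gives
    # {'keepblast': 'F', 'align': 'F'}: the user overrides the default but not the forced setting.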
    def slimDisc(self):     ### Runs SLiMDisc on batch of files
        '''Runs SLiMDisc on batch of files.'''
        try:### Setup ###
            if self.stat['MinSup'] > self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1:
                self.stat['MinSup'] = self.stat['SlimSupport']
            if self.stat['MaxSup'] > 0 and self.stat['MaxSup'] < self.stat['SlimSupport'] and self.stat['SlimSupport'] > 1:
                self.stat['MaxSup'] = self.stat['SlimSupport']
            ### Make File List ###
            _stage = 'Make File List'
            if self.info['SeqIn'].lower() not in ['','none']:
                if os.path.exists(self.info['SeqIn']): gfiles = [self.info['SeqIn']]
                else:
                    self.log.errorLog('"seqin" file "%s" not found! No SLiMDisc analysis.' % self.info['SeqIn'],printerror=False)
                    return False
            else: gfiles = rje.getFileList(callobj=self,filelist=self.list['SlimFiles'],subfolders=False,summary=False)
            self.log.printLog('#FILES','%s files identified for SLiMDisc analysis.' % rje.integerString(len(gfiles)))
            ## Sort by size and filter by MinSup and MaxSup ##
            datasize = {}   # Dictionary for crude sorting of files by total AA content
            seqnum = {}     # Number of sequences in each file
            qry = {}        # Query sequence name (if any) for file
            tmpseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None','autofilter=F'])
            gx = 0
            while gx < len(gfiles):     # gfiles can grow (kepthub files appended), so no simple for loop
                seqfilename = gfiles[gx]; gx += 1
                seqfile = seqfilename[0:]
                tmpseq.seq = []
                tmpseq.loadSeqs(seqfile)
                ## *** Special RemHub process *** ##
                checkhub = True
                for hubtype in ['rem','kept','no']:
                    if seqfile.find('-%shub.fas' % hubtype) > 0: checkhub = False
                if self.stat['RemHub'] > 0.0 and checkhub:
                    if rje.matchExp('(\S+)_PPI',seqfile): hub_acc = rje.matchExp('(\S+)_PPI',rje.baseFile(seqfile,strip_path=True))[0]
                    else: hub_acc = rje.baseFile(seqfile,strip_path=True)
                    hub_base = rje.matchExp('(\S+)%s' % hub_acc,seqfilename)[0]
                    basefile = seqfile
                    while rje.baseFile(basefile) != basefile: basefile = rje.baseFile(basefile)
                    if tmpseq.querySeq(query=hub_acc):  ### Sets Hub as Query Sequence
                        self.log.printLog('#HUB','Removing hub protein %s and >=%.1f%% ID from PPI dataset %s.' % (hub_acc,self.stat['RemHub'],seqfile))
                        tmpseq.makeNR(text='Hub protein homologues',nrid=self.stat['RemHub'],blast=tmpseq.seqNum(),nrsim=0,nr_qry=tmpseq.obj['QuerySeq'])
                        tmpseq.removeSeq(text='PPI Hub Protein (self-interactor)',seq=tmpseq.obj['QuerySeq'])
                        tmpseq.obj['QuerySeq'] = None
                        seqfile = '%s-remhub.fas' % basefile
                        tmpseq.saveFasta(seqfile=seqfile)   ### Saves sequences in fasta format
                        keptfile = '%s-kepthub.fas' % basefile
                        os.rename(seqfilename,keptfile)
                        gfiles.append(keptfile)
                    else:
                        seqfile = '%s-nohub.fas' % basefile
                        os.rename(seqfilename,seqfile)
                        self.log.printLog('#HUB','Hub protein %s not in PPI dataset %s => %s.' % (hub_acc,seqfilename,seqfile))
                    #X#print tmpseq.obj['QuerySeq']
                ## Support Range ##
                if tmpseq.seqNum() < self.stat['MinSup'] or (self.stat['MaxSup'] > 0 and tmpseq.seqNum() > self.stat['MaxSup']):
                    self.log.printLog('#REJ','%s rejected: %s sequences = outside acceptable range of %d-%d.' % (seqfile,rje.integerString(tmpseq.seqNum()),self.stat['MinSup'],self.stat['MaxSup']))
                    continue
                aasize = tmpseq.aaCount()
                self.log.printLog('#AA','%s = %s aa.' % (seqfile,rje.integerString(aasize)))
                while aasize in datasize: aasize += 1
                datasize[aasize] = seqfile
                seqnum[seqfile] = tmpseq.seqNum()
                ## Query ##
                qry[seqfile] = None
                if self.opt['SlimQuery']:
                    if rje.matchExp('qry_(\S+)\.',seqfilename):
                        if tmpseq.querySeq(query=rje.matchExp('qry_(\S+)\.',seqfilename)[0]):   ### Sets Query Sequence if appropriate
                            qry[seqfile] = tmpseq.obj['QuerySeq'].shortName()
            self.log.printLog('#INF','%s Datasets to process.' % rje.integerString(len(seqnum)))
            ### Batch Output Mode ###
            batchout = None
            if self.info['BatchOut'].lower() not in ['','none']:
                batchout = self.info['BatchOut']
                if not self.opt['Append'] and os.path.exists(batchout): rje.backup(self,batchout)
            ### Work through Files ###
            _stage = 'Work through files'
            for key in rje.sortKeys(datasize,revsort=self.opt['BigFirst']):
                seqfile = datasize[key]
                basefile = seqfile
                while rje.baseFile(basefile) != basefile: basefile = rje.baseFile(basefile)
                base = rje.baseFile(basefile,True)
                self.log.printLog('#DAT',seqfile,timeout=False)
                if not self.opt['UseRes']: slim_cmd = '-BT -TT'
                else:
                    ## Detect old files ##
                    _stage = 'Detect old files'
                    old_rank = '%s/%s.rank' % (basefile,base)
                    self.log.printLog('#RES','Existing SLiMDisc Output?: %s' % (os.path.exists(old_rank)))
                    old_b_list = glob.glob('%s/results/*.blastp' % basefile)
                    old_t_file = '%s/%s.fasta.out' % (basefile,base)
                    self.log.printLog('#RES','Existing TEIRESIAS Output?: %s' % (os.path.exists(old_t_file)))
                    self.log.printLog('#RES','%s of %s BLAST files detected.' % (rje.integerString(len(old_b_list)),rje.integerString(seqnum[seqfile])))
                    ## TEIRESIAS ##
                    if (os.path.exists(old_rank) or len(old_b_list) > 0) and os.path.exists(old_t_file):    # BLAST started: TEIRESIAS finished!
                        slim_cmd = '-TF'
                    else: slim_cmd = '-TT'
                    ## BLAST ##
                    if len(old_b_list) != seqnum[seqfile]: slim_cmd += ' -BT'   # Need BLAST
                    else: slim_cmd += ' -BF'
                ## Query ##
                if self.opt['SlimQuery'] and qry[seqfile]: slim_cmd += ' -q %s' % qry[seqfile]
                ## Ranks ##
                slim_cmd += ' -n %d' % self.stat['SlimRanks']
                ## Support ##
                if self.stat['SlimSupport'] > 0 and self.stat['SlimSupport'] < 1: slim_cmd += ' -S %.1f' % self.stat['SlimSupport']
                elif self.stat['SlimSupport'] > 0: slim_cmd += ' -S %d' % self.stat['SlimSupport']
                ## WallTime ##
                slim_cmd += ' -W %d' % self.stat['SlimWall']
                ## MemSaver ##
                if self.opt['MemSaver']: slim_cmd += ' -X T'
                else: slim_cmd += ' -X F'
                ## SlimOpt ##
                if self.info['SlimOpt']: slim_cmd += ' %s' % self.info['SlimOpt']
                ## Perform SLiMDisc Run ##
                _stage = 'Perform SLiMDisc Run (%s)' % (seqfile)
                if batchout:
                    BATCH = open(batchout,'a')
                    BATCH.write('%s -i %s -Q0 %s\n' % (self.info['SlimCall'],seqfile,slim_cmd))
                    BATCH.close()
                else:
                    if self.stat['Verbose'] > 0: syscmd = 'python /home/richard/Python_Modules/slimdisc_V%s.py -i %s -Q2 %s' % (self.info['SlimVersion'],seqfile,slim_cmd)
                    else: syscmd = 'python /home/richard/Python_Modules/slimdisc_V%s.py -i %s -Q0 %s' % (self.info['SlimVersion'],seqfile,slim_cmd)
                    self.log.printLog('#SYS',syscmd)
                    os.system(syscmd)
                if not batchout:
                    new_rank = '%s/%s.rank' % (basefile,base)
                    self.log.printLog('#RES','New rank result %s produced?: %s' % (new_rank,os.path.exists(new_rank)))
        except: self.log.errorLog('rje_pattern_discovery banjaxed in slimDisc() %s' % _stage,quitchoice=True)
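    # --- Illustrative sketch (not part of the original module) of the size-keyed scheduling
    # used by slimDisc() above: datasets are keyed by total residue count, bumping the key on
    # a clash so equal-sized files are not overwritten, then run in sorted (or reversed, for
    # bigfirst=T) key order. _demoSizeOrder is a hypothetical name.
    def _demoSizeOrder(sizes, big_first=False):
        '''sizes: list of (filename, aa_count) pairs; returns filenames in run order.'''
        keyed = {}
        for name, aasize in sizes:
            while aasize in keyed: aasize += 1      # avoid overwriting equal-sized datasets
            keyed[aasize] = name
        order = sorted(keyed.keys())
        if big_first: order.reverse()
        return [keyed[k] for k in order]
    # e.g. _demoSizeOrder([('b.fas',500),('a.fas',500),('c.fas',90)]) gives
    # ['c.fas', 'b.fas', 'a.fas'] - the key clash bumps a.fas to 501.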
    def setup(self):    ### Main class setup method.
        '''Main class setup method.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not self.getStrLC('Basefile'): self.basefile(rje.baseFile(self.getStr('SeqIn')))
            return True     # Setup successful
        except: self.errorLog('Problem during %s setup.' % self.prog()); return False     # Setup failed
    def run(self):  ### Main Run method
        '''Main Run method.'''
        try:### SLiMDisc Run ###
            if self.opt['SLiMDisc']: return self.slimDisc()
            ### TEIRESIAS ###
            if self.opt['Teiresias']:
                ## Setup ##
                seqlist = rje_seq.SeqList(self.log,self.cmd_list)
                infile = '%s.teiresias.fas' % rje.baseFile(seqlist.info['Name'],True)
                outfile = '%s.teiresias.out' % rje.baseFile(seqlist.info['Name'],True)
                run_teiresias = True
                if rje.isYounger(outfile,infile) == outfile:
                    if self.stat['Interactive'] < 1 or not rje.yesNo('%s and %s exist already. Regenerate?' % (infile,outfile),'N'):
                        run_teiresias = False
                ## Run TEIRESIAS ##
                if run_teiresias:
                    seqlist.saveFasta(seqfile=infile,name='Teiresias')  ### Saves sequences in fasta format
                    command = rje.makePath(self.info['TeiresiasPath'],True)
                    command += ' -i%s -o%s %s' % (infile,outfile,self.info['TeiresiasOpt'])
                    self.log.printLog('#CMD',command)
                    os.system(command)
                ## Read Results ##
                self.verbose(0,2,'Reading TEIRESIAS output from %s...' % outfile,1)
                self.list['Pattern'] = []
                RESULTS = open(outfile,'r')
                line = RESULTS.readline()
                while line:
                    if rje.matchExp('^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$',line):     # New pattern
                        self.addTeiresiasPattern(rje.matchExp('^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$',line))
                    elif len(line) > 3 and line[0] != '#':
                        self.log.errorLog('Did not recognise line: %s' % line,False,False)
                    line = RESULTS.readline()
                RESULTS.close()
                patx = len(self.list['Pattern'])
                self.log.printLog('#PAT','%s TEIRESIAS patterns read from %s.' % (rje.integerString(patx),outfile))
                ## Calculate Information Content ##
                aafreq = seqlist.aaFreq()
                self.verbose(0,3,'Calculating Information Content & Length stats...',0)
                occx = 0
                for pattern in self.list['Pattern']:
                    pattern.stat['Info'] = self.calculateScore(pattern.info['Pattern'],aafreq)
                    pattern._makeLength()
                    occx += 1
                    rje.progressPrint(self,occx,patx/100,patx/10)
                self.verbose(0,1,'...Done!',2)
                ## Prepare Results ##
                delimit = rje.getDelimit(self.cmd_list)
                if self.info['Name'] == 'None':
                    self.info['Name'] = '%s.teiresias.%s' % (rje.baseFile(seqlist.info['Name'],True),rje.delimitExt(delimit))
                if self.opt['MySQL']:   # Two tables
                    patfile = os.path.splitext(self.info['Name'])
                    occfile = '%s.occ%s' % (patfile[0],patfile[1])
                    patfile = '%s.patterns%s' % (patfile[0],patfile[1])
                    if self.opt['Append']:
                        PATFILE = open(patfile,'a')
                        OCCFILE = open(occfile,'a')
                    else:
                        PATFILE = open(patfile,'w')
                        rje.writeDelimit(PATFILE,['pattern','tot_occ','seq_occ','info','len','fix','wild'],delimit)
                        OCCFILE = open(occfile,'w')
                        rje.writeDelimit(OCCFILE,['seq_id','pos','pattern','pat_match'],delimit)
                else:
                    if self.opt['Append']: RESFILE = open(self.info['Name'],'a')
                    else:
                        RESFILE = open(self.info['Name'],'w')
                        rje.writeDelimit(RESFILE,['Sequence Name','Position','Pattern','Match','Total Occurrences','Num Sequences','Information Content','Length','Fixed','Wildcard'],delimit)
                ## Save Results ##
                occx = 0
                for pattern in self.list['Pattern']:
                    patstats = []
                    for stat in ['OccCount','SeqCount','Info','Length','Fixed','Wildcards']:
                        patstats.append('%d' % pattern.stat[stat])
                    patstats[2] = '%.3f' % pattern.stat['Info']
                    if self.opt['MySQL']: rje.writeDelimit(PATFILE,[pattern.info['Pattern']] + patstats,delimit)    # Two tables
                    for occ in rje.sortKeys(pattern.occ):
                        seq = seqlist.seq[occ]
                        for pos in pattern.occ[occ]:
                            match = seq.info['Sequence'][pos:(pos+pattern.stat['Length'])]
                            outlist = [seq.shortName(),'%d' % pos,pattern.info['Pattern'],match]
                            if self.opt['MySQL']: rje.writeDelimit(OCCFILE,outlist,delimit)     # Two tables
                            else: rje.writeDelimit(RESFILE,outlist+patstats,delimit)
                            occx += 1
                if self.opt['MySQL']:   # Two tables
                    PATFILE.close()
                    OCCFILE.close()
                    self.log.printLog('#OUT','%s patterns output to %s.' % (rje.integerString(patx),patfile))
                    self.log.printLog('#OUT','%s pattern occurrences output to %s.' % (rje.integerString(occx),occfile))
                else:
                    RESFILE.close()
                    self.log.printLog('#OUT','%s occurrences of %s patterns output to %s.' % (rje.integerString(occx),rje.integerString(patx),self.info['Name']))
            ### InfoContent ###
            elif self.info['Info'] != 'None':
                ## Setup ##
                alphabet = rje_seq.alph_protx
                if not os.path.exists(self.info['Info']):
                    self.log.errorLog('Input file %s missing!' % self.info['Info'],False,False)
                    return False
                else:
                    mypresto = presto.Presto(self.log,self.cmd_list)
                    mypresto.loadMotifs(file=self.info['Info'],clear=True)
                seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
                if seqlist.seqNum() > 0: aafreq = seqlist.aaFreq(alphabet=None,fromfile=None,loadfile=None,total=False)     ### Returns dictionary of AA (& gap etc.) frequencies
                else:
                    aafreq = {}
                    for aa in alphabet: aafreq[aa] = 1.0 / len(alphabet)
                alphabet = aafreq.keys()
                maxinfo = 0
                for aa in alphabet: maxinfo += (aafreq[aa] * math.log(aafreq[aa],2))
                ## Output ##
                delimit = rje.getDelimit(self.cmd_list)
                ext = rje.delimitExt(delimit)
                outfile = '%s.info.%s' % (rje.baseFile(self.info['Info'],True,['.txt','.%s' % ext]),ext)
                if self.opt['Append']: OUTFILE = open(outfile,'a')
                else:
                    OUTFILE = open(outfile,'w')
                    rje.writeDelimit(OUTFILE,['motif','pattern','info'],delimit)
                ## Calculate Information Scores ##
                for motif in mypresto.motif:
                    self.verbose(2,4,motif.info['Sequence'],0)
                    pattern = string.replace(motif.info['Sequence'],'X','.')
                    elements = string.split(pattern,'-')
                    pattern = ''
                    for el in elements:
                        if el.find('.{') == 0: pattern += '.'   # Ambiguous spacer length - compress
                        else: pattern += el
                    self.verbose(2,2,'=> %s' % pattern,1)
                    motif.stat['Info'] = self.calculateInformationContent(pattern,aafreq,maxinfo,self.stat['InfoGapPen'])
                    self.verbose(0,3,'%s (%s) = %.2f' % (motif.info['Name'],pattern,motif.stat['Info']),1)
                    ## Output ##
                    rje.writeDelimit(OUTFILE,[motif.info['Name'],pattern,'%.2f' % motif.stat['Info']],delimit)
                ## Finish ##
                OUTFILE.close()
        except:
            self.log.errorLog('Error in run().',printerror=True,quitchoice=False)
            raise   # Delete this if method error not terrible
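    # --- Illustrative sketch (not part of the original module) of the TEIRESIAS result-line
    # match used in the Read Results loop above (same regular expression as the rje.matchExp
    # call; the trailing group is assumed to be whitespace-separated occurrence offsets).
    # _demoTeiresiasLine is a hypothetical name.
    import re
    def _demoTeiresiasLine(line):
        '''Returns (total occurrences, sequence count, pattern, offset fields) or None.'''
        m = re.match(r'^(\d+)\s+(\d+)\s+(\S+)\s+(\d.+\d)$', line)
        if not m: return None
        occ, seqocc, pattern, offsets = m.groups()
        return int(occ), int(seqocc), pattern, offsets.split()
    # e.g. _demoTeiresiasLine('4 2 L.D.E 0 12 1 33') gives (4, 2, 'L.D.E', ['0','12','1','33'])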
    def run(self):  ### Main run method
        '''Main run method.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            infile = self.getStr('InFile')
            while not rje.exists(infile):
                infile = rje.choice('File "%s" not found. Input file name? (Blank to quit):' % infile)
                if not infile: return self.printLog('#QUIT','Execution terminated!')
            db = rje_db.Database(self.log,self.cmd_list)
            db.basefile(rje.baseFile(infile))
            sdb = db.addTable(infile,mainkeys='#',delimit='\t',name='SPF.Mod')
            levels = {'Level_1':'k','Level_2':'p','Level_3':'c','Level_4':'o','Level_5':'f','Level_6':'g','Level_7':'s'}
            # Example rows (Level_1..Level_7, then observation ID):
            # k__Bacteria p__Proteobacteria c__Alphaproteobacteria o__Rhodospirillales f__Rhodospirillaceae g__ s__ denovo44
            # Unassigned unclassified unclassified unclassified unclassified unclassified unclassified denovo49
            ### ~ [2] Modify Text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            dupnames = []
            parents = {}    # Parent for each term
            renamed = []
            ex = 0.0; etot = sdb.entryNum()
            for entry in sdb.entries():
                self.progLog('\r#SPF','Modifying SPF content: %.1f%%' % (ex/etot)); ex += 100.0
                taxon = ''; parent = ''
                #self.debug(entry)
                for lvl in ['Level_1','Level_2','Level_3','Level_4','Level_5','Level_6','Level_7']:
                    entry[lvl] = string.replace(entry[lvl],'unidentified','unclassified')
                    #entry[lvl] = string.replace(entry[lvl],'Incertae_sedis','Incertae_sedis-%s' % levels[lvl])
                    null = '%s__' % levels[lvl]
                    #self.bugPrint(null); self.bugPrint(entry[lvl])
                    if entry[lvl] in [null,'Unassigned','unclassified','%sunclassified' % null,'%sunidentified' % null,'%sunculturedfungus' % null,'%sIncertae_sedis' % null,'%sunclassified_sp.' % null]:
                        if not taxon or taxon.endswith('unclassified'): entry[lvl] = '%sunclassified' % null
                        #elif taxon.endswith('unassigned)'): entry[lvl] = '%s%s' % (null,taxon[3:])
                        #elif taxon.endswith('unassigned)'): entry[lvl] = '%s(%s;%s-unassigned)' % (null,string.split(taxon,'(')[1][:-1],levels[lvl])
                        elif taxon.endswith('unassigned)'): entry[lvl] = '%s%s;%s-unassigned)' % (null,taxon[3:][:-1],levels[lvl])
                        else: entry[lvl] = '%s%s(%s-unassigned)' % (null,taxon[3:],levels[lvl])
                    if entry[lvl] in parents:
                        #self.debug(parents[entry[lvl]])
                        if parent in parents[entry[lvl]]: entry[lvl] = parents[entry[lvl]][parent]
                        else:
                            self.bugPrint(entry[lvl]); self.bugPrint(parents[entry[lvl]])
                            renamed.append(entry[lvl])
                            newtax = '%s%d' % (entry[lvl],renamed.count(entry[lvl]))
                            self.warnLog('%s had multiple parents (%s & %s) -> %s' % (entry[lvl],string.join(parents[entry[lvl]],'|'),parent,newtax))
                            parents[newtax] = {parent:newtax}
                            parents[entry[lvl]][parent] = newtax
                            entry[lvl] = newtax
                            self.deBug(parents[entry[lvl]])
                    elif parent: parents[entry[lvl]] = {parent:entry[lvl]}
                    parent = entry[lvl]
                    if entry[lvl][3:] == taxon[3:] and (entry[lvl],taxon) not in dupnames: dupnames.append((entry[lvl],taxon))
                    #self.bugPrint(entry[lvl])
                    taxon = entry[lvl]
                #self.debug(entry); self.debug(parents)
            self.printLog('\r#SPF','Modifying SPF content complete.')
            dupnames.sort()
            for (dupA,dupB) in dupnames: self.warnLog('Duplicate taxa names: %s & %s' % (dupA,dupB))
            ### ~ [3] Save to file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            sdb.saveToFile(savefields=sdb.list['Fields'][1:])
            ### ~ [4] Compress to different taxonomic levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            compress = ['Level_1','Level_2','Level_3','Level_4','Level_5','Level_6','Level_7','#']
            dump = compress.pop(-1)
            rules = {'Observation Ids':'list',dump:'str'}
            sdb.dropField('Observation Ids')
            while compress:
                sdb.compress(compress,rules=rules,default='sum',best=[],joinchar='|')
                #if dump == '#': sdb.dropField(dump)
                sdb.saveToFile('%s.SPF.%s.%s.spf' % (rje.baseFile(infile),compress[-1],levels[compress[-1]]))
                dump = compress.pop(-1)
                rules[dump] = 'list'
            return
        except:
            self.errorLog(self.zen())
            raise   # Delete this if method error not terrible
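    # --- Illustrative sketch (not part of the original module) of the level-by-level roll-up
    # performed by the while loop above: repeatedly merge rows sharing the remaining taxonomy
    # fields and sum their per-sample counts (what rje_db compress() does with default='sum').
    # _demoRollup and its arguments are hypothetical stand-ins.
    def _demoRollup(rows, taxfields):
        '''rows: list of dicts of taxonomy names plus numeric counts; taxfields: ordered levels.
        Yields (finest retained level, merged rows) from the full hierarchy up to the top.'''
        levels = taxfields[:]
        while levels:
            merged = {}
            for row in rows:
                key = tuple(row[lvl] for lvl in levels)
                tgt = merged.setdefault(key,dict((lvl,row[lvl]) for lvl in levels))
                for field in row:
                    if field not in taxfields: tgt[field] = tgt.get(field,0) + row[field]
            rows = list(merged.values())
            yield levels[-1], rows
            levels = levels[:-1]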
def runMain():
    try:### <0> ### Basic Setup of Program
        [info,out,mainlog,cmd_list] = setupProgram()
        ### <1> ### Load Data
        ## <a> ## Read in Sequences
        try:
            out.verbose(1,3,'Loading sequences...',0)
            seqfile = 'infile.fas'; nsfin = None
            for cmd in cmd_list:
                if cmd.find('seqin=') == 0: seqfile = cmd[len('seqin='):]
                if cmd.find('nsfin=') == 0: nsfin = cmd[len('nsfin='):]
            basefile = seqfile
            extension = seqfile[-4:]
            if extension in ['.fas','.phy','.aln']: basefile = seqfile[:-4]
            seqs = rje_seq.SeqList(log=mainlog,cmd_list=['i=0']+cmd_list+['autofilter=F','autoload=F','seqin=None'])
            out.verbose(1,3,'from %s' % seqfile,1)
            if not seqs.loadSeqs(seqfile=seqfile,seqtype='protein',aln=True): raise
            seqfile = seqs.info['Name']
            basefile = rje.baseFile(seqfile)
            mainlog.printLog('#SEQ','%s protein sequences read from %s\n' % (str(seqs.seqNum()),seqfile),1)
            mainlog.printLog('#SEQ','Alignment = %s. (%d aa)\n' % (seqs.opt['Aligned'],seqs.seq[0].seqLen()),1)
        except:
            mainlog.errorLog('Fatal run Exception during Sequence Input\n')
            raise
        ## <b> ## Read in Tree
        try:
            if not nsfin: nsfin = basefile + '.nsf'
            while not os.path.exists(nsfin):
                if out.stat['Interactive'] >= 0:
                    nsfin = rje.choice(text='Input tree file "%s" not found. Input filename? (Blank to exit.)' % nsfin)
                    if nsfin == '': raise KeyboardInterrupt
                else:
                    mainlog.errorLog('File %s not found. Cannot load tree!' % nsfin,printerror=False,quitchoice=True)
                    raise
            cmd_list.append('nsfin=' + nsfin)
            out.verbose(1,3,'Loading tree from %s...' % nsfin,1)
            mytree = rje_tree.Tree(log=mainlog,cmd_list=['root=yes']+cmd_list)
            mytree.mapSeq(seqlist=seqs)
            mytree.textTree()
            if mytree.opt['ReRooted']: mytree.saveTree(filename='%s.nsf' % basefile)
        except KeyboardInterrupt:
            mainlog.errorLog('User terminated.\n')
            raise
        except:
            mainlog.errorLog('Fatal run Exception during Tree Input\n')
            raise
        ### <2> ### GASP
        try:
            ## <a> ## InDel Tree Setup
            indeltree = None
            for cmd in cmd_list:
                if cmd.find('indeltree=') == 0: indeltree = cmd[len('indeltree='):]
            ## <b> ## GASP
            if indeltree == None or mytree.node[-1].obj['Sequence'] == None:    # Perform GASP
                out.verbose(0,2,'',3)
                mainlog.printLog('#SEQ','GASP: Gapped Ancestral Sequence Prediction',1)
                if basefile == 'infile': basefile = 'gasp'
                mygasp = rje_ancseq.Gasp(tree=mytree,ancfile='%s' % basefile,cmd_list=cmd_list,log=mainlog)
                out.verbose(0,2,'%s' % mygasp.details(),1)
                if out.stat['Interactive'] > 0 and not rje.yesNo('Use these parameters?'): mygasp.edit()
                mygasp.gasp()
                out.verbose(0,1,'\n\nGASP run completed OK!',2)
            ## <c> ## InDel Tree
            if indeltree: mytree.indelTree(filename=indeltree)
        except KeyboardInterrupt:
            mainlog.errorLog('User terminated.\n')
            raise
        except:
            mainlog.errorLog('Fatal run Exception during GASP\n')
            raise
        ### <X> ### End
    except KeyboardInterrupt: mainlog.errorLog('User terminated.\n')
    except: print "Unexpected error:", sys.exc_info()[0]
    mainlog.printLog('#LOG','%s V:%s End: %s\n' % (info.program,info.version,time.asctime(time.localtime(time.time()))),1)
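# --- Illustrative sketch (not part of the original module) of the bare "key=value" scan used
# in runMain() above for seqin=/nsfin=/indeltree=; a stand-in for the rje commandline
# conventions, with _demoCmdValue a hypothetical name.
def _demoCmdValue(cmd_list, key, default=None):
    '''Returns the value of the last key=value command in cmd_list, else default.'''
    value = default
    for cmd in cmd_list:
        if cmd.find(key + '=') == 0: value = cmd[len(key) + 1:]     # startswith; last wins
    return value
# e.g. _demoCmdValue(['seqin=test.fas','nsfin=test.nsf'],'seqin') gives 'test.fas'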
    def splitMascot(self):  ### Reads the MASCOT file and splits into header, hits and unmatched files.
        '''Reads the MASCOT file and splits into header, hits and unmatched files.'''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            db = self.db()
            infile = self.getStr('MASCOT')
            if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('MASCOT')))
            #x#self.deBug(self.basefile())
            headfile = '%s.header.txt' % self.basefile()
            hitsfile = '%s.mascot.csv' % self.basefile()
            peptfile = '%s.nohits.csv' % self.basefile()
            if rje.isYounger(self.getStr('MASCOT'),hitsfile) == hitsfile and not self.force():
                return self.printLog('#FILE','%s file found (force=F)' % hitsfile)
            ### ~ [1] Split MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headlines = []
            csvhead = []
            mdb = None
            mx = 0
            itraq = []
            prot_data = {}
            for mline in open(self.getStr('MASCOT'),'r').readlines():
                mx += 1     # Index of next line in case needed for iTRAQ reading!
                ## ~ [1a] Skip down until Header found ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if not headlines and mline.find('Header') < 0: continue
                ## ~ [1b] Add Header lines to headlines until results headers found ~~~~~~~~~~~~~~~ ##
                if not csvhead and mline.find('prot_hit_num') < 0: headlines.append(mline); continue
                ## ~ [1c] Sort out MASCOT results headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if mline.find('prot_hit_num') >= 0:
                    ## ~ Read Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    open(headfile,'w').writelines(headlines)
                    csvhead = rje.readDelimit(string.join(string.split(rje.chomp(mline))),',')
                    while '' in csvhead: csvhead.remove('')
                    ## ~ Sort out iTRAQ headers (missing) ~~~~~~~~~ ##
                    if self.getBool('iTRAQ'):
                        iline = open(self.getStr('MASCOT'),'r').readlines()[mx]     # Should be start of iTRAQ data
                        for isplit in rje.readDelimit(iline,',')[len(csvhead):]:
                            if '/' in isplit: itraq.append(isplit)
                        self.printLog('#ITRAQ',string.join(itraq))
                        csvhead += itraq
                        idb = db.addEmptyTable('itraq',['prot_hit_num','prot_acc','prot_desc','itraq','ratio','n','geomean','summary'],keys=['prot_hit_num','itraq'])
                        idb.info['Delimit'] = ','
                    ## ~ Add emPAI header (also missing) ~~~~~~~~~~ ##
                    if self.getBool('emPAI'): csvhead.append('empai')
                    ## ~ Set up Database Table ~~~~~~~~~~~~~~~~~~~~ ##
                    self.printLog('#HEAD',string.join(csvhead,'; '))
                    mdb = db.addEmptyTable('mascot',csvhead,keys=['prot_hit_num','pep_query'])
                    mdb.info['Delimit'] = ','
                elif mline.find('Peptide matches') >= 0:
                    mdb.saveToFile()
                    if self.getBool('emPAI'): csvhead.remove('empai')
                    mdb = db.addEmptyTable('nohits',csvhead,keys=['pep_query'])
                    for field in mdb.fields():
                        if field[:4] == 'prot': mdb.dropField(field)
                    mdb.info['Delimit'] = ','
                    continue
                elif rje.chomp(mline):
                    #self.deBug('%s ... %s' % (mline[:20],mline.find('Peptide matches')))
                    data = rje.readDelimit(mline,',')
                    entry = {}; pretraq = True
                    #self.deBug(csvhead); self.deBug(itraq)
                    for d in range(len(csvhead)+len(itraq)):
                        if d >= len(data): break
                        if data[d] in itraq: dhead = data[d]; pretraq = False
                        elif data[d] == 'emPAI': entry['empai'] = data[d+1]; pretraq = False
                        elif pretraq and d < len(csvhead): dhead = csvhead[d]
                        elif pretraq: continue      # Unmatched peptides will not have emPAI or iTRAQ data
                        #self.deBug('%s > %s' % (data[d],dhead))
                        if d and data[d-1] == 'emPAI': continue
                        elif data[d] in itraq + ['emPAI']: continue
                        elif dhead not in entry: entry[dhead] = data[d]
                        #self.deBug('%s = %s' % (dhead,entry[dhead]))
                    if entry['prot_acc']: prot_data[entry['prot_hit_num']] = {'prot_acc':entry['prot_acc'],'prot_desc':entry['prot_desc']}
                    if self.getBool('iTRAQ') and 'Quantitation summary for protein' in data:
                        d = data.index('Quantitation summary for protein') + 1
                        if entry['prot_hit_num'] in prot_data:
                            pacc = prot_data[entry['prot_hit_num']]['prot_acc']
                            pdesc = prot_data[entry['prot_hit_num']]['prot_desc']
                        else:
                            pacc = entry['prot_acc']
                            pdesc = entry['prot_desc']
                        while d < len(data):
                            if data[d] in itraq:
                                idb.addEntry({'prot_hit_num':entry['prot_hit_num'],'prot_acc':pacc,'prot_desc':pdesc,
                                              'itraq':data[d],'ratio':data[d+1],'n':data[d+2],'geomean':data[d+3],'summary':data[d+4]})
                            d += 1
                    #self.deBug(entry)
                    if entry['prot_hit_num'] or entry['pep_query']: mdb.addEntry(entry)
            mdb.saveToFile()
            if self.getBool('iTRAQ'): idb.saveToFile()
            self.deBug('')
            return True
        except: self.errorLog('Error reading MASCOT file'); return False
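    # --- Illustrative sketch (not part of the original module) of the three-phase split
    # above, driven by the 'Header', 'prot_hit_num' and 'Peptide matches' marker strings
    # assumed from the MASCOT CSV export. _demoSplitMascot is a hypothetical name.
    def _demoSplitMascot(lines):
        '''Splits raw MASCOT export lines into (header, hits, nohits) line lists.'''
        header = []; hits = []; nohits = []
        section = None
        for line in lines:
            if section is None and 'Header' in line: section = header
            if section is header and 'prot_hit_num' in line: section = hits
            elif 'Peptide matches' in line: section = nohits; continue
            if section is not None: section.append(line)
        return header, hits, nohits
    # The 'prot_hit_num' line itself lands in hits, mirroring its use above as the CSV header row.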
    def parse(self):    ### Parse REST file into dictionaries
        '''Parse REST file into dictionaries.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.list['RestKeys'] = []
            rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
            if rje.exists(self.getStr('RestIn')): restin = open(self.getStr('RestIn'),'r').read()
            elif rje.matchExp('^(\d+)$',self.getStr('RestIn')):
                url = '%sretrieve&jobid=%s&password=%s' % (self.getStr('RestURL'),self.getStr('RestIn'),self.getStr('Password'))
                if self.getBool('PureAPI') and self.getStrLC('Rest'): url += '&rest=%s' % (self.getStr('Rest'))
                else: url += '&rest=full'
                restin = urllib2.urlopen(url).read()
                if self.getBool('PureAPI'): return restin
            else: raise IOError('%s not found!' % self.getStr('RestIn'))
            jobid = None
            ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for restdata in string.split(restin,'###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'):
                if not jobid:
                    self.dict['Output']['intro'] = restdata
                    jobid = rje.matchExp('JobID: (\d+)',restdata)[0]
                    self.dict['Output']['jobid'] = jobid
                    if not self.getStrLC('RestBase'): rbase = '%s%s' % (self.getStr('RestOutDir'),jobid)
                    self.dict['Outfile']['jobid'] = '%s.jobid' % (rbase)
                    continue
                restlines = string.split(restdata,'\n')
                rparse = string.split(restlines.pop(0))
                if rparse[0] != '#': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
                if rparse[1][-1] != ':': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
                rkey = rparse[1][:-1]
                try: rfile = '%s.%s' % (rbase,rje.baseFile(rparse[2],strip_path=True,keepext=True))
                except: rfile = ''
                if not rfile: rfile = '%s.%s' % (rbase,rkey)
                self.dict['Output'][rkey] = string.join(restlines,'\n')
                self.dict['Outfile'][rkey] = rfile
                self.list['RestKeys'].append(rkey)
            self.printLog('#PARSE','Parsed %s: %d REST outputs.' % (self.getStr('RestIn'),len(self.dict['Output'])))
            return True
        except: self.errorLog('%s.parse error' % self); return False
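    # --- Illustrative sketch (not part of the original module) of the block parsing above:
    # REST output blocks are separated by the fixed '###~...~###' delimiter line and each
    # block after the intro starts '# <key>: <outfile>'. _demoParseRestBlocks is a
    # hypothetical name; pass the full delimiter string used by parse().
    def _demoParseRestBlocks(text, delimit):
        '''Returns {output key: block text} for each delimited REST output block.'''
        outputs = {}
        for block in text.split(delimit)[1:]:       # block [0] is the intro/jobid text
            lines = block.split('\n')
            head = lines.pop(0).split()
            if len(head) >= 2 and head[0] == '#' and head[1].endswith(':'):
                outputs[head[1][:-1]] = '\n'.join(lines)
        return outputs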